diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml index 4f2bffd2e..19ca30144 100644 --- a/.github/workflows/github-ci.yaml +++ b/.github/workflows/github-ci.yaml @@ -40,7 +40,7 @@ jobs: pip install . - name: Test with flake8 run: | - flake8 . --ignore=E203,W503,W504,E,F403,F405 + flake8 . --ignore=E203,W503,W504,E,F403,F405 --exclude build if: matrix.python-version != '2.7' - name: Test with pytest run: | diff --git a/Makefile b/Makefile index a6bbb5ddc..1a9b5244f 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ clean: rm -rf Tests/__pycache__ PyPDF2/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf PyPDF2.egg-info PyPDF2_pdfLocation.txt test: - pytest Tests --cov --cov-report term-missing -vv --cov-report html + pytest Tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30 mutation-test: mutmut run diff --git a/PyPDF2/__init__.py b/PyPDF2/__init__.py index 5d85cbc4f..07f211c2b 100644 --- a/PyPDF2/__init__.py +++ b/PyPDF2/__init__.py @@ -1,7 +1,7 @@ -from .pdf import PdfFileReader, PdfFileWriter +from ._version import __version__ from .merger import PdfFileMerger from .pagerange import PageRange, parse_filename_page_ranges -from ._version import __version__ +from .pdf import PdfFileReader, PdfFileWriter __all__ = [ "__version__", diff --git a/PyPDF2/constants.py b/PyPDF2/constants.py new file mode 100644 index 000000000..77fc02843 --- /dev/null +++ b/PyPDF2/constants.py @@ -0,0 +1,186 @@ +""" +See Portable Document Format Reference Manual, 1993. ISBN 0-201-62628-4. + +See https://ia802202.us.archive.org/8/items/pdfy-0vt8s-egqFwDl7L2/PDF%20Reference%201.0.pdf + +PDF Reference, third edition, Version 1.4, 2001. ISBN 0-201-75839-3. + +PDF Reference, sixth edition, Version 1.7, 2006. 
+""" + + +class PagesAttributes: + """Page Attributes, Table 6.2, Page 52""" + + TYPE = "/Type" # name, required; must be /Pages + KIDS = "/Kids" # array, required; List of indirect references + COUNT = "/Count" # integer, required; the number of all nodes under this node + PARENT = "/Parent" # dictionary, required; indirect reference to pages object + + +class PageAttributes: + """Page attributes, Table 6.3, Page 53""" + + TYPE = "/Type" # name, required; must be /Page + MEDIABOX = "/MediaBox" # array, required; rectangle specifying page size + PARENT = "/Parent" # dictionary, required; a pages object + RESOURCES = "/Resources" # dictionary, required if there are any + CONTENTS = "/Contents" # stream or array, optional + CROPBOX = "/CropBox" # array, optional; rectangle + ROTATE = "/Rotate" # integer, optional; page rotation in degrees + THUMB = "/Thumb" # stream, optional; indirect reference to image of the page + ANNOTS = "/Annots" # array, optional; an array of annotations + + +class Ressources: + PROCSET = "/ProcSet" # Chapter 6.8.1 + FONT = "/Font" # Chapter 6.8.2 + # encoding + # font descriptors : 6.8.4 + COLOR_SPACE = "/ColorSpace" # Chapter 6.8.5 + XOBJECT = "/XObject" # Chapter 6.8.6 + + +class StreamAttributes: + """Table 4.2""" + + LENGTH = "/Length" # integer, required + FILTER = "/Filter" # name or array of names, optional + DECODE_PARMS = "/DecodeParms" # variable, optional -- 'decodeParams' is wrong + + +class FilterTypes: + """ + Table 4.3 of the 1.4 Manual + + Page 354 of the 1.7 Manual + """ + + ASCII_HEX_DECODE = "/ASCIIHexDecode" # abbreviation: AHx + ASCII_85_DECODE = "/ASCII85Decode" # abbreviation: A85 + LZW_DECODE = "/LZWDecode" # abbreviation: LZW + FLATE_DECODE = "/FlateDecode" # abbreviation: Fl, PDF 1.2 + RUN_LENGTH_DECODE = "/RunLengthDecode" # abbreviation: RL + CCITT_FAX_DECODE = "/CCITTFaxDecode" # abbreviation: CCF + DCT_DECODE = "/DCTDecode" # abbreviation: DCT + + +class FilterTypeAbbreviations: + """ + Table 4.44 of the 1.7 Manual 
(page 353ff) + """ + + AHx = "/AHx" + A85 = "/A85" + LZW = "/LZW" + FL = "/Fl" # FlateDecode + RL = "/RL" + CCF = "/CCF" + DCT = "/DCT" + + +class LzwFilterParameters: + """Table 4.4""" + + PREDICTOR = "/Predictor" # integer + COLUMNS = "/Columns" # integer + COLORS = "/Colors" # integer + BITS_PER_COMPONENT = "/BitsPerComponent" # integer + EARLY_CHANGE = "/EarlyChange" # integer + + +class CcittFaxDecodeParameters: + """Table 4.5""" + + K = "/K" # integer + END_OF_LINE = "/EndOfLine" # boolean + ENCODED_BYTE_ALIGN = "/EncodedByteAlign" # boolean + COLUMNS = "/Columns" # integer + ROWS = "/Rows" # integer + END_OF_BLOCK = "/EndOfBlock" # boolean + BLACK_IS_1 = "/BlackIs1" # boolean + DAMAGED_ROWS_BEFORE_ERROR = "/DamagedRowsBeforeError" # integer + + +class ImageAttributes: + """Table 6.20.""" + + TYPE = "/Type" # name, required; must be /XObject + SUBTYPE = "/Subtype" # name, required; must be /Image + NAME = "/Name" # name, required + WIDTH = "/Width" # integer, required + HEIGHT = "/Height" # integer, required + BITS_PER_COMPONENT = "/BitsPerComponent" # integer, required + COLOR_SPACE = "/ColorSpace" # name, required + DECODE = "/Decode" # array, optional + INTERPOLATE = "/Interpolate" # boolean, optional + IMAGE_MASK = "/ImageMask" # boolean, optional + + +class ColorSpaces: + DEVICE_RGB = "/DeviceRGB" + DEVICE_CMYK = "/DeviceCMYK" + DEVICE_GRAY = "/DeviceGray" + + +class TypArguments: + """Table 8.2 of the PDF 1.7 reference""" + + LEFT = "/Left" + RIGHT = "/Right" + BOTTOM = "/Bottom" + TOP = "/Top" + + +class TypFitArguments: + """Table 8.2 of the PDF 1.7 reference""" + + FIT = "/Fit" + FIT_V = "/FitV" + FIT_BV = "/FitBV" + FIT_B = "/FitB" + FIT_H = "/FitH" + FIT_BH = "/FitBH" + FIT_R = "/FitR" + + +class PageLayouts: + """Page 84, PDF 1.4 reference""" + + SINGLE_PAGE = "/SinglePage" + ONE_COLUMN = "/OneColumn" + TWO_COLUMN_LEFT = "/TwoColumnLeft" + TWO_COLUMN_RIGHT = "/TwoColumnRight" + + +class GraphicsStateParameters: + """Table 4.8 of the 1.7 
reference""" + + TYPE = "/Type" # name, optional + LW = "/LW" # number, optional + # TODO: Many more! + FONT = "/Font" # array, optional + S_MASK = "/SMask" # dictionary or name, optional + + +class CatalogDictionary: + """Table 3.25 in the 1.7 reference""" + + TYPE = "/Type" # name, required; must be /Catalog + # TODO: Many more! + + +PDF_KEYS = [ + PagesAttributes, + PageAttributes, + Ressources, + ImageAttributes, + StreamAttributes, + FilterTypes, + LzwFilterParameters, + TypArguments, + TypFitArguments, + PageLayouts, + GraphicsStateParameters, + CatalogDictionary, +] diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 6a409a62e..1006b9e61 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -31,13 +31,23 @@ __author_email__ = "biziqe@mathieu.fenniak.net" import math +from sys import version_info + +from PyPDF2.constants import CcittFaxDecodeParameters as CCITT +from PyPDF2.constants import ColorSpaces +from PyPDF2.constants import FilterTypeAbbreviations as FTA +from PyPDF2.constants import FilterTypes as FT +from PyPDF2.constants import ImageAttributes as IA +from PyPDF2.constants import LzwFilterParameters as LZW +from PyPDF2.constants import StreamAttributes as SA from .utils import PdfReadError, ord_, paethPredictor -from sys import version_info + if version_info < ( 3, 0 ): from cStringIO import StringIO else: from io import StringIO + import struct try: @@ -110,13 +120,13 @@ def decode(data, decodeParms): predictor = 1 if decodeParms: try: - predictor = decodeParms.get("/Predictor", 1) + predictor = decodeParms.get(LZW.PREDICTOR, 1) except AttributeError: pass # usually an array with a null object was read # predictor 1 == no predictor if predictor != 1: - columns = decodeParms["/Columns"] + columns = decodeParms[LZW.COLUMNS] # PNG prediction: if predictor >= 10 and predictor <= 15: output = StringIO() @@ -261,7 +271,7 @@ def decode(self): return baos @staticmethod - def decode(data,decodeParams=None): + def decode(data, decodeParms=None): 
return LZWDecode.decoder(data).decode() @@ -363,7 +373,7 @@ def decode(data, decodeParms=None, height=0): else: CCITTgroup = 3 - width = decodeParms["/Columns"] + width = decodeParms[CCITT.COLUMNS] imgSize = len(data) tiff_header_struct = '<2shlh' + 'hhll' * 8 + 'h' tiffHeader = struct.pack(tiff_header_struct, @@ -388,7 +398,7 @@ def decode(data, decodeParms=None, height=0): def decodeStreamData(stream): from .generic import NameObject - filters = stream.get("/Filter", ()) + filters = stream.get(SA.FILTER, ()) if len(filters) and not isinstance(filters[0], NameObject): # we have a single filter instance @@ -397,24 +407,24 @@ def decodeStreamData(stream): # If there is not data to decode we should not try to decode the data. if data: for filterType in filters: - if filterType == "/FlateDecode" or filterType == "/Fl": - data = FlateDecode.decode(data, stream.get("/DecodeParms")) - elif filterType == "/ASCIIHexDecode" or filterType == "/AHx": + if filterType == FT.FLATE_DECODE or filterType == FTA.FL: + data = FlateDecode.decode(data, stream.get(SA.DECODE_PARMS)) + elif filterType == FT.ASCII_HEX_DECODE or filterType == FTA.AHx: data = ASCIIHexDecode.decode(data) - elif filterType == "/LZWDecode" or filterType == "/LZW": - data = LZWDecode.decode(data, stream.get("/DecodeParms")) - elif filterType == "/ASCII85Decode" or filterType == "/A85": + elif filterType == FT.LZW_DECODE or filterType == FTA.LZW: + data = LZWDecode.decode(data, stream.get(SA.DECODE_PARMS)) + elif filterType == FT.ASCII_85_DECODE or filterType == FTA.A85: data = ASCII85Decode.decode(data) - elif filterType == "/DCTDecode": + elif filterType == FT.DCT_DECODE: data = DCTDecode.decode(data) elif filterType == "/JPXDecode": data = JPXDecode.decode(data) - elif filterType == "/CCITTFaxDecode": - height = stream.get("/Height", ()) - data = CCITTFaxDecode.decode(data, stream.get("/DecodeParms"), height) + elif filterType == FT.CCITT_FAX_DECODE: + height = stream.get(IA.HEIGHT, ()) + data = 
CCITTFaxDecode.decode(data, stream.get(SA.DECODE_PARMS), height) elif filterType == "/Crypt": - decodeParams = stream.get("/DecodeParams", {}) - if "/Name" not in decodeParams and "/Type" not in decodeParams: + decodeParms = stream.get(SA.DECODE_PARMS, {}) + if "/Name" not in decodeParms and "/Type" not in decodeParms: pass else: raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet") @@ -434,34 +444,37 @@ def _xobj_to_image(x_object_obj): :return: Tuple[file extension, bytes] """ import io + from PIL import Image - size = (x_object_obj["/Width"], x_object_obj["/Height"]) + from PyPDF2.constants import GraphicsStateParameters as G + + size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT]) data = x_object_obj.getData() - if x_object_obj["/ColorSpace"] == "/DeviceRGB": + if x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB: mode = "RGB" else: mode = "P" extension = None - if "/Filter" in x_object_obj: - if x_object_obj["/Filter"] == "/FlateDecode": + if SA.FILTER in x_object_obj: + if x_object_obj[SA.FILTER] == FT.FLATE_DECODE: extension = ".png" img = Image.frombytes(mode, size, data) - if "/SMask" in x_object_obj: # add alpha channel - alpha = Image.frombytes("L", size, x_object_obj["/SMask"].getData()) + if G.S_MASK in x_object_obj: # add alpha channel + alpha = Image.frombytes("L", size, x_object_obj[G.S_MASK].getData()) img.putalpha(alpha) img_byte_arr = io.BytesIO() img.save(img_byte_arr, format="PNG") data = img_byte_arr.getvalue() - elif x_object_obj["/Filter"] in (["/LZWDecode"], ['/ASCII85Decode'], ['/CCITTFaxDecode']): + elif x_object_obj[SA.FILTER] in ([FT.LZW_DECODE], [FT.ASCII_85_DECODE], [FT.CCITT_FAX_DECODE]): from PyPDF2.utils import b_ extension = ".png" data = b_(data) - elif x_object_obj["/Filter"] == "/DCTDecode": + elif x_object_obj[SA.FILTER] == FT.DCT_DECODE: extension = ".jpg" - elif x_object_obj["/Filter"] == "/JPXDecode": + elif x_object_obj[SA.FILTER] == "/JPXDecode": extension = ".jp2" - elif 
x_object_obj["/Filter"] == "/CCITTFaxDecode": + elif x_object_obj[SA.FILTER] == FT.CCITT_FAX_DECODE: extension = ".tiff" else: extension = ".png" diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index c0691694d..09a38ac0d 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -32,18 +32,19 @@ __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" +import codecs +import decimal import re -from .utils import readNonWhitespace, RC4_encrypt, skipOverComment -from .utils import b_, u_, chr_, ord_ -from .utils import PdfStreamError import warnings -from . import filters -from . import utils -import decimal -import codecs +from PyPDF2.constants import FilterTypes as FT +from PyPDF2.constants import StreamAttributes as SA from PyPDF2.utils import ERR_STREAM_TRUNCATED_PREMATURELY +from . import filters, utils +from .utils import (PdfStreamError, RC4_encrypt, b_, chr_, ord_, + readNonWhitespace, skipOverComment, u_) + ObjectPrefix = b_('/<[tf(n%') NumberSigns = b_('+-') IndirectPattern = re.compile(b_(r"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]")) @@ -604,8 +605,8 @@ def readFromStream(stream, pdf): if stream.read(1) != b_('\n'): stream.seek(-1, 1) # this is a stream object, not a dictionary - assert "/Length" in data - length = data["/Length"] + assert SA.LENGTH in data + length = data[SA.LENGTH] if debug: print(data) if isinstance(length, IndirectObject): t = stream.tell() @@ -780,9 +781,9 @@ def __init__(self): self.decodedSelf = None def writeToStream(self, stream, encryption_key): - self[NameObject("/Length")] = NumberObject(len(self._data)) + self[NameObject(SA.LENGTH)] = NumberObject(len(self._data)) DictionaryObject.writeToStream(self, stream, encryption_key) - del self["/Length"] + del self[SA.LENGTH] stream.write(b_("\nstream\n")) data = self._data if encryption_key: @@ -791,22 +792,22 @@ def writeToStream(self, stream, encryption_key): stream.write(b_("\nendstream")) def initializeFromDictionary(data): - if "/Filter" in data: + if SA.FILTER in 
data: retval = EncodedStreamObject() else: retval = DecodedStreamObject() retval._data = data["__streamdata__"] del data["__streamdata__"] - del data["/Length"] + del data[SA.LENGTH] retval.update(data) return retval initializeFromDictionary = staticmethod(initializeFromDictionary) # type: ignore def flateEncode(self): - if "/Filter" in self: - f = self["/Filter"] + if SA.FILTER in self: + f = self[SA.FILTER] if isinstance(f, ArrayObject): - f.insert(0, NameObject("/FlateDecode")) + f.insert(0, NameObject(FT.FLATE_DECODE)) else: newf = ArrayObject() newf.append(NameObject("/FlateDecode")) @@ -815,7 +816,7 @@ def flateEncode(self): else: f = NameObject("/FlateDecode") retval = EncodedStreamObject() - retval[NameObject("/Filter")] = f + retval[NameObject(SA.FILTER)] = f retval._data = filters.FlateDecode.encode(self._data) return retval @@ -842,7 +843,7 @@ def getData(self): decoded._data = filters.decodeStreamData(self) for key, value in list(self.items()): - if key not in ("/Length", "/Filter", "/DecodeParms"): + if key not in (SA.LENGTH, SA.FILTER, SA.DECODE_PARMS): decoded[key] = value self.decodedSelf = decoded return decoded._data @@ -1061,18 +1062,21 @@ def __init__(self, title, page, typ, *args): self[NameObject("/Page")] = page self[NameObject("/Type")] = typ + from PyPDF2.constants import TypArguments as TA + from PyPDF2.constants import TypFitArguments as TF + # from table 8.2 of the PDF 1.7 reference. 
if typ == "/XYZ": - (self[NameObject("/Left")], self[NameObject("/Top")], + (self[NameObject(TA.LEFT)], self[NameObject(TA.TOP)], self[NameObject("/Zoom")]) = args - elif typ == "/FitR": - (self[NameObject("/Left")], self[NameObject("/Bottom")], - self[NameObject("/Right")], self[NameObject("/Top")]) = args - elif typ in ["/FitH", "/FitBH"]: - self[NameObject("/Top")], = args - elif typ in ["/FitV", "/FitBV"]: - self[NameObject("/Left")], = args - elif typ in ["/Fit", "/FitB"]: + elif typ == TF.FIT_R: + (self[NameObject(TA.LEFT)], self[NameObject(TA.BOTTOM)], + self[NameObject(TA.RIGHT)], self[NameObject(TA.TOP)]) = args + elif typ in [TF.FIT_H, TF.FIT_BH]: + self[NameObject(TA.TOP)], = args + elif typ in [TF.FIT_V, TF.FIT_BV]: + self[NameObject(TA.LEFT)], = args + elif typ in [TF.FIT, TF.FIT_B]: pass else: raise utils.PdfReadError("Unknown Destination Type: %r" % typ) diff --git a/PyPDF2/merger.py b/PyPDF2/merger.py index d5fd22414..670629d24 100644 --- a/PyPDF2/merger.py +++ b/PyPDF2/merger.py @@ -25,11 +25,15 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
+from sys import version_info + +from PyPDF2.constants import PagesAttributes as PA + from .generic import * -from .utils import isString, str_ -from .pdf import PdfFileReader, PdfFileWriter from .pagerange import PageRange -from sys import version_info +from .pdf import PdfFileReader, PdfFileWriter +from .utils import isString, str_ + if version_info < ( 3, 0 ): from cStringIO import StringIO StreamIO = StringIO @@ -216,8 +220,8 @@ def write(self, fileobj): # The commented out line below was replaced with the two lines below it to allow PdfFileMerger to work with PyPdf 1.13 for page in self.pages: self.output.addPage(page.pagedata) - page.out_pagedata = self.output.getReference(self.output._pages.getObject()["/Kids"][-1].getObject()) - # idnum = self.output._objects.index(self.output._pages.getObject()["/Kids"][-1].getObject()) + 1 + page.out_pagedata = self.output.getReference(self.output._pages.getObject()[PA.KIDS][-1].getObject()) + # idnum = self.output._objects.index(self.output._pages.getObject()[PA.KIDS][-1].getObject()) + 1 # page.out_pagedata = IndirectObject(idnum, 0, self.output) # Once all pages are added, create bookmarks to point at those pages @@ -543,7 +547,7 @@ def remove(self, index): self.tree.removeChild(obj) def add(self, title, pagenum): - pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum] + pageRef = self.pdf.getObject(self.pdf._pages)[PA.KIDS][pagenum] action = DictionaryObject() action.update({ NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]), diff --git a/PyPDF2/pagerange.py b/PyPDF2/pagerange.py index 73fcad77f..aa532e704 100644 --- a/PyPDF2/pagerange.py +++ b/PyPDF2/pagerange.py @@ -8,6 +8,7 @@ """ import re + from .utils import isString _INT_RE = r"(0|-?[1-9]\d*)" # A decimal int, don't allow "-0". 
diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index fd8068aeb..8a6be0a75 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -29,21 +29,17 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -""" -A pure-Python PDF library with an increasing number of capabilities. -""" +"""A pure-Python PDF library with an increasing number of capabilities.""" __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" -__maintainer__ = "Phaseit, Inc." -__maintainer_email = "PyPDF2@phaseit.net" - import math import struct import sys import uuid from sys import version_info + if version_info < ( 3, 0 ): from cStringIO import StringIO else: @@ -54,12 +50,18 @@ else: from io import BytesIO -from . import utils -import warnings import codecs +import warnings + +from PyPDF2.constants import PageAttributes as PG +from PyPDF2.constants import PagesAttributes as PA +from PyPDF2.constants import Ressources as RES +from PyPDF2.constants import StreamAttributes as SA + +from . import utils from .generic import * -from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList -from .utils import isString, b_, u_, ord_, str_, formatWarning +from .utils import (ConvertFunctionsToVirtualList, b_, formatWarning, isString, + ord_, readNonWhitespace, readUntilWhitespace, str_, u_) if version_info < ( 2, 4 ): from sets import ImmutableSet as frozenset @@ -82,9 +84,9 @@ def __init__(self): # The root of our page tree node. 
pages = DictionaryObject() pages.update({ - NameObject("/Type"): NameObject("/Pages"), - NameObject("/Count"): NumberObject(0), - NameObject("/Kids"): ArrayObject(), + NameObject(PA.TYPE): NameObject("/Pages"), + NameObject(PA.COUNT): NumberObject(0), + NameObject(PA.KIDS): ArrayObject(), }) self._pages = self._addObject(pages) @@ -98,7 +100,7 @@ def __init__(self): # root object root = DictionaryObject() root.update({ - NameObject("/Type"): NameObject("/Catalog"), + NameObject(PA.TYPE): NameObject("/Catalog"), NameObject("/Pages"): self._pages, }) self._root = None @@ -114,12 +116,12 @@ def getObject(self, ido): return self._objects[ido.idnum - 1] def _addPage(self, page, action): - assert page["/Type"] == "/Page" + assert page[PA.TYPE] == "/Page" page[NameObject("/Parent")] = self._pages page = self._addObject(page) pages = self.getObject(self._pages) - action(pages["/Kids"], page) - pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1) + action(pages[PA.KIDS], page) + pages[NameObject(PA.COUNT)] = NumberObject(pages[PA.COUNT] + 1) def addPage(self, page): """ @@ -153,7 +155,7 @@ def getPage(self, pageNumber): """ pages = self.getObject(self._pages) # XXX: crude hack - return pages["/Kids"][pageNumber].getObject() + return pages[PA.KIDS][pageNumber].getObject() def getNumPages(self): """ @@ -216,7 +218,7 @@ def addJS(self, javascript): """ js = DictionaryObject() js.update({ - NameObject("/Type"): NameObject("/Action"), + NameObject(PA.TYPE): NameObject("/Action"), NameObject("/S"): NameObject("/JavaScript"), NameObject("/JS"): NameObject("(%s)" % javascript) }) @@ -271,7 +273,7 @@ def addAttachment(self, fname, fdata): file_entry = DecodedStreamObject() file_entry.setData(fdata) file_entry.update({ - NameObject("/Type"): NameObject("/EmbeddedFile") + NameObject(PA.TYPE): NameObject("/EmbeddedFile") }) # The Filespec entry @@ -288,7 +290,7 @@ def addAttachment(self, fname, fdata): filespec = DictionaryObject() filespec.update({ - NameObject("/Type"): 
NameObject("/Filespec"), + NameObject(PA.TYPE): NameObject("/Filespec"), NameObject("/F"): createStringObject(fname), # Perhaps also try TextStringObject NameObject("/EF"): efEntry }) @@ -355,8 +357,8 @@ def updatePageFormFieldValues(self, page, fields): values (/V) ''' # Iterate through pages, update field values - for j in range(0, len(page['/Annots'])): - writer_annot = page['/Annots'][j].getObject() + for j in range(0, len(page[PG.ANNOTS])): + writer_annot = page[PG.ANNOTS][j].getObject() for field in fields: if writer_annot.get('/T') == field: writer_annot.update({ @@ -401,7 +403,8 @@ def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): encryption. When false, 40bit encryption will be used. By default, this flag is on. """ - import time, random + import random + import time if owner_pwd is None: owner_pwd = user_pwd if use_128bit: @@ -424,10 +427,10 @@ def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): assert rev == 3 U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False) encrypt = DictionaryObject() - encrypt[NameObject("/Filter")] = NameObject("/Standard") + encrypt[NameObject(SA.FILTER)] = NameObject("/Standard") encrypt[NameObject("/V")] = NumberObject(V) if V == 2: - encrypt[NameObject("/Length")] = NumberObject(keylen * 8) + encrypt[NameObject(SA.LENGTH)] = NumberObject(keylen * 8) encrypt[NameObject("/R")] = NumberObject(rev) encrypt[NameObject("/O")] = ByteStringObject(O) encrypt[NameObject("/U")] = ByteStringObject(U) @@ -702,7 +705,7 @@ def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, itali :param str fit: The fit of the destination page. See :meth:`addLink()` for details. 
""" - pageRef = self.getObject(self._pages)['/Kids'][pagenum] + pageRef = self.getObject(self._pages)[PA.KIDS][pagenum] action = DictionaryObject() zoomArgs = [] for a in args: @@ -757,7 +760,7 @@ def addNamedDestinationObject(self, dest): return destRef def addNamedDestination(self, title, pagenum): - pageRef = self.getObject(self._pages)['/Kids'][pagenum] + pageRef = self.getObject(self._pages)[PA.KIDS][pagenum] dest = DictionaryObject() dest.update({ NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]), @@ -775,11 +778,11 @@ def removeLinks(self): """ Removes links and annotations from this output. """ - pages = self.getObject(self._pages)['/Kids'] + pages = self.getObject(self._pages)[PA.KIDS] for page in pages: pageRef = self.getObject(page) - if "/Annots" in pageRef: - del pageRef['/Annots'] + if PG.ANNOTS in pageRef: + del pageRef[PG.ANNOTS] def removeImages(self, ignoreByteStringObject=False): """ @@ -788,7 +791,7 @@ def removeImages(self, ignoreByteStringObject=False): :param bool ignoreByteStringObject: optional parameter to ignore ByteString Objects. """ - pages = self.getObject(self._pages)['/Kids'] + pages = self.getObject(self._pages)[PA.KIDS] jump_operators = [ b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'), b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'), @@ -841,7 +844,7 @@ def removeText(self, ignoreByteStringObject=False): :param bool ignoreByteStringObject: optional parameter to ignore ByteString Objects. 
""" - pages = self.getObject(self._pages)['/Kids'] + pages = self.getObject(self._pages)[PA.KIDS] for j in range(len(pages)): page = pages[j] pageRef = self.getObject(page) @@ -894,7 +897,7 @@ def addURI(self, pagenum, uri, rect, border=None): -John Mulligan """ - pageLink = self.getObject(self._pages)['/Kids'][pagenum] + pageLink = self.getObject(self._pages)[PA.KIDS][pagenum] pageRef = self.getObject(pageLink) if border is not None: @@ -919,7 +922,7 @@ def addURI(self, pagenum, uri, rect, border=None): }); lnk = DictionaryObject() lnk.update({ - NameObject('/Type'): NameObject('/Annot'), + NameObject('/Type'): NameObject('/Annot'), NameObject('/Subtype'): NameObject('/Link'), NameObject('/P'): pageLink, NameObject('/Rect'): rect, @@ -929,10 +932,10 @@ def addURI(self, pagenum, uri, rect, border=None): }) lnkRef = self._addObject(lnk) - if "/Annots" in pageRef: - pageRef['/Annots'].append(lnkRef) + if PG.ANNOTS in pageRef: + pageRef[PG.ANNOTS].append(lnkRef) else: - pageRef[NameObject('/Annots')] = ArrayObject([lnkRef]) + pageRef[NameObject(PG.ANNOTS)] = ArrayObject([lnkRef]) def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args): """ @@ -970,8 +973,8 @@ def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args): - [left] """ - pageLink = self.getObject(self._pages)['/Kids'][pagenum] - pageDest = self.getObject(self._pages)['/Kids'][pagedest] # TODO: switch for external link + pageLink = self.getObject(self._pages)[PA.KIDS][pagenum] + pageDest = self.getObject(self._pages)[PA.KIDS][pagedest] # TODO: switch for external link pageRef = self.getObject(pageLink) if border is not None: @@ -1000,7 +1003,7 @@ def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args): lnk = DictionaryObject() lnk.update({ - NameObject('/Type'): NameObject('/Annot'), + NameObject('/Type'): NameObject('/Annot'), NameObject('/Subtype'): NameObject('/Link'), NameObject('/P'): pageLink, NameObject('/Rect'): rect, @@ -1009,10 +1012,10 @@ 
def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args): }) lnkRef = self._addObject(lnk) - if "/Annots" in pageRef: - pageRef['/Annots'].append(lnkRef) + if PG.ANNOTS in pageRef: + pageRef[PG.ANNOTS].append(lnkRef) else: - pageRef[NameObject('/Annots')] = ArrayObject([lnkRef]) + pageRef[NameObject(PG.ANNOTS)] = ArrayObject([lnkRef]) _valid_layouts = ['/NoLayout', '/SinglePage', '/OneColumn', '/TwoColumnLeft', '/TwoColumnRight', '/TwoPageLeft', '/TwoPageRight'] @@ -1318,9 +1321,9 @@ def _buildField(self, field, retval, fileobj, fieldAttributes): retval[key] = Field(field) def _checkKids(self, tree, retval, fileobj): - if "/Kids" in tree: + if PA.KIDS in tree: # recurse down the tree - for kid in tree["/Kids"]: + for kid in tree[PA.KIDS]: self.getFields(kid.getObject(), retval, fileobj) def _writeField(self, fileobj, field, fieldAttributes): @@ -1380,9 +1383,9 @@ def getNamedDestinations(self, tree=None, retval=None): if tree is None: return retval - if "/Kids" in tree: + if PA.KIDS in tree: # recurse down the tree - for kid in tree["/Kids"]: + for kid in tree[PA.KIDS]: self.getNamedDestinations(kid.getObject(), retval) if "/Names" in tree: @@ -1569,25 +1572,27 @@ def getPageMode(self): def _flatten(self, pages=None, inherit=None, indirectRef=None): inheritablePageAttributes = ( - NameObject("/Resources"), NameObject("/MediaBox"), - NameObject("/CropBox"), NameObject("/Rotate") + NameObject(PG.RESOURCES), NameObject(PG.MEDIABOX), + NameObject(PG.CROPBOX), NameObject(PG.ROTATE) ) if inherit is None: inherit = dict() if pages is None: - self.flattenedPages = [] + # Fix issue 327: set flattenedPages attribute only for + # decrypted file catalog = self.trailer["/Root"].getObject() pages = catalog["/Pages"].getObject() + self.flattenedPages = [] t = "/Pages" - if "/Type" in pages: - t = pages["/Type"] + if PA.TYPE in pages: + t = pages[PA.TYPE] if t == "/Pages": for attr in inheritablePageAttributes: if attr in pages: inherit[attr] = pages[attr] - for 
page in pages["/Kids"]: + for page in pages[PA.KIDS]: addt = {} if isinstance(page, IndirectObject): addt["indirectRef"] = page @@ -1732,6 +1737,7 @@ def readObjectHeader(self, stream): idnum = readUntilWhitespace(stream) extra |= utils.skipOverWhitespace(stream); stream.seek(-1, 1) generation = readUntilWhitespace(stream) + extra |= utils.skipOverWhitespace(stream); stream.seek(-1, 1) # although it's not used, it might still be necessary to read _obj = stream.read(3) # noqa: F841 @@ -2090,7 +2096,7 @@ def _decrypt(self, password): if rev == 2: keylen = 5 else: - keylen = encrypt['/Length'].getObject() // 8 + keylen = encrypt[SA.LENGTH].getObject() // 8 key = _alg33_1(password, rev, keylen) real_O = encrypt["/O"].getObject() if rev == 2: @@ -2121,7 +2127,7 @@ def _authenticateUserPassword(self, password): U, key = _alg34(password, owner_entry, p_entry, id1_entry) elif rev >= 3: U, key = _alg35(password, rev, - encrypt["/Length"].getObject() // 8, owner_entry, + encrypt[SA.LENGTH].getObject() // 8, owner_entry, p_entry, id1_entry, encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject()) U, real_U = U[:16], real_U[:16] @@ -2212,7 +2218,7 @@ def createBlankPage(pdf=None, width=None, height=None): # Creates a new page (cf PDF Reference 7.7.3.3) page.__setitem__(NameObject('/Type'), NameObject('/Page')) page.__setitem__(NameObject('/Parent'), NullObject()) - page.__setitem__(NameObject('/Resources'), DictionaryObject()) + page.__setitem__(NameObject(PG.RESOURCES), DictionaryObject()) if width is None or height is None: if pdf is not None and pdf.getNumPages() > 0: lastpage = pdf.getPage(pdf.getNumPages() - 1) @@ -2220,7 +2226,7 @@ def createBlankPage(pdf=None, width=None, height=None): height = lastpage.mediaBox.getHeight() else: raise utils.PageSizeNotDefinedError() - page.__setitem__(NameObject('/MediaBox'), + page.__setitem__(NameObject(PG.MEDIABOX), RectangleObject([0, 0, width, height])) return page @@ -2341,27 +2347,27 @@ def _mergePage(self, page2, 
page2transformation=None, ctm=None, expand=False): newResources = DictionaryObject() rename = {} - originalResources = self["/Resources"].getObject() - page2Resources = page2["/Resources"].getObject() + originalResources = self[PG.RESOURCES].getObject() + page2Resources = page2[PG.RESOURCES].getObject() newAnnots = ArrayObject() for page in (self, page2): - if "/Annots" in page: - annots = page["/Annots"] + if PG.ANNOTS in page: + annots = page[PG.ANNOTS] if isinstance(annots, ArrayObject): for ref in annots: newAnnots.append(ref) - for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading", "/Properties": + for res in "/ExtGState", RES.FONT, RES.XOBJECT, RES.COLOR_SPACE, "/Pattern", "/Shading", "/Properties": new, newrename = PageObject._mergeResources(originalResources, page2Resources, res) if new: newResources[NameObject(res)] = new rename.update(newrename) # Combine /ProcSet sets. - newResources[NameObject("/ProcSet")] = ArrayObject( - frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union( - frozenset(page2Resources.get("/ProcSet", ArrayObject()).getObject()) + newResources[NameObject(RES.PROCSET)] = ArrayObject( + frozenset(originalResources.get(RES.PROCSET, ArrayObject()).getObject()).union( + frozenset(page2Resources.get(RES.PROCSET, ArrayObject()).getObject()) ) ) @@ -2405,8 +2411,8 @@ def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False): self.mediaBox.setUpperRight(upperright) self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf) - self[NameObject('/Resources')] = newResources - self[NameObject('/Annots')] = newAnnots + self[NameObject(PG.RESOURCES)] = newResources + self[NameObject(PG.ANNOTS)] = newAnnots def mergeTransformedPage(self, page2, ctm, expand=False): """ @@ -2714,14 +2720,14 @@ def extractText(self, Tj_sep="", TJ_sep=" "): text += "\n" return text - mediaBox = createRectangleAccessor("/MediaBox", ()) + mediaBox = createRectangleAccessor(PG.MEDIABOX, 
()) """ A :class:`RectangleObject`, expressed in default user space units, defining the boundaries of the physical medium on which the page is intended to be displayed or printed. """ - cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",)) + cropBox = createRectangleAccessor("/CropBox", (PG.MEDIABOX,)) """ A :class:`RectangleObject`, expressed in default user space units, defining the visible region of default user space. When the page is @@ -2730,20 +2736,20 @@ def extractText(self, Tj_sep="", TJ_sep=" "): implementation-defined manner. Default value: same as :attr:`mediaBox`. """ - bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox")) + bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", PG.MEDIABOX)) """ A :class:`RectangleObject`, expressed in default user space units, defining the region to which the contents of the page should be clipped when output in a production enviroment. """ - trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox")) + trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", PG.MEDIABOX)) """ A :class:`RectangleObject`, expressed in default user space units, defining the intended dimensions of the finished page after trimming. 
""" - artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox")) + artBox = createRectangleAccessor("/ArtBox", ("/CropBox", PG.MEDIABOX)) """ A :class:`RectangleObject`, expressed in default user space units, defining the extent of the page's meaningful content as intended by the diff --git a/PyPDF2/utils.py b/PyPDF2/utils.py index 639471d4a..f528bfa06 100644 --- a/PyPDF2/utils.py +++ b/PyPDF2/utils.py @@ -197,10 +197,10 @@ def markLocation(stream): # Mainly for debugging RADIUS = 5000 stream.seek(-RADIUS, 1) - with open('PyPDF2_pdfLocation.txt', 'wb') as outputDoc: - outputDoc.write(stream.read(RADIUS)) - outputDoc.write(b'HERE') - outputDoc.write(stream.read(RADIUS)) + with open('PyPDF2_pdfLocation.txt', 'wb') as output_fh: + output_fh.write(stream.read(RADIUS)) + output_fh.write(b'HERE') + output_fh.write(stream.read(RADIUS)) stream.seek(-RADIUS, 1) diff --git a/PyPDF2/xmp.py b/PyPDF2/xmp.py index 9aec5e017..206317f35 100644 --- a/PyPDF2/xmp.py +++ b/PyPDF2/xmp.py @@ -1,8 +1,9 @@ -import re import datetime import decimal -from .generic import PdfObject +import re from xml.dom.minidom import parseString + +from .generic import PdfObject from .utils import u_ RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" diff --git a/Resources/encrypted-file.pdf b/Resources/encrypted-file.pdf new file mode 100644 index 000000000..c300a5fed Binary files /dev/null and b/Resources/encrypted-file.pdf differ diff --git a/Scripts/pdf-image-extractor.py b/Scripts/pdf-image-extractor.py index c2e2aa00e..cc935aad4 100644 --- a/Scripts/pdf-image-extractor.py +++ b/Scripts/pdf-image-extractor.py @@ -8,17 +8,17 @@ import sys import PyPDF2 from PyPDF2.filters import _xobj_to_image - +from PyPDF2.constants import PageAttributes as PG, ImageAttributes as IA, Ressources as RES def main(pdf: str): reader = PyPDF2.PdfFileReader(pdf) page = reader.pages[30] - if "/XObject" in page["/Resources"]: - xObject = page["/Resources"]["/XObject"].getObject() + if RES.XOBJECT in 
page[PG.RESOURCES]: + xObject = page[PG.RESOURCES][RES.XOBJECT].getObject() for obj in xObject: - if xObject[obj]["/Subtype"] == "/Image": + if xObject[obj][IA.SUBTYPE] == "/Image": extension, byte_stream = _xobj_to_image(xObject[obj]) if extension is not None: filename = obj[1:] + ".png" diff --git a/Tests/test_basic_features.py b/Tests/test_basic_features.py index 63b2a0a66..0c9e9257f 100644 --- a/Tests/test_basic_features.py +++ b/Tests/test_basic_features.py @@ -12,9 +12,9 @@ def test_basic_features(): - writer = PdfFileWriter() pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") reader = PdfFileReader(pdf_path) + writer = PdfFileWriter() # print how many pages input1 has: print("document1.pdf has %d pages." % reader.getNumPages()) @@ -54,8 +54,12 @@ def test_basic_features(): writer.encrypt(password) # finally, write "output" to PyPDF2-output.pdf - with open("PyPDF2-output.pdf", "wb") as outputStream: - writer.write(outputStream) + tmp_path = "PyPDF2-output.pdf" + with open(tmp_path, "wb") as output_stream: + writer.write(output_stream) + + # cleanup + os.remove(tmp_path) def test_convertToInt(): diff --git a/Tests/test_constants.py b/Tests/test_constants.py new file mode 100644 index 000000000..fa34357c5 --- /dev/null +++ b/Tests/test_constants.py @@ -0,0 +1,15 @@ +import re + +from PyPDF2.constants import PDF_KEYS + + +def test_slash_prefix(): + pattern = re.compile(r"^\/[A-Z]+[a-zA-Z0-9]*$") + for cls in PDF_KEYS: + for attr in dir(cls): + if attr.startswith("__") and attr.endswith("__"): + continue + constant_value = getattr(cls, attr) + assert constant_value.startswith("/") + assert pattern.match(constant_value) + assert attr.replace("_", "").lower() == constant_value[1:].lower() diff --git a/Tests/test_javascript.py b/Tests/test_javascript.py index 4048a76f0..48ad9530e 100644 --- a/Tests/test_javascript.py +++ b/Tests/test_javascript.py @@ -13,9 +13,9 @@ @pytest.fixture def pdf_file_writer(): reader = PdfFileReader(os.path.join(RESOURCE_ROOT, 
"crazyones.pdf")) - pdf_file_writer = PdfFileWriter() - pdf_file_writer.appendPagesFromReader(reader) - yield pdf_file_writer + writer = PdfFileWriter() + writer.appendPagesFromReader(reader) + yield writer def test_add_js(pdf_file_writer): diff --git a/Tests/test_merger.py b/Tests/test_merger.py index 959560d4a..072fbcb14 100644 --- a/Tests/test_merger.py +++ b/Tests/test_merger.py @@ -43,5 +43,9 @@ def test_merge(): file_merger.setPageLayout("/SinglePage") file_merger.setPageMode("/UseThumbs") - file_merger.write("dont_commit_merged.pdf") + tmp_path = "dont_commit_merged.pdf" + file_merger.write(tmp_path) file_merger.close() + + # Clean up + os.remove(tmp_path) diff --git a/Tests/test_page.py b/Tests/test_page.py index bed852080..c5ea98662 100644 --- a/Tests/test_page.py +++ b/Tests/test_page.py @@ -12,25 +12,16 @@ @pytest.mark.parametrize( "pdf_path, password", [ - (os.path.join(RESOURCE_ROOT, "crazyones.pdf"), None), - (os.path.join(RESOURCE_ROOT, "attachment.pdf"), None), - (os.path.join(RESOURCE_ROOT, "side-by-side-subfig.pdf"), None), + ("crazyones.pdf", None), + ("attachment.pdf", None), + # ("side-by-side-subfig.pdf", None), ( - os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf"), + "libreoffice-writer-password.pdf", "openpassword", ), - (os.path.join(RESOURCE_ROOT, "imagemagick-images.pdf"), None), - (os.path.join(RESOURCE_ROOT, "imagemagick-lzw.pdf"), None), - (os.path.join(RESOURCE_ROOT, "reportlab-inline-image.pdf"), None), - ], - ids=[ - "crazyones", - "attachment", - "side-by-side-subfig", - "libreoffice-writer-password", - "imagemagick-images", - "imagemagick-lzw", - "reportlab-inline-image" + ("imagemagick-images.pdf", None), + ("imagemagick-lzw.pdf", None), + ("reportlab-inline-image.pdf", None), ], ) def test_page_operations(pdf_path, password): @@ -40,6 +31,7 @@ def test_page_operations(pdf_path, password): This should be done way more thoroughly: It should be checked if the output is as expected. 
""" + pdf_path = os.path.join(RESOURCE_ROOT, pdf_path) reader = PdfFileReader(pdf_path) if password: diff --git a/Tests/test_pagerange.py b/Tests/test_pagerange.py index b90feef00..dd3c97954 100644 --- a/Tests/test_pagerange.py +++ b/Tests/test_pagerange.py @@ -62,6 +62,7 @@ def test_parse_filename_page_ranges_err(): def test_page_range_help(): from PyPDF2.pagerange import PAGE_RANGE_HELP + assert len(PAGE_RANGE_HELP) > 20 assert "0:3" in PAGE_RANGE_HELP assert PAGE_RANGE_HELP.endswith("\n") diff --git a/Tests/test_reader.py b/Tests/test_reader.py index a58130d80..d7d88bb64 100644 --- a/Tests/test_reader.py +++ b/Tests/test_reader.py @@ -5,6 +5,9 @@ import PyPDF2.utils from PyPDF2 import PdfFileReader +from PyPDF2.constants import ImageAttributes as IA +from PyPDF2.constants import PageAttributes as PG +from PyPDF2.constants import Ressources as RES from PyPDF2.filters import _xobj_to_image TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) @@ -51,8 +54,8 @@ def test_read_metadata(pdf_path, expected): docinfo = reader.getDocumentInfo() metadict = dict(docinfo) assert metadict == expected - if '/Title' in metadict: - assert metadict['/Title'] == docinfo.title + if "/Title" in metadict: + assert metadict["/Title"] == docinfo.title @pytest.mark.parametrize( @@ -66,9 +69,9 @@ def test_get_annotations(src): reader = PdfFileReader(src) for page in reader.pages: - if "/Annots" in page: - for annot in page["/Annots"]: - subtype = annot.getObject()["/Subtype"] + if PG.ANNOTS in page: + for annot in page[PG.ANNOTS]: + subtype = annot.getObject()[IA.SUBTYPE] if subtype == "/Text": annot.getObject()["/Contents"] @@ -86,10 +89,10 @@ def test_get_attachments(src): attachments = {} for i in range(reader.getNumPages()): page = reader.getPage(i) - if "/Annots" in page: - for annotation in page["/Annots"]: + if PG.ANNOTS in page: + for annotation in page[PG.ANNOTS]: annotobj = annotation.getObject() - if annotobj["/Subtype"] == "/FileAttachment": + if annotobj[IA.SUBTYPE] == 
"/FileAttachment": fileobj = annotobj["/FS"] attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].getData() return attachments @@ -120,7 +123,7 @@ def test_get_outlines(src, outline_elements): ], ) def test_get_images(src, nb_images): - src =os.path.join(RESOURCE_ROOT, src) + src = os.path.join(RESOURCE_ROOT, src) reader = PdfFileReader(src) with pytest.raises(TypeError): @@ -131,11 +134,11 @@ def test_get_images(src, nb_images): images_extracted = [] - if "/XObject" in page["/Resources"]: - xObject = page["/Resources"]["/XObject"].getObject() + if RES.XOBJECT in page[PG.RESOURCES]: + xObject = page[PG.RESOURCES][RES.XOBJECT].getObject() for obj in xObject: - if xObject[obj]["/Subtype"] == "/Image": + if xObject[obj][IA.SUBTYPE] == "/Image": extension, byte_stream = _xobj_to_image(xObject[obj]) if extension is not None: filename = obj[1:] + ".png" @@ -207,3 +210,19 @@ def test_issue297(): path = os.path.join(RESOURCE_ROOT, "issue-297.pdf") reader = PdfFileReader(path, "rb") reader.getPage(0) + + +def test_get_page_of_encrypted_file(): + """ + Check if we can read a page of an encrypted file. 
+ + This is a regression test for issue 327: + IndexError for getPage() of decrypted file + """ + path = os.path.join(RESOURCE_ROOT, "encrypted-file.pdf") + reader = PdfFileReader(path) + + # Password is correct:) + reader.decrypt("test") + + reader.getPage(0) diff --git a/Tests/test_utils.py b/Tests/test_utils.py index fb31edb9f..998172bbc 100644 --- a/Tests/test_utils.py +++ b/Tests/test_utils.py @@ -85,6 +85,7 @@ def test_matrixMultiply(a, b, expected): def test_markLocation(): stream = io.BytesIO(b"abde" * 6000) PyPDF2.utils.markLocation(stream) + os.remove("PyPDF2_pdfLocation.txt") # cleanup def test_ConvertFunctionsToVirtualList(): diff --git a/Tests/test_workflows.py b/Tests/test_workflows.py index 821fa7a49..300430657 100644 --- a/Tests/test_workflows.py +++ b/Tests/test_workflows.py @@ -7,6 +7,7 @@ import pytest from PyPDF2 import PdfFileReader +from PyPDF2.constants import PageAttributes as PG TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -23,19 +24,19 @@ def test_PdfReaderFileLoad(): with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile: # Load PDF file from file - ipdf = PdfFileReader(inputfile) - ipdf_p1 = ipdf.getPage(0) + reader = PdfFileReader(inputfile) + page = reader.getPage(0) # Retrieve the text of the PDF with open(os.path.join(RESOURCE_ROOT, "crazyones.txt"), "rb") as pdftext_file: pdftext = pdftext_file.read() - ipdf_p1_text = ipdf_p1.extractText().replace("\n", "").encode("utf-8") + text = page.extractText().replace("\n", "").encode("utf-8") # Compare the text of the PDF to a known source - assert ipdf_p1_text == pdftext, ( + assert text == pdftext, ( "PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n" - % (pdftext, ipdf_p1_text) + % (pdftext, text) ) @@ -47,15 +48,15 @@ def test_PdfReaderJpegImage(): with open(os.path.join(RESOURCE_ROOT, "jpeg.pdf"), "rb") as inputfile: # Load PDF file from file - ipdf = 
PdfFileReader(inputfile) + reader = PdfFileReader(inputfile) # Retrieve the text of the image with open(os.path.join(RESOURCE_ROOT, "jpeg.txt"), "r") as pdftext_file: imagetext = pdftext_file.read() - ipdf_p0 = ipdf.getPage(0) - xObject = ipdf_p0["/Resources"]["/XObject"].getObject() - data = xObject["/Im4"].getData() + page = reader.getPage(0) + x_object = page[PG.RESOURCES]["/XObject"].getObject() + data = x_object["/Im4"].getData() # Compare the text of the PDF to a known source assert binascii.hexlify(data).decode() == imagetext, ( @@ -68,32 +69,32 @@ def test_decrypt(): with open( os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf"), "rb" ) as inputfile: - ipdf = PdfFileReader(inputfile) - assert ipdf.isEncrypted == True - ipdf.decrypt("openpassword") - assert ipdf.getNumPages() == 1 - assert ipdf.isEncrypted == True - metadict = ipdf.getDocumentInfo() + reader = PdfFileReader(inputfile) + assert reader.isEncrypted == True + reader.decrypt("openpassword") + assert reader.getNumPages() == 1 + assert reader.isEncrypted == True + metadict = reader.getDocumentInfo() assert dict(metadict) == { "/CreationDate": "D:20220403203552+02'00'", "/Creator": "Writer", "/Producer": "LibreOffice 6.4", } # Is extractText() broken for encrypted files? 
- # assert ipdf.getPage(0).extractText().replace('\n', '') == "\n˘\n\u02c7\u02c6˙\n\n\n˘\u02c7\u02c6˙\n\n" + # assert reader.getPage(0).extractText().replace('\n', '') == "\n˘\n\u02c7\u02c6˙\n\n\n˘\u02c7\u02c6˙\n\n" @pytest.mark.parametrize("degree", [0, 90, 180, 270, 360, -90]) def test_rotate(degree): with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile: - ipdf = PdfFileReader(inputfile) - page = ipdf.getPage(0) + reader = PdfFileReader(inputfile) + page = reader.getPage(0) page.rotateCounterClockwise(degree) def test_rotate_45(): with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile: - ipdf = PdfFileReader(inputfile) - page = ipdf.getPage(0) + reader = PdfFileReader(inputfile) + page = reader.getPage(0) with pytest.raises(AssertionError): page.rotateCounterClockwise(45) diff --git a/Tests/test_writer.py b/Tests/test_writer.py index 8356c94bc..c7b058878 100644 --- a/Tests/test_writer.py +++ b/Tests/test_writer.py @@ -24,56 +24,60 @@ def test_writer_operations(): reader = PdfFileReader(pdf_path) reader_outline = PdfFileReader(pdf_outline_path) - output = PdfFileWriter() + writer = PdfFileWriter() page = reader.pages[0] with pytest.raises(PageSizeNotDefinedError): - output.addBlankPage() - output.insertPage(page, 1) - output.removeText() - output.insertPage(reader_outline.pages[0], 0) - output.addBookmarkDestination(page) - output.addBookmark("A bookmark", 0) + writer.addBlankPage() + writer.insertPage(page, 1) + writer.removeText() + writer.insertPage(reader_outline.pages[0], 0) + writer.addBookmarkDestination(page) + writer.addBookmark("A bookmark", 0) # output.addNamedDestination("A named destination", 1) - output.removeLinks() + writer.removeLinks() # assert output.getNamedDestRoot() == ['A named destination', IndirectObject(9, 0, output)] - output.addBlankPage() - output.addURI(2, "https://example.com", RectangleObject([0, 0, 100, 100])) - output.addLink(2, 1, RectangleObject([0, 0, 100, 100])) - assert 
output.getPageLayout() is None - output.setPageLayout("SinglePage") - assert output.getPageLayout() == "SinglePage" - assert output.getPageMode() is None - output.setPageMode("UseNone") - assert output.getPageMode() == "UseNone" - output.insertBlankPage(width=100, height=100) - output.insertBlankPage() # without parameters + writer.addBlankPage() + writer.addURI(2, "https://example.com", RectangleObject([0, 0, 100, 100])) + writer.addLink(2, 1, RectangleObject([0, 0, 100, 100])) + assert writer.getPageLayout() is None + writer.setPageLayout("SinglePage") + assert writer.getPageLayout() == "SinglePage" + assert writer.getPageMode() is None + writer.setPageMode("UseNone") + assert writer.getPageMode() == "UseNone" + writer.insertBlankPage(width=100, height=100) + writer.insertBlankPage() # without parameters # This gives "KeyError: '/Contents'" - is that a bug? # output.removeImages() - output.addMetadata({"author": "Martin Thoma"}) + writer.addMetadata({"author": "Martin Thoma"}) - output.addAttachment("foobar.gif", b"foobarcontent") + writer.addAttachment("foobar.gif", b"foobarcontent") # finally, write "output" to PyPDF2-output.pdf - with open("dont_commit_writer.pdf", "wb") as output_stream: - output.write(output_stream) + tmp_path = "dont_commit_writer.pdf" + with open(tmp_path, "wb") as output_stream: + writer.write(output_stream) + + # cleanup + os.remove(tmp_path) def test_remove_images(): pdf_path = os.path.join(RESOURCE_ROOT, "side-by-side-subfig.pdf") reader = PdfFileReader(pdf_path) - output = PdfFileWriter() + writer = PdfFileWriter() page = reader.pages[0] - output.insertPage(page, 0) - output.removeImages() + writer.insertPage(page, 0) + writer.removeImages() # finally, write "output" to PyPDF2-output.pdf tmp_filename = "dont_commit_writer_removed_image.pdf" with open(tmp_filename, "wb") as output_stream: - output.write(output_stream) + writer.write(output_stream) with open(tmp_filename, "rb") as input_stream: reader = PdfFileReader(input_stream) diff 
--git a/docs/index.rst b/docs/index.rst index 1b339516c..e5cdcd238 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,13 +25,8 @@ You can contribute to `PyPDF2 on Github `_. user/merging-pdfs user/cropping-and-transforming user/add-watermark - - -.. toctree:: - :caption: Scripts - :maxdepth: 1 - - user/pdfcat + user/reading-pdf-annotations + user/adding-pdf-annotations .. toctree:: @@ -59,6 +54,14 @@ You can contribute to `PyPDF2 on Github `_. user/faq +.. toctree:: + :caption: Scripts + :maxdepth: 1 + + user/pdfcat + + + Indices and tables ================== diff --git a/docs/user/adding-pdf-annotations.md b/docs/user/adding-pdf-annotations.md new file mode 100644 index 000000000..215dcfdc7 --- /dev/null +++ b/docs/user/adding-pdf-annotations.md @@ -0,0 +1,16 @@ +# Adding PDF Annotations + +## Attachments + +```python +from PyPDF2 import PdfFileWriter + +writer = PdfFileWriter() +writer.addBlankPage(width=200, height=200) + +data = b"any bytes - typically read from a file" +writer.addAttachment("smile.png", data) + +with open("output.pdf", "wb") as output_stream: + writer.write(output_stream) +``` diff --git a/docs/user/cropping-and-transforming.md b/docs/user/cropping-and-transforming.md index 9791452e5..e1958a4f4 100644 --- a/docs/user/cropping-and-transforming.md +++ b/docs/user/cropping-and-transforming.md @@ -25,7 +25,7 @@ writer.addPage(page3) # comment the the encription lines, if that's the case, to try this out: writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") -# finally, write to document-output.pdf +# write to document-output.pdf with open("PyPDF2-output.pdf", "wb") as fp: - output.write(fp) + writer.write(fp) ``` diff --git a/docs/user/reading-pdf-annotations.md b/docs/user/reading-pdf-annotations.md new file mode 100644 index 000000000..e84abd0a1 --- /dev/null +++ b/docs/user/reading-pdf-annotations.md @@ -0,0 +1,67 @@ +# Reading PDF Annotations + +PDF 1.7 defines 25 different annotation types: + +* Text +* Link +* FreeText 
+* Line, Square, Circle, Polygon, PolyLine, Highlight, Underline, Squiggly, StrikeOut +* Stamp, Caret, Ink +* Popup +* FileAttachment +* Sound, Movie +* Widget, Screen +* PrinterMark +* TrapNet +* Watermark +* 3D + +Reading the most common ones is described here. + +## Text + +```python +from PyPDF2 import PdfFileReader + +reader = PdfFileReader("example.pdf") + +for page in reader.pages: + if "/Annots" in page: + for annot in page["/Annots"]: + subtype = annot.getObject()["/Subtype"] + if subtype == "/Text": + print(annot.getObject()["/Contents"]) +``` + +## Highlights + +```python +from PyPDF2 import PdfFileReader + +reader = PdfFileReader("commented.pdf") + +for page in reader.pages: + if "/Annots" in page: + for annot in page["/Annots"]: + subtype = annot.getObject()["/Subtype"] + if subtype == "/Highlight": + coords = annot.getObject()["/QuadPoints"] + x1, y1, x2, y2, x3, y3, x4, y4 = coords +``` + +## Attachments + +```python +from PyPDF2 import PdfFileReader + +reader = PdfFileReader("example.pdf") + +attachments = {} +for page in reader.pages: + if "/Annots" in page: + for annotation in page["/Annots"]: + subtype = annotation.getObject()["/Subtype"] + if subtype == "/FileAttachment": + fileobj = annotation.getObject()["/FS"] + attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].getData() +```