From a88683223f57d2f1003bf655c58ef75953405fe6 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 28 Apr 2022 13:30:22 +0200 Subject: [PATCH 1/2] MAINT: Split pdf module --- PyPDF2/__init__.py | 3 +- PyPDF2/_page.py | 727 +++++++ PyPDF2/_reader.py | 1356 +++++++++++++ PyPDF2/_security.py | 164 ++ PyPDF2/_writer.py | 1228 ++++++++++++ PyPDF2/generic.py | 144 ++ PyPDF2/merger.py | 10 +- PyPDF2/pdf.py | 3548 +--------------------------------- Tests/test_basic_features.py | 2 +- Tests/test_page.py | 3 + Tests/test_workflows.py | 2 +- 11 files changed, 3660 insertions(+), 3527 deletions(-) create mode 100644 PyPDF2/_page.py create mode 100644 PyPDF2/_reader.py create mode 100644 PyPDF2/_security.py create mode 100644 PyPDF2/_writer.py diff --git a/PyPDF2/__init__.py b/PyPDF2/__init__.py index 8f0cf36cf..d7489672d 100644 --- a/PyPDF2/__init__.py +++ b/PyPDF2/__init__.py @@ -1,9 +1,10 @@ from PyPDF2 import pdf +from PyPDF2._reader import PdfFileReader from PyPDF2._version import __version__ +from PyPDF2._writer import PdfFileWriter from PyPDF2.merger import PdfFileMerger from PyPDF2.pagerange import PageRange, parse_filename_page_ranges from PyPDF2.papersizes import PaperSize -from PyPDF2.pdf import PdfFileReader, PdfFileWriter __all__ = [ "__version__", diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py new file mode 100644 index 000000000..0be1528a3 --- /dev/null +++ b/PyPDF2/_page.py @@ -0,0 +1,727 @@ +import math +import uuid + +from PyPDF2 import utils +from PyPDF2.constants import PageAttributes as PG +from PyPDF2.constants import Ressources as RES +from PyPDF2.errors import PageSizeNotDefinedError +from PyPDF2.generic import ( + ArrayObject, + ContentStream, + DictionaryObject, + FloatObject, + IndirectObject, + NameObject, + NullObject, + NumberObject, + RectangleObject, + TextStringObject, +) +from PyPDF2.utils import b_, u_ + + +def getRectangle(self, name, defaults): + retval = self.get(name) + if isinstance(retval, RectangleObject): + return retval + if retval 
is None: + for d in defaults: + retval = self.get(d) + if retval is not None: + break + if isinstance(retval, IndirectObject): + retval = self.pdf.getObject(retval) + retval = RectangleObject(retval) + setRectangle(self, name, retval) + return retval + + +def setRectangle(self, name, value): + if not isinstance(name, NameObject): + name = NameObject(name) + self[name] = value + + +def deleteRectangle(self, name): + del self[name] + + +def createRectangleAccessor(name, fallback): + return property( + lambda self: getRectangle(self, name, fallback), + lambda self, value: setRectangle(self, name, value), + lambda self: deleteRectangle(self, name), + ) + + +class PageObject(DictionaryObject): + """ + This class represents a single page within a PDF file. Typically this + object will be created by accessing the + :meth:`getPage()` method of the + :class:`PdfFileReader` class, but it is + also possible to create an empty page with the + :meth:`createBlankPage()` static method. + + :param pdf: PDF file the page belongs to. + :param indirectRef: Stores the original indirect reference to + this object in its source PDF + """ + + def __init__(self, pdf=None, indirectRef=None): + DictionaryObject.__init__(self) + self.pdf = pdf + self.indirectRef = indirectRef + + @staticmethod + def createBlankPage(pdf=None, width=None, height=None): + """ + Returns a new blank page. + If ``width`` or ``height`` is ``None``, try to get the page size + from the last page of *pdf*. + + :param pdf: PDF file the page belongs to + :param float width: The width of the new page expressed in default user + space units. + :param float height: The height of the new page expressed in default user + space units. 
+ :return: the new blank page: + :rtype: :class:`PageObject` + :raises PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains + no page + """ + page = PageObject(pdf) + + # Creates a new page (cf PDF Reference 7.7.3.3) + page.__setitem__(NameObject("/Type"), NameObject("/Page")) + page.__setitem__(NameObject("/Parent"), NullObject()) + page.__setitem__(NameObject(PG.RESOURCES), DictionaryObject()) + if width is None or height is None: + if pdf is not None and pdf.getNumPages() > 0: + lastpage = pdf.getPage(pdf.getNumPages() - 1) + width = lastpage.mediaBox.getWidth() + height = lastpage.mediaBox.getHeight() + else: + raise PageSizeNotDefinedError() + page.__setitem__( + NameObject(PG.MEDIABOX), RectangleObject([0, 0, width, height]) + ) + + return page + + def rotateClockwise(self, angle): + """ + Rotates a page clockwise by increments of 90 degrees. + + :param int angle: Angle to rotate the page. Must be an increment + of 90 deg. + """ + if angle % 90 != 0: + raise ValueError("Rotation angle must be a multiple of 90") + self._rotate(angle) + return self + + def rotateCounterClockwise(self, angle): + """ + Rotates a page counter-clockwise by increments of 90 degrees. + + :param int angle: Angle to rotate the page. Must be an increment + of 90 deg. 
+ """ + if angle % 90 != 0: + raise ValueError("Rotation angle must be a multiple of 90") + self._rotate(-angle) + return self + + def _rotate(self, angle): + rotate_obj = self.get("/Rotate", 0) + current_angle = ( + rotate_obj if isinstance(rotate_obj, int) else rotate_obj.getObject() + ) + self[NameObject("/Rotate")] = NumberObject(current_angle + angle) + + @staticmethod + def _mergeResources(res1, res2, resource): + new_res = DictionaryObject() + new_res.update(res1.get(resource, DictionaryObject()).getObject()) + page2res = res2.get(resource, DictionaryObject()).getObject() + rename_res = {} + for key in list(page2res.keys()): + if key in new_res and new_res.raw_get(key) != page2res.raw_get(key): + newname = NameObject(key + str(uuid.uuid4())) + rename_res[key] = newname + new_res[newname] = page2res[key] + elif key not in new_res: + new_res[key] = page2res.raw_get(key) + return new_res, rename_res + + @staticmethod + def _contentStreamRename(stream, rename, pdf): + if not rename: + return stream + stream = ContentStream(stream, pdf) + for operands, _operator in stream.operations: + if isinstance(operands, list): + for i in range(len(operands)): + op = operands[i] + if isinstance(op, NameObject): + operands[i] = rename.get(op, op) + elif isinstance(operands, dict): + for i in operands: + op = operands[i] + if isinstance(op, NameObject): + operands[i] = rename.get(op, op) + else: + raise KeyError("type of operands is %s" % type(operands)) + return stream + + @staticmethod + def _pushPopGS(contents, pdf): + # adds a graphics state "push" and "pop" to the beginning and end + # of a content stream. This isolates it from changes such as + # transformation matricies. + stream = ContentStream(contents, pdf) + stream.operations.insert(0, [[], "q"]) + stream.operations.append([[], "Q"]) + return stream + + @staticmethod + def _addTransformationMatrix(contents, pdf, ctm): + # adds transformation matrix at the beginning of the given + # contents stream. 
+ a, b, c, d, e, f = ctm + contents = ContentStream(contents, pdf) + contents.operations.insert( + 0, + [ + [ + FloatObject(a), + FloatObject(b), + FloatObject(c), + FloatObject(d), + FloatObject(e), + FloatObject(f), + ], + " cm", + ], + ) + return contents + + def getContents(self): + """ + Accesses the page contents. + + :return: the ``/Contents`` object, or ``None`` if it doesn't exist. + ``/Contents`` is optional, as described in PDF Reference 7.7.3.3 + """ + if "/Contents" in self: + return self["/Contents"].getObject() + else: + return None + + def mergePage(self, page2): + """ + Merges the content streams of two pages into one. Resource references + (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc + of this page are not altered. The parameter page's content stream will + be added to the end of this page's content stream, meaning that it will + be drawn after, or "on top" of this page. + + :param PageObject page2: The page to be merged into this one. Should be + an instance of :class:`PageObject`. + """ + self._mergePage(page2) + + def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False): + # First we work on merging the resource dictionaries. This allows us + # to find out what symbols in the content streams we might need to + # rename. + + new_resources = DictionaryObject() + rename = {} + original_resources = self[PG.RESOURCES].getObject() + page2resources = page2[PG.RESOURCES].getObject() + new_annots = ArrayObject() + + for page in (self, page2): + if PG.ANNOTS in page: + annots = page[PG.ANNOTS] + if isinstance(annots, ArrayObject): + for ref in annots: + new_annots.append(ref) + + for res in ( + "/ExtGState", + RES.FONT, + RES.XOBJECT, + RES.COLOR_SPACE, + "/Pattern", + "/Shading", + "/Properties", + ): + new, newrename = PageObject._mergeResources( + original_resources, page2resources, res + ) + if new: + new_resources[NameObject(res)] = new + rename.update(newrename) + + # Combine /ProcSet sets. 
+ new_resources[NameObject(RES.PROCSET)] = ArrayObject( + frozenset( + original_resources.get(RES.PROCSET, ArrayObject()).getObject() + ).union( + frozenset(page2resources.get(RES.PROCSET, ArrayObject()).getObject()) + ) + ) + + new_content_array = ArrayObject() + + original_content = self.getContents() + if original_content is not None: + new_content_array.append(PageObject._pushPopGS(original_content, self.pdf)) + + page2content = page2.getContents() + if page2content is not None: + page2content = ContentStream(page2content, self.pdf) + page2content.operations.insert( + 0, + [ + map( + FloatObject, + [ + page2.trimBox.getLowerLeft_x(), + page2.trimBox.getLowerLeft_y(), + page2.trimBox.getWidth(), + page2.trimBox.getHeight(), + ], + ), + "re", + ], + ) + page2content.operations.insert(1, [[], "W"]) + page2content.operations.insert(2, [[], "n"]) + if page2transformation is not None: + page2content = page2transformation(page2content) + page2content = PageObject._contentStreamRename( + page2content, rename, self.pdf + ) + page2content = PageObject._pushPopGS(page2content, self.pdf) + new_content_array.append(page2content) + + # if expanding the page to fit a new page, calculate the new media box size + if expand: + corners1 = [ + self.mediaBox.getLowerLeft_x().as_numeric(), + self.mediaBox.getLowerLeft_y().as_numeric(), + self.mediaBox.getUpperRight_x().as_numeric(), + self.mediaBox.getUpperRight_y().as_numeric(), + ] + corners2 = [ + page2.mediaBox.getLowerLeft_x().as_numeric(), + page2.mediaBox.getLowerLeft_y().as_numeric(), + page2.mediaBox.getUpperLeft_x().as_numeric(), + page2.mediaBox.getUpperLeft_y().as_numeric(), + page2.mediaBox.getUpperRight_x().as_numeric(), + page2.mediaBox.getUpperRight_y().as_numeric(), + page2.mediaBox.getLowerRight_x().as_numeric(), + page2.mediaBox.getLowerRight_y().as_numeric(), + ] + if ctm is not None: + ctm = [float(x) for x in ctm] + new_x = [ + ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] + for i in range(0, 8, 2) + 
] + new_y = [ + ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] + for i in range(0, 8, 2) + ] + else: + new_x = corners2[0:8:2] + new_y = corners2[1:8:2] + lowerleft = [min(new_x), min(new_y)] + upperright = [max(new_x), max(new_y)] + lowerleft = [min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])] + upperright = [ + max(corners1[2], upperright[0]), + max(corners1[3], upperright[1]), + ] + + self.mediaBox.setLowerLeft(lowerleft) + self.mediaBox.setUpperRight(upperright) + + self[NameObject("/Contents")] = ContentStream(new_content_array, self.pdf) + self[NameObject(PG.RESOURCES)] = new_resources + self[NameObject(PG.ANNOTS)] = new_annots + + def mergeTransformedPage(self, page2, ctm, expand=False): + """ + This is similar to mergePage, but a transformation matrix is + applied to the merged stream. + + :param PageObject page2: The page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param tuple ctm: a 6-element tuple containing the operands of the + transformation matrix + :param bool expand: Whether the page should be expanded to fit the dimensions + of the page to be merged. + """ + self._mergePage( + page2, + lambda page2Content: PageObject._addTransformationMatrix( + page2Content, page2.pdf, ctm + ), + ctm, + expand, + ) + + def mergeScaledPage(self, page2, scale, expand=False): + """ + This is similar to mergePage, but the stream to be merged is scaled + by appling a transformation matrix. + + :param PageObject page2: The page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float scale: The scaling factor + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. 
+ """ + # CTM to scale : [ sx 0 0 sy 0 0 ] + return self.mergeTransformedPage(page2, [scale, 0, 0, scale, 0, 0], expand) + + def mergeRotatedPage(self, page2, rotation, expand=False): + """ + This is similar to mergePage, but the stream to be merged is rotated + by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float rotation: The angle of the rotation, in degrees + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + """ + rotation = math.radians(rotation) + return self.mergeTransformedPage( + page2, + [ + math.cos(rotation), + math.sin(rotation), + -math.sin(rotation), + math.cos(rotation), + 0, + 0, + ], + expand, + ) + + def mergeTranslatedPage(self, page2, tx, ty, expand=False): + """ + This is similar to mergePage, but the stream to be merged is translated + by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float tx: The translation on X axis + :param float ty: The translation on Y axis + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + """ + return self.mergeTransformedPage(page2, [1, 0, 0, 1, tx, ty], expand) + + def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False): + """ + This is similar to mergePage, but the stream to be merged is rotated + and translated by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float tx: The translation on X axis + :param float ty: The translation on Y axis + :param float rotation: The angle of the rotation, in degrees + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. 
+ """ + + translation = [[1, 0, 0], [0, 1, 0], [-tx, -ty, 1]] + rotation = math.radians(rotation) + rotating = [ + [math.cos(rotation), math.sin(rotation), 0], + [-math.sin(rotation), math.cos(rotation), 0], + [0, 0, 1], + ] + rtranslation = [[1, 0, 0], [0, 1, 0], [tx, ty, 1]] + ctm = utils.matrixMultiply(translation, rotating) + ctm = utils.matrixMultiply(ctm, rtranslation) + + return self.mergeTransformedPage( + page2, + [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], + expand, + ) + + def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False): + """ + This is similar to mergePage, but the stream to be merged is rotated + and scaled by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float rotation: The angle of the rotation, in degrees + :param float scale: The scaling factor + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + """ + rotation = math.radians(rotation) + rotating = [ + [math.cos(rotation), math.sin(rotation), 0], + [-math.sin(rotation), math.cos(rotation), 0], + [0, 0, 1], + ] + scaling = [[scale, 0, 0], [0, scale, 0], [0, 0, 1]] + ctm = utils.matrixMultiply(rotating, scaling) + + return self.mergeTransformedPage( + page2, + [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], + expand, + ) + + def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False): + """ + This is similar to mergePage, but the stream to be merged is translated + and scaled by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float scale: The scaling factor + :param float tx: The translation on X axis + :param float ty: The translation on Y axis + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. 
+ """ + + translation = [[1, 0, 0], [0, 1, 0], [tx, ty, 1]] + scaling = [[scale, 0, 0], [0, scale, 0], [0, 0, 1]] + ctm = utils.matrixMultiply(scaling, translation) + + return self.mergeTransformedPage( + page2, + [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], + expand, + ) + + def mergeRotatedScaledTranslatedPage( + self, page2, rotation, scale, tx, ty, expand=False + ): + """ + This is similar to mergePage, but the stream to be merged is translated, + rotated and scaled by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float tx: The translation on X axis + :param float ty: The translation on Y axis + :param float rotation: The angle of the rotation, in degrees + :param float scale: The scaling factor + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + """ + translation = [[1, 0, 0], [0, 1, 0], [tx, ty, 1]] + rotation = math.radians(rotation) + rotating = [ + [math.cos(rotation), math.sin(rotation), 0], + [-math.sin(rotation), math.cos(rotation), 0], + [0, 0, 1], + ] + scaling = [[scale, 0, 0], [0, scale, 0], [0, 0, 1]] + ctm = utils.matrixMultiply(rotating, scaling) + ctm = utils.matrixMultiply(ctm, translation) + + return self.mergeTransformedPage( + page2, + [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], + expand, + ) + + def addTransformation(self, ctm): + """ + Applies a transformation matrix to the page. + + :param tuple ctm: A 6-element tuple containing the operands of the + transformation matrix. 
+ """ + original_content = self.getContents() + if original_content is not None: + new_content = PageObject._addTransformationMatrix( + original_content, self.pdf, ctm + ) + new_content = PageObject._pushPopGS(new_content, self.pdf) + self[NameObject("/Contents")] = new_content + + def scale(self, sx, sy): + """ + Scales a page by the given factors by appling a transformation + matrix to its content and updating the page size. + + :param float sx: The scaling factor on horizontal axis. + :param float sy: The scaling factor on vertical axis. + """ + self.addTransformation([sx, 0, 0, sy, 0, 0]) + self.mediaBox = RectangleObject( + [ + float(self.mediaBox.getLowerLeft_x()) * sx, + float(self.mediaBox.getLowerLeft_y()) * sy, + float(self.mediaBox.getUpperRight_x()) * sx, + float(self.mediaBox.getUpperRight_y()) * sy, + ] + ) + if "/VP" in self: + viewport = self["/VP"] + if isinstance(viewport, ArrayObject): + bbox = viewport[0]["/BBox"] + else: + bbox = viewport["/BBox"] + scaled_bbox = RectangleObject( + [ + float(bbox[0]) * sx, + float(bbox[1]) * sy, + float(bbox[2]) * sx, + float(bbox[3]) * sy, + ] + ) + if isinstance(viewport, ArrayObject): + self[NameObject("/VP")][NumberObject(0)][ + NameObject("/BBox") + ] = scaled_bbox + else: + self[NameObject("/VP")][NameObject("/BBox")] = scaled_bbox + + def scaleBy(self, factor): + """ + Scales a page by the given factor by appling a transformation + matrix to its content and updating the page size. + + :param float factor: The scaling factor (for both X and Y axis). + """ + self.scale(factor, factor) + + def scaleTo(self, width, height): + """ + Scales a page to the specified dimentions by appling a + transformation matrix to its content and updating the page size. + + :param float width: The new width. + :param float height: The new heigth. 
+ """ + sx = width / float( + self.mediaBox.getUpperRight_x() - self.mediaBox.getLowerLeft_x() + ) + sy = height / float( + self.mediaBox.getUpperRight_y() - self.mediaBox.getLowerLeft_y() + ) + self.scale(sx, sy) + + def compressContentStreams(self): + """ + Compresses the size of this page by joining all content streams and + applying a FlateDecode filter. + + However, it is possible that this function will perform no action if + content stream compression becomes "automatic" for some reason. + """ + content = self.getContents() + if content is not None: + if not isinstance(content, ContentStream): + content = ContentStream(content, self.pdf) + self[NameObject("/Contents")] = content.flateEncode() + + def extractText(self, Tj_sep="", TJ_sep=""): + """ + Locate all text drawing commands, in the order they are provided in the + content stream, and extract the text. This works well for some PDF + files, but poorly for others, depending on the generator used. This will + be refined in the future. Do not rely on the order of text coming out of + this function, as it will change if this function is made more + sophisticated. + + :return: a unicode string object. + """ + text = u_("") + content = self["/Contents"].getObject() + if not isinstance(content, ContentStream): + content = ContentStream(content, self.pdf) + # Note: we check all strings are TextStringObjects. ByteStringObjects + # are strings where the byte->string encoding was unknown, so adding + # them to the text here would be gibberish. 
+ for operands, operator in content.operations: + if operator == b_("Tj"): + _text = operands[0] + if isinstance(_text, TextStringObject): + text += Tj_sep + text += _text + text += "\n" + elif operator == b_("T*"): + text += "\n" + elif operator == b_("'"): + text += "\n" + _text = operands[0] + if isinstance(_text, TextStringObject): + text += operands[0] + elif operator == b_('"'): + _text = operands[2] + if isinstance(_text, TextStringObject): + text += "\n" + text += _text + elif operator == b_("TJ"): + for i in operands[0]: + if isinstance(i, TextStringObject): + text += TJ_sep + text += i + elif isinstance(i, NumberObject): + # a positive value decreases and the negative value increases + # space + if int(i) < 0: + if len(text) == 0 or text[-1] != " ": + text += " " + else: + if len(text) > 1 and text[-1] == " ": + text = text[:-1] + text += "\n" + return text + + mediaBox = createRectangleAccessor(PG.MEDIABOX, ()) + """ + A :class:`RectangleObject`, expressed in default user space units, + defining the boundaries of the physical medium on which the page is + intended to be displayed or printed. + """ + + cropBox = createRectangleAccessor("/CropBox", (PG.MEDIABOX,)) + """ + A :class:`RectangleObject`, expressed in default user space units, + defining the visible region of default user space. When the page is + displayed or printed, its contents are to be clipped (cropped) to this + rectangle and then imposed on the output medium in some + implementation-defined manner. Default value: same as :attr:`mediaBox`. + """ + + bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", PG.MEDIABOX)) + """ + A :class:`RectangleObject`, expressed in default user space units, + defining the region to which the contents of the page should be clipped + when output in a production enviroment. 
+ """ + + trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", PG.MEDIABOX)) + """ + A :class:`RectangleObject`, expressed in default user space units, + defining the intended dimensions of the finished page after trimming. + """ + + artBox = createRectangleAccessor("/ArtBox", ("/CropBox", PG.MEDIABOX)) + """ + A :class:`RectangleObject`, expressed in default user space units, + defining the extent of the page's meaningful content as intended by the + page's creator. + """ diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py new file mode 100644 index 000000000..d73977039 --- /dev/null +++ b/PyPDF2/_reader.py @@ -0,0 +1,1356 @@ +import struct +import sys +import warnings +from hashlib import md5 +from sys import version_info + +from PyPDF2 import utils +from PyPDF2._page import PageObject +from PyPDF2._security import _alg33_1, _alg34, _alg35 +from PyPDF2.constants import CatalogAttributes as CA +from PyPDF2.constants import Core as CO +from PyPDF2.constants import PageAttributes as PG +from PyPDF2.constants import PagesAttributes as PA +from PyPDF2.constants import StreamAttributes as SA +from PyPDF2.constants import TrailerKeys as TK +from PyPDF2.errors import PdfReadError, PdfReadWarning, PdfStreamError +from PyPDF2.generic import ( + ArrayObject, + BooleanObject, + ByteStringObject, + Destination, + DictionaryObject, + Field, + IndirectObject, + NameObject, + NullObject, + NumberObject, + StreamObject, + TextStringObject, + createStringObject, + readNonWhitespace, + readObject, +) +from PyPDF2.utils import ( + ConvertFunctionsToVirtualList, + b_, + formatWarning, + isString, + readUntilWhitespace, +) + +if version_info < (3, 0): + from cStringIO import StringIO +else: + from io import StringIO +if version_info < (3, 0): + BytesIO = StringIO +else: + from io import BytesIO + + +def convertToInt(d, size): + if size > 8: + raise PdfReadError("invalid size in convertToInt") + d = b_("\x00\x00\x00\x00\x00\x00\x00\x00") + b_(d) + d = d[-8:] + return 
struct.unpack(">q", d)[0] + + +class DocumentInformation(DictionaryObject): + """ + A class representing the basic document metadata provided in a PDF File. + This class is accessible through + :meth:`.getDocumentInfo()` + + All text properties of the document metadata have + *two* properties, eg. author and author_raw. The non-raw property will + always return a ``TextStringObject``, making it ideal for a case where + the metadata is being displayed. The raw property can sometimes return + a ``ByteStringObject``, if PyPDF2 was unable to decode the string's + text encoding; this requires additional safety in the caller and + therefore is not as commonly accessed. + """ + + def __init__(self): + DictionaryObject.__init__(self) + + def getText(self, key): + retval = self.get(key, None) + if isinstance(retval, TextStringObject): + return retval + return None + + @property + def title(self): + """Read-only property accessing the document's **title**. + Returns a unicode string (``TextStringObject``) or ``None`` + if the title is not specified.""" + return ( + self.getText("/Title") or self.get("/Title").getObject() + if self.get("/Title") + else None + ) + + @property + def title_raw(self): + """The "raw" version of title; can return a ``ByteStringObject``.""" + return self.get("/Title") + + @property + def author(self): + """Read-only property accessing the document's **author**. + Returns a unicode string (``TextStringObject``) or ``None`` + if the author is not specified.""" + return self.getText("/Author") + + @property + def author_raw(self): + """The "raw" version of author; can return a ``ByteStringObject``.""" + return self.get("/Author") + + @property + def subject(self): + """Read-only property accessing the document's **subject**. 
+ Returns a unicode string (``TextStringObject``) or ``None`` + if the subject is not specified.""" + return self.getText("/Subject") + + @property + def subject_raw(self): + """The "raw" version of subject; can return a ``ByteStringObject``.""" + return self.get("/Subject") + + @property + def creator(self): + """Read-only property accessing the document's **creator**. If the + document was converted to PDF from another format, this is the name of the + application (e.g. OpenOffice) that created the original document from + which it was converted. Returns a unicode string (``TextStringObject``) + or ``None`` if the creator is not specified.""" + return self.getText("/Creator") + + @property + def creator_raw(self): + """The "raw" version of creator; can return a ``ByteStringObject``.""" + return self.get("/Creator") + + @property + def producer(self): + """Read-only property accessing the document's **producer**. + If the document was converted to PDF from another format, this is + the name of the application (for example, OSX Quartz) that converted + it to PDF. Returns a unicode string (``TextStringObject``) + or ``None`` if the producer is not specified.""" + return self.getText("/Producer") + + @property + def producer_raw(self): + """The "raw" version of producer; can return a ``ByteStringObject``.""" + return self.get("/Producer") + + +class PdfFileReader(object): + """ + Initializes a PdfFileReader object. This operation can take some time, as + the PDF stream's cross-reference tables are read into memory. + + :param stream: A File object or an object that supports the standard read + and seek methods similar to a File object. Could also be a + string representing a path to a PDF file. + :param bool strict: Determines whether user should be warned of all + problems and also causes some correctable problems to be fatal. + Defaults to ``True``. + :param warndest: Destination for logging warnings (defaults to + ``sys.stderr``). 
+ :param bool overwriteWarnings: Determines whether to override Python's + ``warnings.py`` module with a custom implementation (defaults to + ``True``). + """ + + def __init__(self, stream, strict=True, warndest=None, overwriteWarnings=True): + if overwriteWarnings: + # Have to dynamically override the default showwarning since there are no + # public methods that specify the 'file' parameter + def _showwarning( + message, category, filename, lineno, file=warndest, line=None + ): + if file is None: + file = sys.stderr + try: + # It is possible for sys.stderr to be defined as None, most commonly in the case that the script + # is being run vida pythonw.exe on Windows. In this case, just swallow the warning. + # See also https://docs.python.org/3/library/sys.html# sys.__stderr__ + if file is not None: + file.write( + formatWarning(message, category, filename, lineno, line) + ) + except IOError: + pass + + warnings.showwarning = _showwarning + self.strict = strict + self.flattenedPages = None + self.resolvedObjects = {} + self.xrefIndex = 0 + self._pageId2Num = None # map page IndirectRef number to Page Number + if hasattr(stream, "mode") and "b" not in stream.mode: + warnings.warn( + "PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", + PdfReadWarning, + ) + if isString(stream): + with open(stream, "rb") as fileobj: + stream = BytesIO(b_(fileobj.read())) + self.read(stream) + self.stream = stream + + self._override_encryption = False + + def getDocumentInfo(self): + """ + Retrieves the PDF file's document information dictionary, if it exists. + Note that some PDF files use metadata streams instead of docinfo + dictionaries, and these metadata streams will not be accessed by this + function. + + :return: the document information of this PDF file + :rtype: :class:`DocumentInformation` or ``None`` if none exists. 
+ """ + if TK.INFO not in self.trailer: + return None + obj = self.trailer[TK.INFO] + retval = DocumentInformation() + retval.update(obj) + return retval + + @property + def documentInfo(self): + """Read-only property that accesses the :meth:`getDocumentInfo()` function.""" + return self.getDocumentInfo() + + def getXmpMetadata(self): + """ + Retrieves XMP (Extensible Metadata Platform) data from the PDF document + root. + + :return: a :class:`XmpInformation` + instance that can be used to access XMP metadata from the document. + :rtype: :class:`XmpInformation` or + ``None`` if no metadata was found on the document root. + """ + try: + self._override_encryption = True + return self.trailer[TK.ROOT].getXmpMetadata() + finally: + self._override_encryption = False + + @property + def xmpMetadata(self): + """ + Read-only property that accesses the + :meth:`getXmpMetadata()` function. + """ + return self.getXmpMetadata() + + def getNumPages(self): + """ + Calculates the number of pages in this PDF file. + + :return: number of pages + :rtype: int + :raises PdfReadError: if file is encrypted and restrictions prevent + this action. + """ + + # Flattened pages will not work on an Encrypted PDF; + # the PDF file's page count is used in this case. Otherwise, + # the original method (flattened page count) is used. + if self.isEncrypted: + try: + self._override_encryption = True + self.decrypt("") + return self.trailer[TK.ROOT]["/Pages"]["/Count"] + except Exception: + raise PdfReadError("File has not been decrypted") + finally: + self._override_encryption = False + else: + if self.flattenedPages is None: + self._flatten() + return len(self.flattenedPages) + + @property + def numPages(self): + """ + Read-only property that accesses the + :meth:`getNumPages()` function. + """ + return self.getNumPages() + + def getPage(self, pageNumber): + """ + Retrieves a page by number from this PDF file. 
def getFields(self, tree=None, retval=None, fileobj=None):
    """
    Extracts field data if this PDF contains interactive form fields.
    The *tree* and *retval* parameters are for recursive use.

    :param fileobj: A file object (usually a text file) to write
        a report to on all interactive form fields found.
    :return: A dictionary where each key is a field name, and each
        value is a :class:`Field` object. By
        default, the mapping name is used for keys.
    :rtype: dict, or ``None`` if form data could not be located.
    """
    # A select group of relevant field attributes. For the complete list,
    # see section 8.6.2 of the PDF 1.7 reference.
    field_attributes = {
        "/FT": "Field Type",
        PA.PARENT: "Parent",
        "/T": "Field Name",
        "/TU": "Alternate Field Name",
        "/TM": "Mapping Name",
        "/Ff": "Field Flags",
        "/V": "Value",
        "/DV": "Default Value",
    }

    if retval is None:
        # Top-level (non-recursive) call: start from the catalog's AcroForm.
        retval = {}
        catalog = self.trailer[TK.ROOT]
        if "/AcroForm" not in catalog:
            return None
        tree = catalog["/AcroForm"]
    if tree is None:
        return retval

    self._checkKids(tree, retval, fileobj)

    # If the node carries any known field attribute it is itself a field.
    if any(attr in tree for attr in field_attributes):
        self._buildField(tree, retval, fileobj, field_attributes)

    if "/Fields" in tree:
        for field_ref in tree["/Fields"]:
            self._buildField(
                field_ref.getObject(), retval, fileobj, field_attributes
            )

    return retval
def getFormTextFields(self):
    """
    Retrieves form fields from the document with textual data.

    Only fields whose ``/FT`` entry equals ``/Tx`` are included.

    :return: A dictionary mapping each text field's ``/T`` name to its
        ``/V`` value (``None`` when the field has no value). An empty
        dictionary is returned when :meth:`getFields()` finds no form data.
    :rtype: dict
    """
    all_fields = self.getFields()
    if all_fields is None:
        return {}

    text_fields = {}
    for field in all_fields.values():
        if field.get("/FT") == "/Tx":
            text_fields[field["/T"]] = field.get("/V")
    return text_fields
def _getPageNumberByIndirect(self, indirectRef):
    """
    Map an indirect reference (or its object number) to a page index.

    The ``_pageId2Num`` lookup table is built from ``self.pages`` on first
    use and cached on the instance.

    :param indirectRef: either an ``int`` object number or an object with
        an ``idnum`` attribute.
    :return: the zero-based page number, or -1 if the id is not a page
    :rtype: int
    """
    if self._pageId2Num is None:
        self._pageId2Num = {
            page.indirectRef.idnum: number
            for number, page in enumerate(self.pages)
        }

    idnum = indirectRef if isinstance(indirectRef, int) else indirectRef.idnum
    return self._pageId2Num.get(idnum, -1)
def _buildOutline(self, node):
    """
    Build the outline entry described by an outline *node*.

    A node is only considered when it has a ``/Title`` and either an
    ``/A`` action (only type ``/GoTo`` is supported) or a ``/Dest`` entry.

    :param node: outline item dictionary
    :return: the destination built for the node, or ``None`` when the node
        yields no destination.
    :raises PdfReadError: if the destination is neither an array nor the
        name of a known named destination.
    """
    if "/Title" not in node:
        return None

    if "/A" in node:
        # Action, section 8.5 (only type GoTo supported)
        title = node["/Title"]
        action = node["/A"]
        target = action["/D"] if action["/S"] == "/GoTo" else None
    elif "/Dest" in node:
        # Destination, section 8.2.1
        title = node["/Title"]
        target = node["/Dest"]
    else:
        return None

    if not target:
        return None

    if isinstance(target, ArrayObject):
        # Explicit destination array.
        return self._buildDestination(title, target)
    if isString(target) and target in self._namedDests:
        # Named destination: reuse the stored object and give it this title.
        named = self._namedDests[target]
        named[NameObject("/Title")] = title
        return named
    raise PdfReadError("Unexpected destination %r" % target)
def _flatten(self, pages=None, inherit=None, indirectRef=None):
    """
    Walk the page tree and collect every leaf page in ``self.flattenedPages``.

    Called without arguments it starts at the catalog's ``/Pages`` node and
    resets ``self.flattenedPages``; the parameters are for recursive use.

    :param pages: the page-tree node to process (``/Pages`` or ``/Page``).
    :param dict inherit: inheritable attributes collected from the
        ancestors of *pages*.
    :param indirectRef: the indirect reference through which *pages* was
        reached, passed on to the created :class:`PageObject`.
    """
    inheritablePageAttributes = (
        NameObject(PG.RESOURCES),
        NameObject(PG.MEDIABOX),
        NameObject(PG.CROPBOX),
        NameObject(PG.ROTATE),
    )
    if inherit is None:
        inherit = {}
    if pages is None:
        # Fix issue 327: set flattenedPages attribute only for
        # decrypted file
        catalog = self.trailer[TK.ROOT].getObject()
        pages = catalog["/Pages"].getObject()
        self.flattenedPages = []

    # A node without an explicit type is treated as an intermediate node.
    t = "/Pages"
    if PA.TYPE in pages:
        t = pages[PA.TYPE]

    if t == "/Pages":
        # Work on a copy: attributes defined on this node must reach only
        # its own descendants. Mutating the caller's dict would leak them
        # to sibling subtrees processed afterwards.
        inherit = dict(inherit)
        for attr in inheritablePageAttributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in pages[PA.KIDS]:
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirectRef"] = page
            self._flatten(page.getObject(), inherit, **addt)
    elif t == "/Page":
        for attr, value in inherit.items():
            # if the page has its own value, it does not inherit the
            # parent's value:
            if attr not in pages:
                pages[attr] = value
        page_obj = PageObject(self, indirectRef)
        page_obj.update(pages)
        self.flattenedPages.append(page_obj)
NumberObject.readFromStream(stream_data) + readNonWhitespace(stream_data) + stream_data.seek(-1, 1) + offset = NumberObject.readFromStream(stream_data) + readNonWhitespace(stream_data) + stream_data.seek(-1, 1) + if objnum != indirectReference.idnum: + # We're only interested in one object + continue + if self.strict and idx != i: + raise PdfReadError("Object is in wrong index.") + stream_data.seek(obj_stm["/First"] + offset, 0) + try: + obj = readObject(stream_data, self) + except PdfStreamError as e: + # Stream object cannot be read. Normally, a critical error, but + # Adobe Reader doesn't complain, so continue (in strict mode?) + e = sys.exc_info()[1] + warnings.warn( + "Invalid stream (index %d) within object %d %d: %s" + % (i, indirectReference.idnum, indirectReference.generation, e), + PdfReadWarning, + ) + + if self.strict: + raise PdfReadError("Can't read object stream: %s" % e) + # Replace with null. Hopefully it's nothing important. + obj = NullObject() + return obj + + if self.strict: + raise PdfReadError("This is a fatal error in strict mode.") + return NullObject() + + def getObject(self, indirectReference): + retval = self.cacheGetIndirectObject( + indirectReference.generation, indirectReference.idnum + ) + if retval is not None: + return retval + if ( + indirectReference.generation == 0 + and indirectReference.idnum in self.xref_objStm + ): + retval = self._getObjectFromStream(indirectReference) + elif ( + indirectReference.generation in self.xref + and indirectReference.idnum in self.xref[indirectReference.generation] + ): + start = self.xref[indirectReference.generation][indirectReference.idnum] + self.stream.seek(start, 0) + idnum, generation = self.readObjectHeader(self.stream) + if idnum != indirectReference.idnum and self.xrefIndex: + # Xref table probably had bad indexes due to not being zero-indexed + if self.strict: + raise PdfReadError( + "Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." 
+ % ( + indirectReference.idnum, + indirectReference.generation, + idnum, + generation, + ) + ) + else: + pass # xref table is corrected in non-strict mode + elif idnum != indirectReference.idnum and self.strict: + # some other problem + raise PdfReadError( + "Expected object ID (%d %d) does not match actual (%d %d)." + % ( + indirectReference.idnum, + indirectReference.generation, + idnum, + generation, + ) + ) + if self.strict: + assert generation == indirectReference.generation + retval = readObject(self.stream, self) + + # override encryption is used for the /Encrypt dictionary + if not self._override_encryption and self.isEncrypted: + # if we don't have the encryption key: + if not hasattr(self, "_decryption_key"): + raise PdfReadError("file has not been decrypted") + # otherwise, decrypt here... + pack1 = struct.pack("= 3 + if self.strict and len(entry_sizes) > 3: + raise PdfReadError("Too many entry sizes: %s" % entry_sizes) + + def get_entry(i): + # Reads the correct number of bytes for each entry. See the + # discussion of the W parameter in PDF spec table 17. + if entry_sizes[i] > 0: + d = stream_data.read(entry_sizes[i]) + return convertToInt(d, entry_sizes[i]) + + # PDF Spec Table 17: A value of zero for an element in the + # W array indicates...the default value shall be used + if i == 0: + return 1 # First value defaults to 1 + else: + return 0 + + def used_before(num, generation): + # We move backwards through the xrefs, don't replace any. + return num in self.xref.get(generation, []) or num in self.xref_objStm + + # Iterate through each subsection + self._read_xref_subsections(idx_pairs, get_entry, used_before) + return xrefstream + + @staticmethod + def _get_xref_issues(stream, startxref): + """Returns an int which indicates an issue. 
def _rebuild_xref_table(self, stream):
    """
    Rebuild ``self.xref`` by scanning the whole file for object headers.

    Every ``<id> <generation> obj`` header found is recorded with the byte
    offset of its id. The dictionary following the last ``trailer`` keyword
    is then read and its entries are added to ``self.trailer`` for keys
    that are not present yet.

    :param stream: seekable binary stream containing the PDF data.
    """
    import re

    self.xref = {}
    stream.seek(0, 0)
    raw = stream.read(-1)

    object_header = b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj")
    for match in re.finditer(object_header, raw):
        idnum = int(match.group(1))
        generation = int(match.group(2))
        self.xref.setdefault(generation, {})[idnum] = match.start(1)

    # Offset, relative to the end of the file, of the byte right after the
    # last "trailer" keyword (7 == len("trailer")).
    after_trailer = raw.rfind(b"trailer") - len(raw) + 7
    stream.seek(after_trailer, 2)
    # code below duplicated
    readNonWhitespace(stream)
    stream.seek(-1, 1)
    recovered_trailer = readObject(stream, self)
    for key, value in list(recovered_trailer.items()):
        if key not in self.trailer:
            self.trailer[key] = value
def _pairs(self, array):
    """
    Yield the items of *array* two at a time.

    :param array: an indexable sequence, read as consecutive
        ``(first, second)`` pairs.
    :return: a generator of ``(array[i], array[i + 1])`` tuples for
        ``i = 0, 2, 4, ...``. A trailing unpaired element is ignored, and
        an empty or single-element sequence yields nothing instead of
        raising ``IndexError``.
    """
    # Stop at len - 1 so that array[i + 1] always exists.
    for i in range(0, len(array) - 1, 2):
        yield array[i], array[i + 1]
def decode_permissions(self, permissions_code):
    """
    Decode a permissions integer into named boolean flags.

    :param int permissions_code: the permissions value as an integer.
    :return: a dictionary mapping each permission name to ``True`` when
        the corresponding bit (numbered from 1) is set.
    :rtype: dict
    """
    # (permission name, 1-based bit position)
    permission_bits = (
        ("print", 3),
        ("modify", 4),
        ("copy", 5),
        ("annotations", 6),
        ("forms", 9),
        ("accessability", 10),
        ("assemble", 11),
        ("print_high_quality", 12),
    )
    return {
        name: (permissions_code & (1 << (bit - 1))) != 0
        for name, bit in permission_bits
    }
def _authenticateUserPassword(self, password):
    """
    Check *password* against the ``/U`` entry of the encryption dictionary.

    :param password: the candidate user password.
    :return: a tuple ``(matched, key)``. *matched* tells whether the ``/U``
        value computed from *password* equals the stored one (for revision
        3 and above only the first 16 bytes are compared). *key* is the
        encryption key derived from *password*.
    :rtype: tuple
    """
    encrypt = self.trailer[TK.ENCRYPT].getObject()
    revision = encrypt["/R"].getObject()
    owner_entry = encrypt["/O"].getObject()
    p_entry = encrypt["/P"].getObject()

    if TK.ID not in self.trailer:
        # Some documents may not have a /ID, use two empty
        # byte strings instead. Solves
        # https://github.com/mstamy2/PyPDF2/issues/608
        id_entry = ArrayObject([ByteStringObject(b""), ByteStringObject(b"")])
    else:
        id_entry = self.trailer[TK.ID].getObject()
    first_id = id_entry[0].getObject()
    stored_u = encrypt["/U"].getObject().original_bytes

    if revision == 2:
        computed_u, key = _alg34(password, owner_entry, p_entry, first_id)
    elif revision >= 3:
        key_length = encrypt[SA.LENGTH].getObject() // 8  # bits -> bytes
        encrypt_metadata = encrypt.get(
            "/EncryptMetadata", BooleanObject(False)
        ).getObject()
        computed_u, key = _alg35(
            password,
            revision,
            key_length,
            owner_entry,
            p_entry,
            first_id,
            encrypt_metadata,
        )
        # Only the first 16 bytes are compared for these revisions.
        computed_u, stored_u = computed_u[:16], stored_u[:16]
    return computed_u == stored_u, key
+ password = b_((str_(password) + str_(_encryption_padding))[:32]) + # 2. Initialize the MD5 hash function and pass the result of step 1 as + # input to this function. + m = md5(password) + # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash + # function. + m.update(owner_entry.original_bytes) + # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass + # these bytes to the MD5 hash function, low-order byte first. + p_entry = struct.pack("= 3 and not metadata_encrypt: + m.update(b_("\xff\xff\xff\xff")) + # 7. Finish the hash. + md5_hash = m.digest() + # 8. (Revision 3 or greater) Do the following 50 times: Take the output + # from the previous MD5 hash and pass the first n bytes of the output as + # input into a new MD5 hash, where n is the number of bytes of the + # encryption key as defined by the value of the encryption dictionary's + # /Length entry. + if rev >= 3: + for _ in range(50): + md5_hash = md5(md5_hash[:keylen]).digest() + # 9. Set the encryption key to the first n bytes of the output from the + # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or + # greater, depends on the value of the encryption dictionary's /Length + # entry. + return md5_hash[:keylen] + + +# Implementation of algorithm 3.3 of the PDF standard security handler, +# section 3.5.2 of the PDF 1.6 reference. +def _alg33(owner_pwd, user_pwd, rev, keylen): + # steps 1 - 4 + key = _alg33_1(owner_pwd, rev, keylen) + # 5. Pad or truncate the user password string as described in step 1 of + # algorithm 3.2. + user_pwd = b_((user_pwd + str_(_encryption_padding))[:32]) + # 6. Encrypt the result of step 5, using an RC4 encryption function with + # the encryption key obtained in step 4. + val = utils.RC4_encrypt(key, user_pwd) + # 7. 
# Steps 1-4 of algorithm 3.3
def _alg33_1(password, rev, keylen):
    """
    Derive the RC4 key used for the ``/O`` entry (algorithm 3.3, steps 1-4).

    :param password: the owner password (or the user password when there
        is no owner password). It is normalized with ``str_`` exactly as
        ``_alg32`` does, so a bytes password is padded rather than raising
        ``TypeError`` on concatenation.
    :param int rev: the standard security handler revision number.
    :param int keylen: the key length in bytes.
    :return: the first *keylen* bytes of the final MD5 digest.
    """
    # 1. Pad or truncate the owner password string as described in step 1 of
    # algorithm 3.2. If there is no owner password, use the user password
    # instead.
    password = b_((str_(password) + str_(_encryption_padding))[:32])
    # 2. Initialize the MD5 hash function and pass the result of step 1 as
    # input to this function.
    m = md5(password)
    # 3. (Revision 3 or greater) Do the following 50 times: Take the output
    # from the previous MD5 hash and pass it as input into a new MD5 hash.
    md5_hash = m.digest()
    if rev >= 3:
        for _ in range(50):
            md5_hash = md5(md5_hash).digest()
    # 4. Create an RC4 encryption key using the first n bytes of the output
    # from the final MD5 hash, where n is always 5 for revision 2 but, for
    # revision 3 or greater, depends on the value of the encryption
    # dictionary's /Length entry.
    return md5_hash[:keylen]
# Implementation of algorithm 3.5 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
    """
    Compute the ``/U`` value for revision 3 or greater (algorithm 3.5).

    :return: a tuple ``(U, key)``: the 32-byte ``/U`` value and the
        encryption key derived from *password*.
    """
    # NOTE(review): metadata_encrypt is accepted but not forwarded to
    # _alg32, which therefore uses its own default - confirm intended.
    # 1. Create an encryption key based on the user password string, as
    # described in Algorithm 3.2.
    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
    # 2. Initialize the MD5 hash function and pass the 32-byte padding string
    # shown in step 1 of Algorithm 3.2 as input to this function.
    hasher = md5(_encryption_padding)
    # 3. Pass the first element of the file's file identifier array (the value
    # of the ID entry in the document's trailer dictionary; see Table 3.13 on
    # page 73) to the hash function and finish the hash. (See implementation
    # note 25 in Appendix H.)
    hasher.update(id1_entry.original_bytes)
    # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption
    # function with the encryption key from step 1.
    val = utils.RC4_encrypt(key, hasher.digest())
    # 5. Do the following 19 times: Take the output from the previous
    # invocation of the RC4 function and pass it as input to a new invocation
    # of the function; use an encryption key generated by taking each byte of
    # the original encryption key and performing an XOR operation between
    # that byte and the single-byte value of the iteration counter (from 1
    # to 19).
    for counter in range(1, 20):
        round_key = b_("").join(b_(chr(ord_(k) ^ counter)) for k in key)
        val = utils.RC4_encrypt(round_key, val)
    # 6. Append 16 bytes of arbitrary padding to the output from the final
    # invocation of the RC4 function and store the 32-byte result as the value
    # of the U entry in the encryption dictionary. Null bytes are used as the
    # "arbitrary padding".
    return val + (b_("\x00") * 16), key
def set_need_appearances_writer(self):
    """
    Set ``/NeedAppearances`` to true on the catalog's ``/AcroForm`` entry.

    If the catalog has no ``/AcroForm`` yet, an indirect reference is
    added for it first. Any exception is logged instead of propagated.
    """
    # See 12.7.2 and 7.7.2 for more information:
    # http://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
    try:
        catalog = self._root_object
        # get the AcroForm tree
        if "/AcroForm" not in catalog:
            # NOTE(review): this reference uses the number of the most
            # recently added object, not a newly created AcroForm
            # dictionary - confirm that is intended.
            self._root_object.update(
                {
                    NameObject("/AcroForm"): IndirectObject(
                        len(self._objects), 0, self
                    )
                }
            )

        need_appearances = NameObject("/NeedAppearances")
        self._root_object["/AcroForm"][need_appearances] = BooleanObject(True)

    except Exception as e:
        # The message needs a %s placeholder for the argument; without it
        # logging fails to format the record and the error text is lost.
        logger.error("set_need_appearances_writer() catch : %s", repr(e))
def insertBlankPage(self, width=None, height=None, index=0):
    """
    Insert a blank page into this PDF file and return it.

    If ``width`` or ``height`` is missing and a page already exists at
    ``index``, both dimensions are taken from that page's media box.
    Otherwise the values are handed to ``PageObject.createBlankPage``
    unchanged.

    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default
        user space units.
    :param int index: Position to add the page.
    :return: the newly inserted page
    :rtype: :class:`PageObject`
    :raises PageSizeNotDefinedError: may be raised by
        ``PageObject.createBlankPage`` when no size could be determined.
    """
    # Parenthesised on purpose: without it ``and`` binds tighter than ``or``
    # and a missing width alone would call getPage() with a bad index.
    size_missing = width is None or height is None
    if size_missing and (self.getNumPages() - 1) >= index:
        oldpage = self.getPage(index)
        width = oldpage.mediaBox.getWidth()
        height = oldpage.mediaBox.getHeight()
    page = PageObject.createBlankPage(self, width, height)
    self.insertPage(page, index)
    return page
+ + Reference: + https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf + Section 7.11.3 + """ + # We need three entries: + # * The file's data + # * The /Filespec entry + # * The file's name, which goes in the Catalog + + # The entry for the file + """ Sample: + 8 0 obj + << + /Length 12 + /Type /EmbeddedFile + >> + stream + Hello world! + endstream + endobj + """ + file_entry = DecodedStreamObject() + file_entry.setData(fdata) + file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")}) + + # The Filespec entry + """ Sample: + 7 0 obj + << + /Type /Filespec + /F (hello.txt) + /EF << /F 8 0 R >> + >> + """ + ef_entry = DictionaryObject() + ef_entry.update({NameObject("/F"): file_entry}) + + filespec = DictionaryObject() + filespec.update( + { + NameObject(PA.TYPE): NameObject("/Filespec"), + NameObject("/F"): createStringObject( + fname + ), # Perhaps also try TextStringObject + NameObject("/EF"): ef_entry, + } + ) + + # Then create the entry for the root, as it needs a reference to the Filespec + """ Sample: + 1 0 obj + << + /Type /Catalog + /Outlines 2 0 R + /Pages 3 0 R + /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >> + >> + endobj + + """ + embeddedFilesNamesDictionary = DictionaryObject() + embeddedFilesNamesDictionary.update( + {NameObject(CA.NAMES): ArrayObject([createStringObject(fname), filespec])} + ) + + embeddedFilesDictionary = DictionaryObject() + embeddedFilesDictionary.update( + {NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary} + ) + # Update the root + self._root_object.update({NameObject(CA.NAMES): embeddedFilesDictionary}) + + def appendPagesFromReader(self, reader, after_page_append=None): + """ + Copy pages from reader to writer. Includes an optional callback parameter + which is invoked after pages are appended to the writer. + + :param reader: a PdfFileReader object from which to copy page + annotations to this writer object. 
def updatePageFormFieldValues(self, page, fields, flags=0):
    """
    Update the form field values for a given page from a fields dictionary.

    For each annotation on the page whose ``/T`` matches a key in ``fields``,
    ``/V`` is set to the corresponding value. If the annotation itself does
    not match but its parent's ``/T`` does, the parent's ``/V`` is set
    instead. A page without annotations is left untouched.

    :param page: Page reference from PDF writer where the annotations
        and field data will be updated.
    :param fields: a Python dictionary of field names (/T) and text
        values (/V)
    :param flags: An integer (0 to 7). The first bit sets ReadOnly, the
        second bit sets Required, the third bit sets NoExport. See
        PDF Reference Table 8.70 for details. Only written (as ``/Ff``)
        to directly matching annotations, and only when non-zero.
    """
    if PG.ANNOTS not in page:
        # No annotations means no form fields; avoid a KeyError below.
        logger.warning("No fields to update on this page")
        return

    for annot_ref in page[PG.ANNOTS]:
        writer_annot = annot_ref.getObject()
        # retrieve parent field values, if present
        writer_parent_annot = {}  # fallback if it's not there
        if PG.PARENT in writer_annot:
            writer_parent_annot = writer_annot[PG.PARENT]
        for field in fields:
            if writer_annot.get("/T") == field:
                writer_annot.update(
                    {NameObject("/V"): TextStringObject(fields[field])}
                )
                if flags:
                    writer_annot.update({NameObject("/Ff"): NumberObject(flags)})
            elif writer_parent_annot.get("/T") == field:
                writer_parent_annot.update(
                    {NameObject("/V"): TextStringObject(fields[field])}
                )
def write(self, stream):
    """
    Serialize everything added to this writer as a PDF file.

    :param stream: An object to write the file to. The object must support
        the write method and the tell method, similar to a file object.
    """
    opened_as_text = hasattr(stream, "mode") and "b" not in stream.mode
    if opened_as_text:
        warnings.warn(
            "File <%s> to write to is not in binary mode. It may not be written to correctly."
            % stream.name
        )

    # The catalog is registered lazily, on the first write.
    if not self._root:
        self._root = self._addObject(self._root_object)

    # Pages copied from a reader remember their original indirect reference.
    # Seeding the map (reader -> generation -> idnum -> new reference) before
    # the sweep makes references back to such a page (for example from its
    # annotations) resolve to the object already held by this writer instead
    # of pulling in a second copy of the page.
    external_reference_map = {}
    for position, candidate in enumerate(self._objects):
        if not isinstance(candidate, PageObject):
            continue
        source_ref = candidate.indirectRef
        if source_ref is None:
            continue
        by_generation = external_reference_map.setdefault(source_ref.pdf, {})
        by_idnum = by_generation.setdefault(source_ref.generation, {})
        by_idnum[source_ref.idnum] = IndirectObject(position + 1, 0, self)

    # self.stack only exists for the duration of the sweep.
    self.stack = []
    self._sweepIndirectReferences(external_reference_map, self._root)
    del self.stack

    positions = self._write_header(stream)
    xref_offset = self._write_xref_table(stream, positions)
    self._write_trailer(stream)
    stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_offset)))  # eof
def addNamedDestination(self, title, pagenum):
    """
    Add a named destination pointing at a page of this writer.

    A ``/GoTo`` dictionary with a ``/FitH`` destination (coordinate 826) is
    registered as an indirect object, and ``title`` plus the new reference
    are appended to the array returned by ``getNamedDestRoot()``.

    :param title: Name of the destination. Plain Python strings are
        converted to PDF string objects.
    :param int pagenum: Index of the target page.
    :return: indirect reference to the created destination dictionary.
    """
    page_ref = self.getObject(self._pages)[PA.KIDS][pagenum]
    dest = DictionaryObject()
    dest.update(
        {
            NameObject("/D"): ArrayObject(
                [page_ref, NameObject("/FitH"), NumberObject(826)]
            ),
            NameObject("/S"): NameObject("/GoTo"),
        }
    )

    dest_ref = self._addObject(dest)
    nd = self.getNamedDestRoot()

    # Everything stored in a PDF array must be a PDF object; a bare str
    # cannot be serialized when the file is written.
    if not isinstance(title, (TextStringObject, ByteStringObject)):
        title = createStringObject(title)
    nd.extend([title, dest_ref])

    return dest_ref
def removeImages(self, ignoreByteStringObject=False):
    """
    Removes images from this output.

    Every page's content stream is rewritten: all ``re`` operators are
    dropped, and between a ``q`` and the following ``Q`` every operator
    listed in ``dropped_inside_q`` below is dropped as well.

    :param bool ignoreByteStringObject: optional parameter
        to ignore ByteString Objects. When true, operands of the text
        showing operators that are not :class:`TextStringObject` are
        replaced by an empty :class:`TextStringObject`.
    """
    kids = self.getObject(self._pages)[PA.KIDS]
    dropped_inside_q = [
        b_(name)
        for name in (
            "cm", "w", "J", "j", "M", "d", "ri", "i", "gs", "W", "b", "s", "S",
            "f", "F", "n", "m", "l", "c", "v", "y", "h", "B", "Do", "sh",
        )
    ]
    show_text_ops = [b_("Tj"), b_("'")]

    for kid in kids:
        page_ref = self.getObject(kid)
        content = page_ref["/Contents"].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, page_ref)

        kept = []
        inside_q = False
        for operands, operator in content.operations:
            if operator in show_text_ops:
                first = operands[0]
                if ignoreByteStringObject and not isinstance(first, TextStringObject):
                    operands[0] = TextStringObject()
            elif operator == b_('"'):
                third = operands[2]
                if ignoreByteStringObject and not isinstance(third, TextStringObject):
                    operands[2] = TextStringObject()
            elif operator == b_("TJ"):
                pieces = operands[0]
                for pos in range(len(pieces)):
                    if ignoreByteStringObject and not isinstance(
                        pieces[pos], TextStringObject
                    ):
                        pieces[pos] = TextStringObject()

            # Track whether we are between "q" and "Q".
            if operator == b_("q"):
                inside_q = True
            elif operator == b_("Q"):
                inside_q = False

            if inside_q and operator in dropped_inside_q:
                continue
            if operator == b_("re"):
                continue
            kept.append((operands, operator))

        content.operations = kept
        page_ref[NameObject("/Contents")] = content
def addURI(self, pagenum, uri, rect, border=None):
    """
    Add an URI from a rectangular area to the specified page.
    This uses the basic structure of AddLink

    :param int pagenum: index of the page on which to place the URI action.
    :param str uri: URI of the resource to link to.
    :param rect: :class:`RectangleObject` or array of four
        integers specifying the clickable rectangular area
        ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``.
    :param border: if provided, an array describing border-drawing
        properties. See the PDF spec for details. When omitted, the
        border array ``[2, 2, 2]`` is written.

    REMOVED FIT/ZOOM ARG
    -John Mulligan
    """

    page_link = self.getObject(self._pages)[PA.KIDS][pagenum]
    page_ref = self.getObject(page_link)

    if border is not None:
        border_arr = [NameObject(n) for n in border[:3]]
        if len(border) == 4:
            # Optional fourth element: the dash pattern array.
            dash_pattern = ArrayObject([NameObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2)] * 3

    if isString(rect):
        # A pre-formatted "[ xLL yLL xUR yUR ]" string is written verbatim.
        rect = NameObject(rect)
    elif isinstance(rect, RectangleObject):
        pass
    else:
        rect = RectangleObject(rect)

    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    lnk = DictionaryObject()
    lnk.update(
        {
            # An annotation dictionary's /Type is "Annot"; "/Annots"
            # (PG.ANNOTS) is the key of the page's annotation array.
            NameObject("/Type"): NameObject("/Annot"),
            NameObject("/Subtype"): NameObject("/Link"),
            NameObject("/P"): page_link,
            NameObject("/Rect"): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject("/Border"): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._addObject(lnk)

    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])
def getPageLayout(self):
    """
    Return the ``/PageLayout`` entry of the document catalog.

    See :meth:`setPageLayout()` for a description of valid layouts.

    :return: Page layout currently being used.
    :rtype: str, None if not specified
    """
    catalog = self._root_object
    try:
        layout = catalog["/PageLayout"]
    except KeyError:
        # The catalog carries no layout entry.
        layout = None
    return layout
class ContentStream(DecodedStreamObject):
    """
    A page content stream parsed into a list of operations.

    ``operations`` holds ``(operands, operator)`` tuples in stream order.
    Inline images are stored as ``(image_dict, b"INLINE IMAGE")`` where
    ``image_dict`` has the keys ``"settings"`` and ``"data"``.
    """

    def __init__(self, stream, pdf):
        """
        :param stream: a stream object, or an array of stream objects whose
            data is concatenated before parsing. It is resolved with
            ``getObject()`` first.
        :param pdf: passed to ``readObject`` when reading inline image
            settings.
        """
        self.pdf = pdf
        self.operations = []
        # stream may be a StreamObject or an ArrayObject containing
        # multiple StreamObjects to be cat'd together.
        stream = stream.getObject()
        if isinstance(stream, ArrayObject):
            # Single join instead of repeated "+=" on bytes.
            data = b_("").join(b_(s.getObject().getData()) for s in stream)
            stream = BytesIO(b_(data))
        else:
            stream = BytesIO(b_(stream.getData()))
        self.__parseContentStream(stream)

    def __parseContentStream(self, stream):
        """
        Parse ``stream`` from its beginning and append to ``self.operations``.

        Operands are collected until an operator is read; the pair is then
        stored and the operand list reset. Parsing stops at end of stream or
        at a NUL byte.
        """
        stream.seek(0, 0)
        operands = []
        while True:
            peek = readNonWhitespace(stream)
            if peek == b_("") or ord_(peek) == 0:
                break
            stream.seek(-1, 1)
            if peek.isalpha() or peek == b_("'") or peek == b_('"'):
                operator = utils.readUntilRegex(
                    stream, NameObject.delimiterPattern, True
                )
                if operator == b_("BI"):
                    # begin inline image - a completely different parsing
                    # mechanism is required
                    assert operands == []
                    ii = self._readInlineImage(stream)
                    self.operations.append((ii, b_("INLINE IMAGE")))
                else:
                    self.operations.append((operands, operator))
                    operands = []
            elif peek == b_("%"):
                # If we encounter a comment in the content stream, we have to
                # handle it here. Typically, readObject will handle
                # encountering a comment -- but readObject assumes that
                # following the comment must be the object we're trying to
                # read. In this case, it could be an operator instead.
                #
                # b"" must terminate the loop too: read() keeps returning it
                # at end of stream, so a comment without a trailing newline
                # would otherwise spin forever.
                while peek not in (b_("\r"), b_("\n"), b_("")):
                    peek = stream.read(1)
            else:
                operands.append(readObject(stream, None))

    def _readInlineImage(self, stream):
        """
        Read an inline image, starting just after the ``BI`` operator.

        :return: dict with ``"settings"`` (the key/value pairs before ``ID``)
            and ``"data"`` (the bytes between ``ID`` and the closing ``EI``).
        :raises PdfReadError: if the stream ends before ``EI`` is found.
        """
        # first read the dictionary of settings.
        settings = DictionaryObject()
        while True:
            tok = readNonWhitespace(stream)
            stream.seek(-1, 1)
            if tok == b_("I"):
                # "ID" - begin of image data
                break
            key = readObject(stream, self.pdf)
            tok = readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, self.pdf)
            settings[key] = value
        # left at beginning of ID; the third byte read is the separator
        tmp = stream.read(3)
        assert tmp[:2] == b_("ID")
        data = BytesIO()
        # Read the inline image, while checking for EI (End Image) operator.
        while True:
            # Read 8 kB at a time and check if the chunk contains the E operator.
            buf = stream.read(8192)
            # We have reached the end of the stream, but haven't found the EI operator.
            if not buf:
                raise PdfReadError("Unexpected end of stream")
            loc = buf.find(b_("E"))

            if loc == -1:
                data.write(buf)
            else:
                # Write out everything before the E.
                data.write(buf[0:loc])

                # Seek back in the stream to read the E next.
                stream.seek(loc - len(buf), 1)
                tok = stream.read(1)
                # Check for End Image
                tok2 = stream.read(1)
                if tok2 == b_("I"):
                    # Data can contain EI, so check for the Q operator.
                    tok3 = stream.read(1)
                    info = tok + tok2
                    # We need to find whitespace between EI and Q.
                    has_q_whitespace = False
                    while tok3 in utils.WHITESPACES:
                        has_q_whitespace = True
                        info += tok3
                        tok3 = stream.read(1)
                    if tok3 == b_("Q") and has_q_whitespace:
                        stream.seek(-1, 1)
                        break
                    else:
                        # Not the real end marker: keep the bytes as data.
                        stream.seek(-1, 1)
                        data.write(info)
                else:
                    stream.seek(-1, 1)
                    data.write(tok)
        return {"settings": settings, "data": data.getvalue()}

    def _getData(self):
        """Serialize ``self.operations`` back into content stream bytes."""
        newdata = BytesIO()
        for operands, operator in self.operations:
            if operator == b_("INLINE IMAGE"):
                newdata.write(b_("BI"))
                dicttext = BytesIO()
                operands["settings"].writeToStream(dicttext, None)
                # Drop the first and last two bytes of the serialized
                # dictionary (its enclosing delimiters).
                newdata.write(dicttext.getvalue()[2:-2])
                newdata.write(b_("ID "))
                newdata.write(operands["data"])
                newdata.write(b_("EI"))
            else:
                for op in operands:
                    op.writeToStream(newdata, None)
                    newdata.write(b_(" "))
                newdata.write(b_(operator))
            newdata.write(b_("\n"))
        return newdata.getvalue()

    def _setData(self, value):
        """Parse ``value`` and append its operations to ``self.operations``."""
        self.__parseContentStream(BytesIO(b_(value)))

    # Reading ``_data`` re-serializes the operations; assigning parses bytes.
    _data = property(_getData, _setData)
-"""A pure-Python PDF library with an increasing number of capabilities.""" +"""This module is deprecated. Import from PyPDF2 or PyPDF2.errors directly.""" __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" -import math -import struct -import sys -import uuid +import math # noqa: F401 +import struct # noqa: F401 +import sys # noqa: F401 +import uuid # noqa: F401 from sys import version_info if version_info < (3, 0): @@ -48,24 +48,31 @@ if version_info < (3, 0): BytesIO = StringIO else: - from io import BytesIO - -import codecs -import warnings -from hashlib import md5 - -from PyPDF2.constants import CatalogAttributes as CA -from PyPDF2.constants import Core as CO -from PyPDF2.constants import PageAttributes as PG -from PyPDF2.constants import PagesAttributes as PA -from PyPDF2.constants import Ressources as RES -from PyPDF2.constants import StreamAttributes as SA -from PyPDF2.constants import TrailerKeys as TK -from PyPDF2.errors import PageSizeNotDefinedError, PdfReadError, PdfReadWarning + from io import BytesIO # noqa: F401 + +import codecs # noqa: F401 +import warnings # noqa: F401 +from hashlib import md5 # noqa: F401 + +from PyPDF2._page import * # noqa: F401 +from PyPDF2._reader import * # noqa: F401 +from PyPDF2._writer import * # noqa: F401 +from PyPDF2.constants import CatalogAttributes as CA # noqa: F401 +from PyPDF2.constants import Core as CO # noqa: F401 +from PyPDF2.constants import PageAttributes as PG # noqa: F401 +from PyPDF2.constants import PagesAttributes as PA # noqa: F401 +from PyPDF2.constants import Ressources as RES # noqa: F401 +from PyPDF2.constants import StreamAttributes as SA # noqa: F401 +from PyPDF2.constants import TrailerKeys as TK # noqa: F401 +from PyPDF2.errors import ( # noqa: F401 + PageSizeNotDefinedError, + PdfReadError, + PdfReadWarning, +) -from . import utils -from .generic import * -from .utils import ( +from . 
import utils # noqa: F401 +from .generic import * # noqa: F401 +from .utils import ( # noqa: F401 ConvertFunctionsToVirtualList, b_, formatWarning, @@ -76,3500 +83,3 @@ str_, u_, ) - - -class PdfFileWriter(object): - """ - This class supports writing PDF files out, given pages produced by another - class (typically :class:`PdfFileReader`). - """ - - def __init__(self): - self._header = b_("%PDF-1.3") - self._objects = [] # array of indirect objects - - # The root of our page tree node. - pages = DictionaryObject() - pages.update( - { - NameObject(PA.TYPE): NameObject("/Pages"), - NameObject(PA.COUNT): NumberObject(0), - NameObject(PA.KIDS): ArrayObject(), - } - ) - self._pages = self._addObject(pages) - - # info object - info = DictionaryObject() - info.update( - { - NameObject("/Producer"): createStringObject( - codecs.BOM_UTF16_BE + u_("PyPDF2").encode("utf-16be") - ) - } - ) - self._info = self._addObject(info) - - # root object - root = DictionaryObject() - root.update( - { - NameObject(PA.TYPE): NameObject(CO.CATALOG), - NameObject(CO.PAGES): self._pages, - } - ) - self._root = None - self._root_object = root - self.set_need_appearances_writer() - - def _addObject(self, obj): - self._objects.append(obj) - return IndirectObject(len(self._objects), 0, self) - - def getObject(self, ido): - if ido.pdf != self: - raise ValueError("pdf must be self") - return self._objects[ido.idnum - 1] - - def _addPage(self, page, action): - assert page[PA.TYPE] == CO.PAGE - page[NameObject(PA.PARENT)] = self._pages - page = self._addObject(page) - pages = self.getObject(self._pages) - action(pages[PA.KIDS], page) - pages[NameObject(PA.COUNT)] = NumberObject(pages[PA.COUNT] + 1) - - def set_need_appearances_writer(self): - # See 12.7.2 and 7.7.2 for more information: - # http://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf - try: - catalog = self._root_object - # get the AcroForm tree - if "/AcroForm" not in catalog: - self._root_object.update( - { - 
NameObject("/AcroForm"): IndirectObject( - len(self._objects), 0, self - ) - } - ) - - need_appearances = NameObject("/NeedAppearances") - self._root_object["/AcroForm"][need_appearances] = BooleanObject(True) - - except Exception as e: - logger.error("set_need_appearances_writer() catch : ", repr(e)) - - def addPage(self, page): - """ - Adds a page to this PDF file. The page is usually acquired from a - :class:`PdfFileReader` instance. - - :param PageObject page: The page to add to the document. Should be - an instance of :class:`PageObject` - """ - self._addPage(page, list.append) - - def insertPage(self, page, index=0): - """ - Insert a page in this PDF file. The page is usually acquired from a - :class:`PdfFileReader` instance. - - :param PageObject page: The page to add to the document. This - argument should be an instance of :class:`PageObject`. - :param int index: Position at which the page will be inserted. - """ - self._addPage(page, lambda l, p: l.insert(index, p)) - - def getPage(self, pageNumber): - """ - Retrieves a page by number from this PDF file. - - :param int pageNumber: The page number to retrieve - (pages begin at zero) - :return: the page at the index given by *pageNumber* - :rtype: :class:`PageObject` - """ - pages = self.getObject(self._pages) - # XXX: crude hack - return pages[PA.KIDS][pageNumber].getObject() - - def getNumPages(self): - """ - :return: the number of pages. - :rtype: int - """ - pages = self.getObject(self._pages) - return int(pages[NameObject("/Count")]) - - def addBlankPage(self, width=None, height=None): - """ - Appends a blank page to this PDF file and returns it. If no page size - is specified, use the size of the last page. - - :param float width: The width of the new page expressed in default user - space units. - :param float height: The height of the new page expressed in default - user space units. 
- :return: the newly appended page - :rtype: :class:`PageObject` - :raises PageSizeNotDefinedError: if width and height are not defined - and previous page does not exist. - """ - page = PageObject.createBlankPage(self, width, height) - self.addPage(page) - return page - - def insertBlankPage(self, width=None, height=None, index=0): - """ - Inserts a blank page to this PDF file and returns it. If no page size - is specified, use the size of the last page. - - :param float width: The width of the new page expressed in default user - space units. - :param float height: The height of the new page expressed in default - user space units. - :param int index: Position to add the page. - :return: the newly appended page - :rtype: :class:`PageObject` - :raises PageSizeNotDefinedError: if width and height are not defined - and previous page does not exist. - """ - if width is None or height is None and (self.getNumPages() - 1) >= index: - oldpage = self.getPage(index) - width = oldpage.mediaBox.getWidth() - height = oldpage.mediaBox.getHeight() - page = PageObject.createBlankPage(self, width, height) - self.insertPage(page, index) - return page - - def addJS(self, javascript): - """ - Add Javascript which will launch upon opening this PDF. - - :param str javascript: Your Javascript. - - >>> output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") - # Example: This will launch the print window when the PDF is opened. - """ - js = DictionaryObject() - js.update( - { - NameObject(PA.TYPE): NameObject("/Action"), - NameObject("/S"): NameObject("/JavaScript"), - NameObject("/JS"): NameObject("(%s)" % javascript), - } - ) - js_indirect_object = self._addObject(js) - - # We need a name for parameterized javascript in the pdf file, but it can be anything. 
- js_string_name = str(uuid.uuid4()) - - js_name_tree = DictionaryObject() - js_name_tree.update( - { - NameObject("/JavaScript"): DictionaryObject( - { - NameObject(CA.NAMES): ArrayObject( - [createStringObject(js_string_name), js_indirect_object] - ) - } - ) - } - ) - self._addObject(js_name_tree) - - self._root_object.update( - { - NameObject("/OpenAction"): js_indirect_object, - NameObject(CA.NAMES): js_name_tree, - } - ) - - def addAttachment(self, fname, fdata): - """ - Embed a file inside the PDF. - - :param str fname: The filename to display. - :param str fdata: The data in the file. - - Reference: - https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf - Section 7.11.3 - """ - # We need three entries: - # * The file's data - # * The /Filespec entry - # * The file's name, which goes in the Catalog - - # The entry for the file - """ Sample: - 8 0 obj - << - /Length 12 - /Type /EmbeddedFile - >> - stream - Hello world! - endstream - endobj - """ - file_entry = DecodedStreamObject() - file_entry.setData(fdata) - file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")}) - - # The Filespec entry - """ Sample: - 7 0 obj - << - /Type /Filespec - /F (hello.txt) - /EF << /F 8 0 R >> - >> - """ - ef_entry = DictionaryObject() - ef_entry.update({NameObject("/F"): file_entry}) - - filespec = DictionaryObject() - filespec.update( - { - NameObject(PA.TYPE): NameObject("/Filespec"), - NameObject("/F"): createStringObject( - fname - ), # Perhaps also try TextStringObject - NameObject("/EF"): ef_entry, - } - ) - - # Then create the entry for the root, as it needs a reference to the Filespec - """ Sample: - 1 0 obj - << - /Type /Catalog - /Outlines 2 0 R - /Pages 3 0 R - /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >> - >> - endobj - - """ - embeddedFilesNamesDictionary = DictionaryObject() - embeddedFilesNamesDictionary.update( - {NameObject(CA.NAMES): ArrayObject([createStringObject(fname), filespec])} - ) - - 
embeddedFilesDictionary = DictionaryObject() - embeddedFilesDictionary.update( - {NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary} - ) - # Update the root - self._root_object.update({NameObject(CA.NAMES): embeddedFilesDictionary}) - - def appendPagesFromReader(self, reader, after_page_append=None): - """ - Copy pages from reader to writer. Includes an optional callback parameter - which is invoked after pages are appended to the writer. - - :param reader: a PdfFileReader object from which to copy page - annotations to this writer object. The writer's annots - will then be updated - :callback after_page_append (function): Callback function that is invoked after - each page is appended to the writer. Callback signature: - :param writer_pageref (PDF page reference): Reference to the page - appended to the writer. - """ - # Get page count from writer and reader - reader_num_pages = reader.getNumPages() - writer_num_pages = self.getNumPages() - - # Copy pages from reader to writer - for rpagenum in range(0, reader_num_pages): - reader_page = reader.getPage(rpagenum) - self.addPage(reader_page) - writer_page = self.getPage(writer_num_pages + rpagenum) - # Trigger callback, pass writer page as parameter - if callable(after_page_append): - after_page_append(writer_page) - - def updatePageFormFieldValues(self, page, fields, flags=0): - """ - Update the form field values for a given page from a fields dictionary. - Copy field texts and values from fields to page. - If the field links to a parent object, add the information to the parent. - - :param page: Page reference from PDF writer where the annotations - and field data will be updated. - :param fields: a Python dictionary of field names (/T) and text - values (/V) - :param flags: An integer (0 to 7). The first bit sets ReadOnly, the - second bit sets Required, the third bit sets NoExport. See - PDF Reference Table 8.70 for details. 
- """ - # Iterate through pages, update field values - for j in range(0, len(page[PG.ANNOTS])): - writer_annot = page[PG.ANNOTS][j].getObject() - # retrieve parent field values, if present - writer_parent_annot = {} # fallback if it's not there - if PG.PARENT in writer_annot: - writer_parent_annot = writer_annot[PG.PARENT] - for field in fields: - if writer_annot.get("/T") == field: - writer_annot.update( - {NameObject("/V"): TextStringObject(fields[field])} - ) - if flags: - writer_annot.update({NameObject("/Ff"): NumberObject(flags)}) - elif writer_parent_annot.get("/T") == field: - writer_parent_annot.update( - {NameObject("/V"): TextStringObject(fields[field])} - ) - - def cloneReaderDocumentRoot(self, reader): - """ - Copy the reader document root to the writer. - - :param reader: PdfFileReader from the document root should be copied. - :callback after_page_append: - """ - self._root_object = reader.trailer[TK.ROOT] - - def cloneDocumentFromReader(self, reader, after_page_append=None): - """ - Create a copy (clone) of a document from a PDF file reader - - :param reader: PDF file reader instance from which the clone - should be created. - :callback after_page_append (function): Callback function that is invoked after - each page is appended to the writer. Signature includes a reference to the - appended page (delegates to appendPagesFromReader). Callback signature: - - :param writer_pageref (PDF page reference): Reference to the page just - appended to the document. - """ - self.cloneReaderDocumentRoot(reader) - self.appendPagesFromReader(reader, after_page_append) - - def encrypt(self, user_pwd, owner_pwd=None, use_128bit=True, permissions_flag=-1): - """ - Encrypt this PDF file with the PDF Standard encryption handler. - - :param str user_pwd: The "user password", which allows for opening - and reading the PDF file with the restrictions provided. - :param str owner_pwd: The "owner password", which allows for - opening the PDF files without any restrictions. 
By default, - the owner password is the same as the user password. - :param bool use_128bit: flag as to whether to use 128bit - encryption. When false, 40bit encryption will be used. By default, - this flag is on. - :param unsigned int permissions_flag: permissions as described in - TABLE 3.20 of the PDF 1.7 specification. A bit value of 1 means the - permission is grantend. Hence an integer value of -1 will set all - flags. - Bit position 3 is for printing, 4 is for modifying content, 5 and 6 - control annotations, 9 for form fields, 10 for extraction of - text and graphics. - """ - import random - import time - - if owner_pwd is None: - owner_pwd = user_pwd - if use_128bit: - V = 2 - rev = 3 - keylen = int(128 / 8) - else: - V = 1 - rev = 2 - keylen = int(40 / 8) - P = permissions_flag - O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen)) - ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest()) - ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest()) - self._ID = ArrayObject((ID_1, ID_2)) - if rev == 2: - U, key = _alg34(user_pwd, O, P, ID_1) - else: - assert rev == 3 - U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False) - encrypt = DictionaryObject() - encrypt[NameObject(SA.FILTER)] = NameObject("/Standard") - encrypt[NameObject("/V")] = NumberObject(V) - if V == 2: - encrypt[NameObject(SA.LENGTH)] = NumberObject(keylen * 8) - encrypt[NameObject("/R")] = NumberObject(rev) - encrypt[NameObject("/O")] = ByteStringObject(O) - encrypt[NameObject("/U")] = ByteStringObject(U) - encrypt[NameObject("/P")] = NumberObject(P) - self._encrypt = self._addObject(encrypt) - self._encrypt_key = key - - def write(self, stream): - """ - Writes the collection of pages added to this object out as a PDF file. - - :param stream: An object to write the file to. The object must support - the write method and the tell method, similar to a file object. 
- """ - if hasattr(stream, "mode") and "b" not in stream.mode: - warnings.warn( - "File <%s> to write to is not in binary mode. It may not be written to correctly." - % stream.name - ) - - if not self._root: - self._root = self._addObject(self._root_object) - - external_reference_map = {} - - # PDF objects sometimes have circular references to their /Page objects - # inside their object tree (for example, annotations). Those will be - # indirect references to objects that we've recreated in this PDF. To - # address this problem, PageObject's store their original object - # reference number, and we add it to the external reference map before - # we sweep for indirect references. This forces self-page-referencing - # trees to reference the correct new object location, rather than - # copying in a new copy of the page object. - for obj_index in range(len(self._objects)): - obj = self._objects[obj_index] - if isinstance(obj, PageObject) and obj.indirectRef is not None: - data = obj.indirectRef - if data.pdf not in external_reference_map: - external_reference_map[data.pdf] = {} - if data.generation not in external_reference_map[data.pdf]: - external_reference_map[data.pdf][data.generation] = {} - external_reference_map[data.pdf][data.generation][ - data.idnum - ] = IndirectObject(obj_index + 1, 0, self) - - self.stack = [] - self._sweepIndirectReferences(external_reference_map, self._root) - del self.stack - - object_positions = self._write_header(stream) - xref_location = self._write_xref_table(stream, object_positions) - self._write_trailer(stream) - stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))) # eof - - def _write_header(self, stream): - object_positions = [] - stream.write(self._header + b_("\n")) - stream.write(b_("%\xE2\xE3\xCF\xD3\n")) - for i in range(len(self._objects)): - obj = self._objects[i] - # If the obj is None we can't write anything - if obj is not None: - idnum = i + 1 - object_positions.append(stream.tell()) - 
stream.write(b_(str(idnum) + " 0 obj\n")) - key = None - if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum: - pack1 = struct.pack("` for details. - """ - page_ref = self.getObject(self._pages)[PA.KIDS][pagenum] - action = DictionaryObject() - zoom_args = [] - for a in args: - if a is not None: - zoom_args.append(NumberObject(a)) - else: - zoom_args.append(NullObject()) - dest = Destination( - NameObject("/" + title + " bookmark"), page_ref, NameObject(fit), *zoom_args - ) - dest_array = dest.getDestArray() - action.update( - {NameObject("/D"): dest_array, NameObject("/S"): NameObject("/GoTo")} - ) - action_ref = self._addObject(action) - - outline_ref = self.getOutlineRoot() - - if parent is None: - parent = outline_ref - - bookmark = TreeObject() - - bookmark.update( - { - NameObject("/A"): action_ref, - NameObject("/Title"): createStringObject(title), - } - ) - - if color is not None: - bookmark.update( - {NameObject("/C"): ArrayObject([FloatObject(c) for c in color])} - ) - - format = 0 - if italic: - format += 1 - if bold: - format += 2 - if format: - bookmark.update({NameObject("/F"): NumberObject(format)}) - - bookmark_ref = self._addObject(bookmark) - - parent = parent.getObject() - parent.addChild(bookmark_ref, self) - - return bookmark_ref - - def addNamedDestinationObject(self, dest): - dest_ref = self._addObject(dest) - - nd = self.getNamedDestRoot() - nd.extend([dest["/Title"], dest_ref]) - - return dest_ref - - def addNamedDestination(self, title, pagenum): - page_ref = self.getObject(self._pages)[PA.KIDS][pagenum] - dest = DictionaryObject() - dest.update( - { - NameObject("/D"): ArrayObject( - [page_ref, NameObject("/FitH"), NumberObject(826)] - ), - NameObject("/S"): NameObject("/GoTo"), - } - ) - - dest_ref = self._addObject(dest) - nd = self.getNamedDestRoot() - - nd.extend([title, dest_ref]) - - return dest_ref - - def removeLinks(self): - """ - Removes links and annotations from this output. 
- """ - pages = self.getObject(self._pages)[PA.KIDS] - for page in pages: - page_ref = self.getObject(page) - if PG.ANNOTS in page_ref: - del page_ref[PG.ANNOTS] - - def removeImages(self, ignoreByteStringObject=False): - """ - Removes images from this output. - - :param bool ignoreByteStringObject: optional parameter - to ignore ByteString Objects. - """ - pages = self.getObject(self._pages)[PA.KIDS] - jump_operators = [ - b_("cm"), - b_("w"), - b_("J"), - b_("j"), - b_("M"), - b_("d"), - b_("ri"), - b_("i"), - b_("gs"), - b_("W"), - b_("b"), - b_("s"), - b_("S"), - b_("f"), - b_("F"), - b_("n"), - b_("m"), - b_("l"), - b_("c"), - b_("v"), - b_("y"), - b_("h"), - b_("B"), - b_("Do"), - b_("sh"), - ] - for j in range(len(pages)): - page = pages[j] - page_ref = self.getObject(page) - content = page_ref["/Contents"].getObject() - if not isinstance(content, ContentStream): - content = ContentStream(content, page_ref) - - _operations = [] - seq_graphics = False - for operands, operator in content.operations: - if operator in [b_("Tj"), b_("'")]: - text = operands[0] - if ignoreByteStringObject: - if not isinstance(text, TextStringObject): - operands[0] = TextStringObject() - elif operator == b_('"'): - text = operands[2] - if ignoreByteStringObject and not isinstance( - text, TextStringObject - ): - operands[2] = TextStringObject() - elif operator == b_("TJ"): - for i in range(len(operands[0])): - if ignoreByteStringObject and not isinstance( - operands[0][i], TextStringObject - ): - operands[0][i] = TextStringObject() - - if operator == b_("q"): - seq_graphics = True - if operator == b_("Q"): - seq_graphics = False - if seq_graphics and operator in jump_operators: - continue - if operator == b_("re"): - continue - _operations.append((operands, operator)) - - content.operations = _operations - page_ref.__setitem__(NameObject("/Contents"), content) - - def removeText(self, ignoreByteStringObject=False): - """ - Removes text from this output. 
- - :param bool ignoreByteStringObject: optional parameter - to ignore ByteString Objects. - """ - pages = self.getObject(self._pages)[PA.KIDS] - for j in range(len(pages)): - page = pages[j] - page_ref = self.getObject(page) - content = page_ref["/Contents"].getObject() - if not isinstance(content, ContentStream): - content = ContentStream(content, page_ref) - for operands, operator in content.operations: - if operator in [b_("Tj"), b_("'")]: - text = operands[0] - if not ignoreByteStringObject: - if isinstance(text, TextStringObject): - operands[0] = TextStringObject() - else: - if isinstance(text, (TextStringObject, ByteStringObject)): - operands[0] = TextStringObject() - elif operator == b_('"'): - text = operands[2] - if not ignoreByteStringObject: - if isinstance(text, TextStringObject): - operands[2] = TextStringObject() - else: - if isinstance(text, (TextStringObject, ByteStringObject)): - operands[2] = TextStringObject() - elif operator == b_("TJ"): - for i in range(len(operands[0])): - if not ignoreByteStringObject: - if isinstance(operands[0][i], TextStringObject): - operands[0][i] = TextStringObject() - else: - if isinstance( - operands[0][i], (TextStringObject, ByteStringObject) - ): - operands[0][i] = TextStringObject() - - page_ref.__setitem__(NameObject("/Contents"), content) - - def addURI(self, pagenum, uri, rect, border=None): - """ - Add an URI from a rectangular area to the specified page. - This uses the basic structure of AddLink - - :param int pagenum: index of the page on which to place the URI action. - :param int uri: string -- uri of resource to link to. - :param rect: :class:`RectangleObject` or array of four - integers specifying the clickable rectangular area - ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``. - :param border: if provided, an array describing border-drawing - properties. See the PDF spec for details. No border will be - drawn if this argument is omitted. 
- - REMOVED FIT/ZOOM ARG - -John Mulligan - """ - - page_link = self.getObject(self._pages)[PA.KIDS][pagenum] - page_ref = self.getObject(page_link) - - if border is not None: - border_arr = [NameObject(n) for n in border[:3]] - if len(border) == 4: - dash_pattern = ArrayObject([NameObject(n) for n in border[3]]) - border_arr.append(dash_pattern) - else: - border_arr = [NumberObject(2)] * 3 - - if isString(rect): - rect = NameObject(rect) - elif isinstance(rect, RectangleObject): - pass - else: - rect = RectangleObject(rect) - - lnk2 = DictionaryObject() - lnk2.update( - { - NameObject("/S"): NameObject("/URI"), - NameObject("/URI"): TextStringObject(uri), - } - ) - lnk = DictionaryObject() - lnk.update( - { - NameObject("/Type"): NameObject(PG.ANNOTS), - NameObject("/Subtype"): NameObject("/Link"), - NameObject("/P"): page_link, - NameObject("/Rect"): rect, - NameObject("/H"): NameObject("/I"), - NameObject("/Border"): ArrayObject(border_arr), - NameObject("/A"): lnk2, - } - ) - lnk_ref = self._addObject(lnk) - - if PG.ANNOTS in page_ref: - page_ref[PG.ANNOTS].append(lnk_ref) - else: - page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref]) - - def addLink(self, pagenum, pagedest, rect, border=None, fit="/Fit", *args): - """ - Add an internal link from a rectangular area to the specified page. - - :param int pagenum: index of the page on which to place the link. - :param int pagedest: index of the page to which the link should go. - :param rect: :class:`RectangleObject` or array of four - integers specifying the clickable rectangular area - ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``. - :param border: if provided, an array describing border-drawing - properties. See the PDF spec for details. No border will be - drawn if this argument is omitted. - :param str fit: Page fit or 'zoom' option (see below). Additional arguments may need - to be supplied. Passing ``None`` will be read as a null value for that coordinate. - - .. 
list-table:: Valid ``zoom`` arguments (see Table 8.2 of the PDF 1.7 reference for details) - :widths: 50 200 - - * - /Fit - - No additional arguments - * - /XYZ - - [left] [top] [zoomFactor] - * - /FitH - - [top] - * - /FitV - - [left] - * - /FitR - - [left] [bottom] [right] [top] - * - /FitB - - No additional arguments - * - /FitBH - - [top] - * - /FitBV - - [left] - """ - - page_link = self.getObject(self._pages)[PA.KIDS][pagenum] - page_dest = self.getObject(self._pages)[PA.KIDS][ - pagedest - ] # TODO: switch for external link - page_ref = self.getObject(page_link) - - if border is not None: - border_arr = [NameObject(n) for n in border[:3]] - if len(border) == 4: - dash_pattern = ArrayObject([NameObject(n) for n in border[3]]) - border_arr.append(dash_pattern) - else: - border_arr = [NumberObject(0)] * 3 - - if isString(rect): - rect = NameObject(rect) - elif isinstance(rect, RectangleObject): - pass - else: - rect = RectangleObject(rect) - - zoom_args = [] - for a in args: - if a is not None: - zoom_args.append(NumberObject(a)) - else: - zoom_args.append(NullObject()) - dest = Destination( - NameObject("/LinkName"), page_dest, NameObject(fit), *zoom_args - ) # TODO: create a better name for the link - dest_array = dest.getDestArray() - - lnk = DictionaryObject() - lnk.update( - { - NameObject("/Type"): NameObject(PG.ANNOTS), - NameObject("/Subtype"): NameObject("/Link"), - NameObject("/P"): page_link, - NameObject("/Rect"): rect, - NameObject("/Border"): ArrayObject(border_arr), - NameObject("/Dest"): dest_array, - } - ) - lnk_ref = self._addObject(lnk) - - if PG.ANNOTS in page_ref: - page_ref[PG.ANNOTS].append(lnk_ref) - else: - page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref]) - - _valid_layouts = [ - "/NoLayout", - "/SinglePage", - "/OneColumn", - "/TwoColumnLeft", - "/TwoColumnRight", - "/TwoPageLeft", - "/TwoPageRight", - ] - - def getPageLayout(self): - """ - Get the page layout. - See :meth:`setPageLayout()` for a description of valid layouts. 
- - :return: Page layout currently being used. - :rtype: str, None if not specified - """ - try: - return self._root_object["/PageLayout"] - except KeyError: - return None - - def setPageLayout(self, layout): - """ - Set the page layout. - - :param str layout: The page layout to be used. - - .. list-table:: Valid ``layout`` arguments - :widths: 50 200 - - * - /NoLayout - - Layout explicitly not specified - * - /SinglePage - - Show one page at a time - * - /OneColumn - - Show one column at a time - * - /TwoColumnLeft - - Show pages in two columns, odd-numbered pages on the left - * - /TwoColumnRight - - Show pages in two columns, odd-numbered pages on the right - * - /TwoPageLeft - - Show two pages at a time, odd-numbered pages on the left - * - /TwoPageRight - - Show two pages at a time, odd-numbered pages on the right - """ - if not isinstance(layout, NameObject): - if layout not in self._valid_layouts: - warnings.warn( - "Layout should be one of: {}".format(", ".join(self._valid_layouts)) - ) - layout = NameObject(layout) - self._root_object.update({NameObject("/PageLayout"): layout}) - - pageLayout = property(getPageLayout, setPageLayout) - """Read and write property accessing the :meth:`getPageLayout()` - and :meth:`setPageLayout()` methods.""" - - _valid_modes = [ - "/UseNone", - "/UseOutlines", - "/UseThumbs", - "/FullScreen", - "/UseOC", - "/UseAttachments", - ] - - def getPageMode(self): - """ - Get the page mode. - See :meth:`setPageMode()` for a description - of valid modes. - - :return: Page mode currently being used. - :rtype: str, None if not specified. - """ - try: - return self._root_object["/PageMode"] - except KeyError: - return None - - def setPageMode(self, mode): - """ - Set the page mode. - - :param str mode: The page mode to use. - - .. 
list-table:: Valid ``mode`` arguments - :widths: 50 200 - - * - /UseNone - - Do not show outlines or thumbnails panels - * - /UseOutlines - - Show outlines (aka bookmarks) panel - * - /UseThumbs - - Show page thumbnails panel - * - /FullScreen - - Fullscreen view - * - /UseOC - - Show Optional Content Group (OCG) panel - * - /UseAttachments - - Show attachments panel - """ - if not isinstance(mode, NameObject): - if mode not in self._valid_modes: - warnings.warn( - "Mode should be one of: {}".format(", ".join(self._valid_modes)) - ) - mode = NameObject(mode) - self._root_object.update({NameObject("/PageMode"): mode}) - - pageMode = property(getPageMode, setPageMode) - """Read and write property accessing the :meth:`getPageMode()` - and :meth:`setPageMode()` methods.""" - - -class PdfFileReader(object): - """ - Initializes a PdfFileReader object. This operation can take some time, as - the PDF stream's cross-reference tables are read into memory. - - :param stream: A File object or an object that supports the standard read - and seek methods similar to a File object. Could also be a - string representing a path to a PDF file. - :param bool strict: Determines whether user should be warned of all - problems and also causes some correctable problems to be fatal. - Defaults to ``True``. - :param warndest: Destination for logging warnings (defaults to - ``sys.stderr``). - :param bool overwriteWarnings: Determines whether to override Python's - ``warnings.py`` module with a custom implementation (defaults to - ``True``). 
- """ - - def __init__(self, stream, strict=True, warndest=None, overwriteWarnings=True): - if overwriteWarnings: - # Have to dynamically override the default showwarning since there are no - # public methods that specify the 'file' parameter - def _showwarning( - message, category, filename, lineno, file=warndest, line=None - ): - if file is None: - file = sys.stderr - try: - # It is possible for sys.stderr to be defined as None, most commonly in the case that the script - # is being run vida pythonw.exe on Windows. In this case, just swallow the warning. - # See also https://docs.python.org/3/library/sys.html# sys.__stderr__ - if file is not None: - file.write( - formatWarning(message, category, filename, lineno, line) - ) - except IOError: - pass - - warnings.showwarning = _showwarning - self.strict = strict - self.flattenedPages = None - self.resolvedObjects = {} - self.xrefIndex = 0 - self._pageId2Num = None # map page IndirectRef number to Page Number - if hasattr(stream, "mode") and "b" not in stream.mode: - warnings.warn( - "PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", - PdfReadWarning, - ) - if isString(stream): - with open(stream, "rb") as fileobj: - stream = BytesIO(b_(fileobj.read())) - self.read(stream) - self.stream = stream - - self._override_encryption = False - - def getDocumentInfo(self): - """ - Retrieves the PDF file's document information dictionary, if it exists. - Note that some PDF files use metadata streams instead of docinfo - dictionaries, and these metadata streams will not be accessed by this - function. - - :return: the document information of this PDF file - :rtype: :class:`DocumentInformation` or ``None`` if none exists. 
- """ - if TK.INFO not in self.trailer: - return None - obj = self.trailer[TK.INFO] - retval = DocumentInformation() - retval.update(obj) - return retval - - @property - def documentInfo(self): - """Read-only property that accesses the :meth:`getDocumentInfo()` function.""" - return self.getDocumentInfo() - - def getXmpMetadata(self): - """ - Retrieves XMP (Extensible Metadata Platform) data from the PDF document - root. - - :return: a :class:`XmpInformation` - instance that can be used to access XMP metadata from the document. - :rtype: :class:`XmpInformation` or - ``None`` if no metadata was found on the document root. - """ - try: - self._override_encryption = True - return self.trailer[TK.ROOT].getXmpMetadata() - finally: - self._override_encryption = False - - @property - def xmpMetadata(self): - """ - Read-only property that accesses the - :meth:`getXmpMetadata()` function. - """ - return self.getXmpMetadata() - - def getNumPages(self): - """ - Calculates the number of pages in this PDF file. - - :return: number of pages - :rtype: int - :raises PdfReadError: if file is encrypted and restrictions prevent - this action. - """ - - # Flattened pages will not work on an Encrypted PDF; - # the PDF file's page count is used in this case. Otherwise, - # the original method (flattened page count) is used. - if self.isEncrypted: - try: - self._override_encryption = True - self.decrypt("") - return self.trailer[TK.ROOT]["/Pages"]["/Count"] - except Exception: - raise PdfReadError("File has not been decrypted") - finally: - self._override_encryption = False - else: - if self.flattenedPages is None: - self._flatten() - return len(self.flattenedPages) - - @property - def numPages(self): - """ - Read-only property that accesses the - :meth:`getNumPages()` function. - """ - return self.getNumPages() - - def getPage(self, pageNumber): - """ - Retrieves a page by number from this PDF file. 
- - :param int pageNumber: The page number to retrieve - (pages begin at zero) - :return: a :class:`PageObject` instance. - :rtype: :class:`PageObject` - """ - # ensure that we're not trying to access an encrypted PDF - # assert not self.trailer.has_key(TK.ENCRYPT) - if self.flattenedPages is None: - self._flatten() - return self.flattenedPages[pageNumber] - - @property - def namedDestinations(self): - """ - Read-only property that accesses the - :meth:`getNamedDestinations()` function. - """ - return self.getNamedDestinations() - - # A select group of relevant field attributes. For the complete list, - # see section 8.6.2 of the PDF 1.7 reference. - - def getFields(self, tree=None, retval=None, fileobj=None): - """ - Extracts field data if this PDF contains interactive form fields. - The *tree* and *retval* parameters are for recursive use. - - :param fileobj: A file object (usually a text file) to write - a report to on all interactive form fields found. - :return: A dictionary where each key is a field name, and each - value is a :class:`Field` object. By - default, the mapping name is used for keys. - :rtype: dict, or ``None`` if form data could not be located. 
- """ - field_attributes = { - "/FT": "Field Type", - PA.PARENT: "Parent", - "/T": "Field Name", - "/TU": "Alternate Field Name", - "/TM": "Mapping Name", - "/Ff": "Field Flags", - "/V": "Value", - "/DV": "Default Value", - } - if retval is None: - retval = {} - catalog = self.trailer[TK.ROOT] - # get the AcroForm tree - if "/AcroForm" in catalog: - tree = catalog["/AcroForm"] - else: - return None - if tree is None: - return retval - - self._checkKids(tree, retval, fileobj) - for attr in field_attributes: - if attr in tree: - # Tree is a field - self._buildField(tree, retval, fileobj, field_attributes) - break - - if "/Fields" in tree: - fields = tree["/Fields"] - for f in fields: - field = f.getObject() - self._buildField(field, retval, fileobj, field_attributes) - - return retval - - def _buildField(self, field, retval, fileobj, fieldAttributes): - self._checkKids(field, retval, fileobj) - try: - key = field["/TM"] - except KeyError: - try: - key = field["/T"] - except KeyError: - # Ignore no-name field for now - return - if fileobj: - self._writeField(fileobj, field, fieldAttributes) - fileobj.write("\n") - retval[key] = Field(field) - - def _checkKids(self, tree, retval, fileobj): - if PA.KIDS in tree: - # recurse down the tree - for kid in tree[PA.KIDS]: - self.getFields(kid.getObject(), retval, fileobj) - - def _writeField(self, fileobj, field, fieldAttributes): - order = ["/TM", "/T", "/FT", PA.PARENT, "/TU", "/Ff", "/V", "/DV"] - for attr in order: - attr_name = fieldAttributes[attr] - try: - if attr == "/FT": - # Make the field type value more clear - types = { - "/Btn": "Button", - "/Tx": "Text", - "/Ch": "Choice", - "/Sig": "Signature", - } - if field[attr] in types: - fileobj.write(attr_name + ": " + types[field[attr]] + "\n") - elif attr == PA.PARENT: - # Let's just write the name of the parent - try: - name = field[PA.PARENT]["/TM"] - except KeyError: - name = field[PA.PARENT]["/T"] - fileobj.write(attr_name + ": " + name + "\n") - else: - 
fileobj.write(attr_name + ": " + str(field[attr]) + "\n") - except KeyError: - # Field attribute is N/A or unknown, so don't write anything - pass - - def getFormTextFields(self): - """Retrieves form fields from the document with textual data (inputs, dropdowns)""" - # Retrieve document form fields - formfields = self.getFields() - if formfields is None: - return {} - return { - formfields[field]["/T"]: formfields[field].get("/V") - for field in formfields - if formfields[field].get("/FT") == "/Tx" - } - - def getNamedDestinations(self, tree=None, retval=None): - """ - Retrieves the named destinations present in the document. - - :return: a dictionary which maps names to - :class:`Destinations`. - :rtype: dict - """ - if retval is None: - retval = {} - catalog = self.trailer[TK.ROOT] - - # get the name tree - if CA.DESTS in catalog: - tree = catalog[CA.DESTS] - elif CA.NAMES in catalog: - names = catalog[CA.NAMES] - if CA.DESTS in names: - tree = names[CA.DESTS] - - if tree is None: - return retval - - if PA.KIDS in tree: - # recurse down the tree - for kid in tree[PA.KIDS]: - self.getNamedDestinations(kid.getObject(), retval) - - if CA.NAMES in tree: - names = tree[CA.NAMES] - for i in range(0, len(names), 2): - key = names[i].getObject() - val = names[i + 1].getObject() - if isinstance(val, DictionaryObject) and "/D" in val: - val = val["/D"] - dest = self._buildDestination(key, val) - if dest is not None: - retval[key] = dest - - return retval - - @property - def outlines(self): - """ - Read-only property that accesses the - :meth:`getOutlines()` function. - """ - return self.getOutlines() - - def getOutlines(self, node=None, outlines=None): - """ - Retrieves the document outline present in the document. - - :return: a nested list of :class:`Destinations`. 
- """ - if outlines is None: - outlines = [] - catalog = self.trailer[TK.ROOT] - - # get the outline dictionary and named destinations - if CO.OUTLINES in catalog: - try: - lines = catalog[CO.OUTLINES] - except PdfReadError: - # this occurs if the /Outlines object reference is incorrect - # for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf - # so continue to load the file without the Bookmarks - return outlines - - if "/First" in lines: - node = lines["/First"] - self._namedDests = self.getNamedDestinations() - - if node is None: - return outlines - - # see if there are any more outlines - while True: - outline = self._buildOutline(node) - if outline: - outlines.append(outline) - - # check for sub-outlines - if "/First" in node: - sub_outlines = [] - self.getOutlines(node["/First"], sub_outlines) - if sub_outlines: - outlines.append(sub_outlines) - - if "/Next" not in node: - break - node = node["/Next"] - - return outlines - - def _getPageNumberByIndirect(self, indirectRef): - """Generate _pageId2Num""" - if self._pageId2Num is None: - id2num = {} - for i, x in enumerate(self.pages): - id2num[x.indirectRef.idnum] = i - self._pageId2Num = id2num - - if isinstance(indirectRef, int): - idnum = indirectRef - else: - idnum = indirectRef.idnum - - ret = self._pageId2Num.get(idnum, -1) - return ret - - def getPageNumber(self, page): - """ - Retrieve page number of a given PageObject - - :param PageObject page: The page to get page number. Should be - an instance of :class:`PageObject` - :return: the page number or -1 if page not found - :rtype: int - """ - indirect_ref = page.indirectRef - ret = self._getPageNumberByIndirect(indirect_ref) - return ret - - def getDestinationPageNumber(self, destination): - """ - Retrieve page number of a given Destination object - - :param Destination destination: The destination to get page number. 
- Should be an instance of - :class:`Destination` - :return: the page number or -1 if page not found - :rtype: int - """ - indirect_ref = destination.page - ret = self._getPageNumberByIndirect(indirect_ref) - return ret - - def _buildDestination(self, title, array): - page, typ = array[0:2] - array = array[2:] - return Destination(title, page, typ, *array) - - def _buildOutline(self, node): - dest, title, outline = None, None, None - - if "/A" in node and "/Title" in node: - # Action, section 8.5 (only type GoTo supported) - title = node["/Title"] - action = node["/A"] - if action["/S"] == "/GoTo": - dest = action["/D"] - elif "/Dest" in node and "/Title" in node: - # Destination, section 8.2.1 - title = node["/Title"] - dest = node["/Dest"] - - # if destination found, then create outline - if dest: - if isinstance(dest, ArrayObject): - outline = self._buildDestination(title, dest) - elif isString(dest) and dest in self._namedDests: - outline = self._namedDests[dest] - outline[NameObject("/Title")] = title - else: - raise PdfReadError("Unexpected destination %r" % dest) - return outline - - @property - def pages(self): - """ - Read-only property that emulates a list based upon the - :meth:`getNumPages()` and - :meth:`getPage()` methods. - """ - return ConvertFunctionsToVirtualList(self.getNumPages, self.getPage) - - def getPageLayout(self): - """ - Get the page layout. - See :meth:`setPageLayout()` - for a description of valid layouts. - - :return: Page layout currently being used. - :rtype: ``str``, ``None`` if not specified - """ - try: - return self.trailer[TK.ROOT]["/PageLayout"] - except KeyError: - return None - - @property - def pageLayout(self): - """Read-only property accessing the - :meth:`getPageLayout()` method.""" - return self.getPageLayout() - - def getPageMode(self): - """ - Get the page mode. - See :meth:`setPageMode()` - for a description of valid modes. - - :return: Page mode currently being used. 
- :rtype: ``str``, ``None`` if not specified - """ - try: - return self.trailer[TK.ROOT]["/PageMode"] - except KeyError: - return None - - @property - def pageMode(self): - """Read-only property accessing the - :meth:`getPageMode()` method.""" - return self.getPageMode() - - def _flatten(self, pages=None, inherit=None, indirectRef=None): - inheritablePageAttributes = ( - NameObject(PG.RESOURCES), - NameObject(PG.MEDIABOX), - NameObject(PG.CROPBOX), - NameObject(PG.ROTATE), - ) - if inherit is None: - inherit = {} - if pages is None: - # Fix issue 327: set flattenedPages attribute only for - # decrypted file - catalog = self.trailer[TK.ROOT].getObject() - pages = catalog["/Pages"].getObject() - self.flattenedPages = [] - - t = "/Pages" - if PA.TYPE in pages: - t = pages[PA.TYPE] - - if t == "/Pages": - for attr in inheritablePageAttributes: - if attr in pages: - inherit[attr] = pages[attr] - for page in pages[PA.KIDS]: - addt = {} - if isinstance(page, IndirectObject): - addt["indirectRef"] = page - self._flatten(page.getObject(), inherit, **addt) - elif t == "/Page": - for attr, value in list(inherit.items()): - # if the page has it's own value, it does not inherit the - # parent's value: - if attr not in pages: - pages[attr] = value - page_obj = PageObject(self, indirectRef) - page_obj.update(pages) - self.flattenedPages.append(page_obj) - - def _getObjectFromStream(self, indirectReference): - # indirect reference to object in object stream - # read the entire object stream into memory - stmnum, idx = self.xref_objStm[indirectReference.idnum] - obj_stm = IndirectObject(stmnum, 0, self).getObject() - # This is an xref to a stream, so its type better be a stream - assert obj_stm["/Type"] == "/ObjStm" - # /N is the number of indirect objects in the stream - assert idx < obj_stm["/N"] - stream_data = BytesIO(b_(obj_stm.getData())) - for i in range(obj_stm["/N"]): - readNonWhitespace(stream_data) - stream_data.seek(-1, 1) - objnum = 
NumberObject.readFromStream(stream_data) - readNonWhitespace(stream_data) - stream_data.seek(-1, 1) - offset = NumberObject.readFromStream(stream_data) - readNonWhitespace(stream_data) - stream_data.seek(-1, 1) - if objnum != indirectReference.idnum: - # We're only interested in one object - continue - if self.strict and idx != i: - raise PdfReadError("Object is in wrong index.") - stream_data.seek(obj_stm["/First"] + offset, 0) - try: - obj = readObject(stream_data, self) - except PdfStreamError as e: - # Stream object cannot be read. Normally, a critical error, but - # Adobe Reader doesn't complain, so continue (in strict mode?) - e = sys.exc_info()[1] - warnings.warn( - "Invalid stream (index %d) within object %d %d: %s" - % (i, indirectReference.idnum, indirectReference.generation, e), - PdfReadWarning, - ) - - if self.strict: - raise PdfReadError("Can't read object stream: %s" % e) - # Replace with null. Hopefully it's nothing important. - obj = NullObject() - return obj - - if self.strict: - raise PdfReadError("This is a fatal error in strict mode.") - return NullObject() - - def getObject(self, indirectReference): - retval = self.cacheGetIndirectObject( - indirectReference.generation, indirectReference.idnum - ) - if retval is not None: - return retval - if ( - indirectReference.generation == 0 - and indirectReference.idnum in self.xref_objStm - ): - retval = self._getObjectFromStream(indirectReference) - elif ( - indirectReference.generation in self.xref - and indirectReference.idnum in self.xref[indirectReference.generation] - ): - start = self.xref[indirectReference.generation][indirectReference.idnum] - self.stream.seek(start, 0) - idnum, generation = self.readObjectHeader(self.stream) - if idnum != indirectReference.idnum and self.xrefIndex: - # Xref table probably had bad indexes due to not being zero-indexed - if self.strict: - raise PdfReadError( - "Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." 
- % ( - indirectReference.idnum, - indirectReference.generation, - idnum, - generation, - ) - ) - else: - pass # xref table is corrected in non-strict mode - elif idnum != indirectReference.idnum and self.strict: - # some other problem - raise PdfReadError( - "Expected object ID (%d %d) does not match actual (%d %d)." - % ( - indirectReference.idnum, - indirectReference.generation, - idnum, - generation, - ) - ) - if self.strict: - assert generation == indirectReference.generation - retval = readObject(self.stream, self) - - # override encryption is used for the /Encrypt dictionary - if not self._override_encryption and self.isEncrypted: - # if we don't have the encryption key: - if not hasattr(self, "_decryption_key"): - raise PdfReadError("file has not been decrypted") - # otherwise, decrypt here... - pack1 = struct.pack("= 3 - if self.strict and len(entry_sizes) > 3: - raise PdfReadError("Too many entry sizes: %s" % entry_sizes) - - def get_entry(i): - # Reads the correct number of bytes for each entry. See the - # discussion of the W parameter in PDF spec table 17. - if entry_sizes[i] > 0: - d = stream_data.read(entry_sizes[i]) - return convertToInt(d, entry_sizes[i]) - - # PDF Spec Table 17: A value of zero for an element in the - # W array indicates...the default value shall be used - if i == 0: - return 1 # First value defaults to 1 - else: - return 0 - - def used_before(num, generation): - # We move backwards through the xrefs, don't replace any. - return num in self.xref.get(generation, []) or num in self.xref_objStm - - # Iterate through each subsection - self._read_xref_subsections(idx_pairs, get_entry, used_before) - return xrefstream - - @staticmethod - def _get_xref_issues(stream, startxref): - """Returns an int which indicates an issue. 
0 means there is no issue.""" - stream.seek(startxref - 1, 0) # -1 to check character before - line = stream.read(1) - if line not in b_("\r\n \t"): - return 1 - line = stream.read(4) - if line != b_("xref"): - # not an xref so check if it is an XREF object - line = b_("") - while line in b_("0123456789 \t"): - line = stream.read(1) - if line == b_(""): - return 2 - line += stream.read(2) # 1 char already read, +2 to check "obj" - if line.lower() != b_("obj"): - return 3 - while stream.read(1) in b_(" \t\r\n"): - pass - line = stream.read(256) # check that it is xref obj - if b_("/xref") not in line.lower(): - return 4 - return 0 - - def _rebuild_xref_table(self, stream): - self.xref = {} - stream.seek(0, 0) - f_ = stream.read(-1) - import re - - for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"), f_): - idnum = int(m.group(1)) - generation = int(m.group(2)) - if generation not in self.xref: - self.xref[generation] = {} - self.xref[generation][idnum] = m.start(1) - trailer_pos = f_.rfind(b"trailer") - len(f_) + 7 - stream.seek(trailer_pos, 2) - # code below duplicated - readNonWhitespace(stream) - stream.seek(-1, 1) - new_trailer = readObject(stream, self) - for key, value in list(new_trailer.items()): - if key not in self.trailer: - self.trailer[key] = value - - def _read_xref_subsections(self, idx_pairs, getEntry, used_before): - last_end = 0 - for start, size in self._pairs(idx_pairs): - # The subsections must increase - assert start >= last_end - last_end = start + size - for num in range(start, start + size): - # The first entry is the type - xref_type = getEntry(0) - # The rest of the elements depend on the xref_type - if xref_type == 0: - # linked list of free objects - next_free_object = getEntry(1) # noqa: F841 - next_generation = getEntry(2) # noqa: F841 - elif xref_type == 1: - # objects that are in use but are not compressed - byte_offset = getEntry(1) - generation = getEntry(2) - if generation not in self.xref: - self.xref[generation] 
= {} - if not used_before(num, generation): - self.xref[generation][num] = byte_offset - elif xref_type == 2: - # compressed objects - objstr_num = getEntry(1) - obstr_idx = getEntry(2) - generation = 0 # PDF spec table 18, generation is 0 - if not used_before(num, generation): - self.xref_objStm[num] = (objstr_num, obstr_idx) - elif self.strict: - raise PdfReadError("Unknown xref type: %s" % xref_type) - - def _zeroXref(self, generation): - self.xref[generation] = { - k - self.xrefIndex: v for (k, v) in list(self.xref[generation].items()) - } - - def _pairs(self, array): - i = 0 - while True: - yield array[i], array[i + 1] - i += 2 - if (i + 1) >= len(array): - break - - def readNextEndLine(self, stream, limit_offset=0): - line_parts = [] - while True: - # Prevent infinite loops in malformed PDFs - if stream.tell() == 0 or stream.tell() == limit_offset: - raise PdfReadError("Could not read malformed PDF file") - x = stream.read(1) - if stream.tell() < 2: - raise PdfReadError("EOL marker not found") - stream.seek(-2, 1) - if x == b_("\n") or x == b_("\r"): ## \n = LF; \r = CR - crlf = False - while x == b_("\n") or x == b_("\r"): - x = stream.read(1) - if x == b_("\n") or x == b_("\r"): # account for CR+LF - stream.seek(-1, 1) - crlf = True - if stream.tell() < 2: - raise PdfReadError("EOL marker not found") - stream.seek(-2, 1) - stream.seek( - 2 if crlf else 1, 1 - ) # if using CR+LF, go back 2 bytes, else 1 - break - else: - line_parts.append(x) - line_parts.reverse() - return b"".join(line_parts) - - def decrypt(self, password): - """ - When using an encrypted / secured PDF file with the PDF Standard - encryption handler, this function will allow the file to be decrypted. - It checks the given password against the document's user password and - owner password, and then stores the resulting decryption key if either - password is correct. - - It does not matter which password was matched. 
Both passwords provide - the correct decryption key that will allow the document to be used with - this library. - - :param str password: The password to match. - :return: ``0`` if the password failed, ``1`` if the password matched the user - password, and ``2`` if the password matched the owner password. - :rtype: int - :raises NotImplementedError: if document uses an unsupported encryption - method. - """ - - self._override_encryption = True - try: - return self._decrypt(password) - finally: - self._override_encryption = False - - def decode_permissions(self, permissions_code): - # Takes the permissions as an integer, returns the allowed access - permissions = {} - permissions["print"] = permissions_code & (1 << 3 - 1) != 0 # bit 3 - permissions["modify"] = permissions_code & (1 << 4 - 1) != 0 # bit 4 - permissions["copy"] = permissions_code & (1 << 5 - 1) != 0 # bit 5 - permissions["annotations"] = permissions_code & (1 << 6 - 1) != 0 # bit 6 - permissions["forms"] = permissions_code & (1 << 9 - 1) != 0 # bit 9 - permissions["accessability"] = permissions_code & (1 << 10 - 1) != 0 # bit 10 - permissions["assemble"] = permissions_code & (1 << 11 - 1) != 0 # bit 11 - permissions["print_high_quality"] = ( - permissions_code & (1 << 12 - 1) != 0 - ) # bit 12 - return permissions - - def _decrypt(self, password): - # Decrypts data as per Section 3.5 (page 117) of PDF spec v1.7 - # "The security handler defines the use of encryption and decryption in - # the document, using the rules specified by the CF, StmF, and StrF entries" - encrypt = self.trailer[TK.ENCRYPT].getObject() - # /Encrypt Keys: - # Filter (name) : "name of the preferred security handler " - # V (number) : Algorithm Code - # Length (integer): Length of encryption key, in bits - # CF (dictionary) : Crypt filter - # StmF (name) : Name of the crypt filter that is used by default when decrypting streams - # StrF (name) : The name of the crypt filter that is used when decrypting all strings in the document 
- # R (number) : Standard security handler revision number - # U (string) : A 32-byte string, based on the user password - # P (integer) : Permissions allowed with user access - if encrypt["/Filter"] != "/Standard": - raise NotImplementedError( - "only Standard PDF encryption handler is available" - ) - if not (encrypt["/V"] in (1, 2)): - raise NotImplementedError( - "only algorithm code 1 and 2 are supported. This PDF uses code %s" - % encrypt["/V"] - ) - user_password, key = self._authenticateUserPassword(password) - if user_password: - self._decryption_key = key - return 1 - else: - rev = encrypt["/R"].getObject() - if rev == 2: - keylen = 5 - else: - keylen = encrypt[SA.LENGTH].getObject() // 8 - key = _alg33_1(password, rev, keylen) - real_O = encrypt["/O"].getObject() - if rev == 2: - userpass = utils.RC4_encrypt(key, real_O) - else: - val = real_O - for i in range(19, -1, -1): - new_key = b_("") - for l in range(len(key)): - new_key += b_(chr(utils.ord_(key[l]) ^ i)) - val = utils.RC4_encrypt(new_key, val) - userpass = val - owner_password, key = self._authenticateUserPassword(userpass) - if owner_password: - self._decryption_key = key - return 2 - return 0 - - def _authenticateUserPassword(self, password): - encrypt = self.trailer[TK.ENCRYPT].getObject() - rev = encrypt["/R"].getObject() - owner_entry = encrypt["/O"].getObject() - p_entry = encrypt["/P"].getObject() - if TK.ID in self.trailer: - id_entry = self.trailer[TK.ID].getObject() - else: - # Some documents may not have a /ID, use two empty - # byte strings instead. 
Solves - # https://github.com/mstamy2/PyPDF2/issues/608 - id_entry = ArrayObject([ByteStringObject(b""), ByteStringObject(b"")]) - id1_entry = id_entry[0].getObject() - real_U = encrypt["/U"].getObject().original_bytes - if rev == 2: - U, key = _alg34(password, owner_entry, p_entry, id1_entry) - elif rev >= 3: - U, key = _alg35( - password, - rev, - encrypt[SA.LENGTH].getObject() // 8, - owner_entry, - p_entry, - id1_entry, - encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject(), - ) - U, real_U = U[:16], real_U[:16] - return U == real_U, key - - def getIsEncrypted(self): - return TK.ENCRYPT in self.trailer - - @property - def isEncrypted(self): - """ - Read-only boolean property showing whether this PDF file is encrypted. - Note that this property, if true, will remain true even after the - :meth:`decrypt()` method is called. - """ - return self.getIsEncrypted() - - -def getRectangle(self, name, defaults): - retval = self.get(name) - if isinstance(retval, RectangleObject): - return retval - if retval is None: - for d in defaults: - retval = self.get(d) - if retval is not None: - break - if isinstance(retval, IndirectObject): - retval = self.pdf.getObject(retval) - retval = RectangleObject(retval) - setRectangle(self, name, retval) - return retval - - -def setRectangle(self, name, value): - if not isinstance(name, NameObject): - name = NameObject(name) - self[name] = value - - -def deleteRectangle(self, name): - del self[name] - - -def createRectangleAccessor(name, fallback): - return property( - lambda self: getRectangle(self, name, fallback), - lambda self, value: setRectangle(self, name, value), - lambda self: deleteRectangle(self, name), - ) - - -class PageObject(DictionaryObject): - """ - This class represents a single page within a PDF file. 
Typically this - object will be created by accessing the - :meth:`getPage()` method of the - :class:`PdfFileReader` class, but it is - also possible to create an empty page with the - :meth:`createBlankPage()` static method. - - :param pdf: PDF file the page belongs to. - :param indirectRef: Stores the original indirect reference to - this object in its source PDF - """ - - def __init__(self, pdf=None, indirectRef=None): - DictionaryObject.__init__(self) - self.pdf = pdf - self.indirectRef = indirectRef - - @staticmethod - def createBlankPage(pdf=None, width=None, height=None): - """ - Returns a new blank page. - If ``width`` or ``height`` is ``None``, try to get the page size - from the last page of *pdf*. - - :param pdf: PDF file the page belongs to - :param float width: The width of the new page expressed in default user - space units. - :param float height: The height of the new page expressed in default user - space units. - :return: the new blank page: - :rtype: :class:`PageObject` - :raises PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains - no page - """ - page = PageObject(pdf) - - # Creates a new page (cf PDF Reference 7.7.3.3) - page.__setitem__(NameObject("/Type"), NameObject("/Page")) - page.__setitem__(NameObject("/Parent"), NullObject()) - page.__setitem__(NameObject(PG.RESOURCES), DictionaryObject()) - if width is None or height is None: - if pdf is not None and pdf.getNumPages() > 0: - lastpage = pdf.getPage(pdf.getNumPages() - 1) - width = lastpage.mediaBox.getWidth() - height = lastpage.mediaBox.getHeight() - else: - raise PageSizeNotDefinedError() - page.__setitem__( - NameObject(PG.MEDIABOX), RectangleObject([0, 0, width, height]) - ) - - return page - - def rotateClockwise(self, angle): - """ - Rotates a page clockwise by increments of 90 degrees. - - :param int angle: Angle to rotate the page. Must be an increment - of 90 deg. 
- """ - if angle % 90 != 0: - raise ValueError("Rotation angle must be a multiple of 90") - self._rotate(angle) - return self - - def rotateCounterClockwise(self, angle): - """ - Rotates a page counter-clockwise by increments of 90 degrees. - - :param int angle: Angle to rotate the page. Must be an increment - of 90 deg. - """ - if angle % 90 != 0: - raise ValueError("Rotation angle must be a multiple of 90") - self._rotate(-angle) - return self - - def _rotate(self, angle): - rotate_obj = self.get("/Rotate", 0) - current_angle = ( - rotate_obj if isinstance(rotate_obj, int) else rotate_obj.getObject() - ) - self[NameObject("/Rotate")] = NumberObject(current_angle + angle) - - @staticmethod - def _mergeResources(res1, res2, resource): - new_res = DictionaryObject() - new_res.update(res1.get(resource, DictionaryObject()).getObject()) - page2res = res2.get(resource, DictionaryObject()).getObject() - rename_res = {} - for key in list(page2res.keys()): - if key in new_res and new_res.raw_get(key) != page2res.raw_get(key): - newname = NameObject(key + str(uuid.uuid4())) - rename_res[key] = newname - new_res[newname] = page2res[key] - elif key not in new_res: - new_res[key] = page2res.raw_get(key) - return new_res, rename_res - - @staticmethod - def _contentStreamRename(stream, rename, pdf): - if not rename: - return stream - stream = ContentStream(stream, pdf) - for operands, _operator in stream.operations: - if isinstance(operands, list): - for i in range(len(operands)): - op = operands[i] - if isinstance(op, NameObject): - operands[i] = rename.get(op, op) - elif isinstance(operands, dict): - for i in operands: - op = operands[i] - if isinstance(op, NameObject): - operands[i] = rename.get(op, op) - else: - raise KeyError("type of operands is %s" % type(operands)) - return stream - - @staticmethod - def _pushPopGS(contents, pdf): - # adds a graphics state "push" and "pop" to the beginning and end - # of a content stream. 
This isolates it from changes such as - # transformation matricies. - stream = ContentStream(contents, pdf) - stream.operations.insert(0, [[], "q"]) - stream.operations.append([[], "Q"]) - return stream - - @staticmethod - def _addTransformationMatrix(contents, pdf, ctm): - # adds transformation matrix at the beginning of the given - # contents stream. - a, b, c, d, e, f = ctm - contents = ContentStream(contents, pdf) - contents.operations.insert( - 0, - [ - [ - FloatObject(a), - FloatObject(b), - FloatObject(c), - FloatObject(d), - FloatObject(e), - FloatObject(f), - ], - " cm", - ], - ) - return contents - - def getContents(self): - """ - Accesses the page contents. - - :return: the ``/Contents`` object, or ``None`` if it doesn't exist. - ``/Contents`` is optional, as described in PDF Reference 7.7.3.3 - """ - if "/Contents" in self: - return self["/Contents"].getObject() - else: - return None - - def mergePage(self, page2): - """ - Merges the content streams of two pages into one. Resource references - (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc - of this page are not altered. The parameter page's content stream will - be added to the end of this page's content stream, meaning that it will - be drawn after, or "on top" of this page. - - :param PageObject page2: The page to be merged into this one. Should be - an instance of :class:`PageObject`. - """ - self._mergePage(page2) - - def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False): - # First we work on merging the resource dictionaries. This allows us - # to find out what symbols in the content streams we might need to - # rename. 
- - new_resources = DictionaryObject() - rename = {} - original_resources = self[PG.RESOURCES].getObject() - page2resources = page2[PG.RESOURCES].getObject() - new_annots = ArrayObject() - - for page in (self, page2): - if PG.ANNOTS in page: - annots = page[PG.ANNOTS] - if isinstance(annots, ArrayObject): - for ref in annots: - new_annots.append(ref) - - for res in ( - "/ExtGState", - RES.FONT, - RES.XOBJECT, - RES.COLOR_SPACE, - "/Pattern", - "/Shading", - "/Properties", - ): - new, newrename = PageObject._mergeResources( - original_resources, page2resources, res - ) - if new: - new_resources[NameObject(res)] = new - rename.update(newrename) - - # Combine /ProcSet sets. - new_resources[NameObject(RES.PROCSET)] = ArrayObject( - frozenset( - original_resources.get(RES.PROCSET, ArrayObject()).getObject() - ).union( - frozenset(page2resources.get(RES.PROCSET, ArrayObject()).getObject()) - ) - ) - - new_content_array = ArrayObject() - - original_content = self.getContents() - if original_content is not None: - new_content_array.append(PageObject._pushPopGS(original_content, self.pdf)) - - page2content = page2.getContents() - if page2content is not None: - page2content = ContentStream(page2content, self.pdf) - page2content.operations.insert( - 0, - [ - map( - FloatObject, - [ - page2.trimBox.getLowerLeft_x(), - page2.trimBox.getLowerLeft_y(), - page2.trimBox.getWidth(), - page2.trimBox.getHeight(), - ], - ), - "re", - ], - ) - page2content.operations.insert(1, [[], "W"]) - page2content.operations.insert(2, [[], "n"]) - if page2transformation is not None: - page2content = page2transformation(page2content) - page2content = PageObject._contentStreamRename( - page2content, rename, self.pdf - ) - page2content = PageObject._pushPopGS(page2content, self.pdf) - new_content_array.append(page2content) - - # if expanding the page to fit a new page, calculate the new media box size - if expand: - corners1 = [ - self.mediaBox.getLowerLeft_x().as_numeric(), - 
self.mediaBox.getLowerLeft_y().as_numeric(), - self.mediaBox.getUpperRight_x().as_numeric(), - self.mediaBox.getUpperRight_y().as_numeric(), - ] - corners2 = [ - page2.mediaBox.getLowerLeft_x().as_numeric(), - page2.mediaBox.getLowerLeft_y().as_numeric(), - page2.mediaBox.getUpperLeft_x().as_numeric(), - page2.mediaBox.getUpperLeft_y().as_numeric(), - page2.mediaBox.getUpperRight_x().as_numeric(), - page2.mediaBox.getUpperRight_y().as_numeric(), - page2.mediaBox.getLowerRight_x().as_numeric(), - page2.mediaBox.getLowerRight_y().as_numeric(), - ] - if ctm is not None: - ctm = [float(x) for x in ctm] - new_x = [ - ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] - for i in range(0, 8, 2) - ] - new_y = [ - ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] - for i in range(0, 8, 2) - ] - else: - new_x = corners2[0:8:2] - new_y = corners2[1:8:2] - lowerleft = [min(new_x), min(new_y)] - upperright = [max(new_x), max(new_y)] - lowerleft = [min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])] - upperright = [ - max(corners1[2], upperright[0]), - max(corners1[3], upperright[1]), - ] - - self.mediaBox.setLowerLeft(lowerleft) - self.mediaBox.setUpperRight(upperright) - - self[NameObject("/Contents")] = ContentStream(new_content_array, self.pdf) - self[NameObject(PG.RESOURCES)] = new_resources - self[NameObject(PG.ANNOTS)] = new_annots - - def mergeTransformedPage(self, page2, ctm, expand=False): - """ - This is similar to mergePage, but a transformation matrix is - applied to the merged stream. - - :param PageObject page2: The page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param tuple ctm: a 6-element tuple containing the operands of the - transformation matrix - :param bool expand: Whether the page should be expanded to fit the dimensions - of the page to be merged. 
- """ - self._mergePage( - page2, - lambda page2Content: PageObject._addTransformationMatrix( - page2Content, page2.pdf, ctm - ), - ctm, - expand, - ) - - def mergeScaledPage(self, page2, scale, expand=False): - """ - This is similar to mergePage, but the stream to be merged is scaled - by appling a transformation matrix. - - :param PageObject page2: The page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float scale: The scaling factor - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. - """ - # CTM to scale : [ sx 0 0 sy 0 0 ] - return self.mergeTransformedPage(page2, [scale, 0, 0, scale, 0, 0], expand) - - def mergeRotatedPage(self, page2, rotation, expand=False): - """ - This is similar to mergePage, but the stream to be merged is rotated - by appling a transformation matrix. - - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float rotation: The angle of the rotation, in degrees - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. - """ - rotation = math.radians(rotation) - return self.mergeTransformedPage( - page2, - [ - math.cos(rotation), - math.sin(rotation), - -math.sin(rotation), - math.cos(rotation), - 0, - 0, - ], - expand, - ) - - def mergeTranslatedPage(self, page2, tx, ty, expand=False): - """ - This is similar to mergePage, but the stream to be merged is translated - by appling a transformation matrix. - - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float tx: The translation on X axis - :param float ty: The translation on Y axis - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. 
- """ - return self.mergeTransformedPage(page2, [1, 0, 0, 1, tx, ty], expand) - - def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False): - """ - This is similar to mergePage, but the stream to be merged is rotated - and translated by appling a transformation matrix. - - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float tx: The translation on X axis - :param float ty: The translation on Y axis - :param float rotation: The angle of the rotation, in degrees - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. - """ - - translation = [[1, 0, 0], [0, 1, 0], [-tx, -ty, 1]] - rotation = math.radians(rotation) - rotating = [ - [math.cos(rotation), math.sin(rotation), 0], - [-math.sin(rotation), math.cos(rotation), 0], - [0, 0, 1], - ] - rtranslation = [[1, 0, 0], [0, 1, 0], [tx, ty, 1]] - ctm = utils.matrixMultiply(translation, rotating) - ctm = utils.matrixMultiply(ctm, rtranslation) - - return self.mergeTransformedPage( - page2, - [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], - expand, - ) - - def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False): - """ - This is similar to mergePage, but the stream to be merged is rotated - and scaled by appling a transformation matrix. - - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float rotation: The angle of the rotation, in degrees - :param float scale: The scaling factor - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. 
- """ - rotation = math.radians(rotation) - rotating = [ - [math.cos(rotation), math.sin(rotation), 0], - [-math.sin(rotation), math.cos(rotation), 0], - [0, 0, 1], - ] - scaling = [[scale, 0, 0], [0, scale, 0], [0, 0, 1]] - ctm = utils.matrixMultiply(rotating, scaling) - - return self.mergeTransformedPage( - page2, - [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], - expand, - ) - - def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False): - """ - This is similar to mergePage, but the stream to be merged is translated - and scaled by appling a transformation matrix. - - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float scale: The scaling factor - :param float tx: The translation on X axis - :param float ty: The translation on Y axis - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. - """ - - translation = [[1, 0, 0], [0, 1, 0], [tx, ty, 1]] - scaling = [[scale, 0, 0], [0, scale, 0], [0, 0, 1]] - ctm = utils.matrixMultiply(scaling, translation) - - return self.mergeTransformedPage( - page2, - [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], - expand, - ) - - def mergeRotatedScaledTranslatedPage( - self, page2, rotation, scale, tx, ty, expand=False - ): - """ - This is similar to mergePage, but the stream to be merged is translated, - rotated and scaled by appling a transformation matrix. - - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float tx: The translation on X axis - :param float ty: The translation on Y axis - :param float rotation: The angle of the rotation, in degrees - :param float scale: The scaling factor - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. 
- """ - translation = [[1, 0, 0], [0, 1, 0], [tx, ty, 1]] - rotation = math.radians(rotation) - rotating = [ - [math.cos(rotation), math.sin(rotation), 0], - [-math.sin(rotation), math.cos(rotation), 0], - [0, 0, 1], - ] - scaling = [[scale, 0, 0], [0, scale, 0], [0, 0, 1]] - ctm = utils.matrixMultiply(rotating, scaling) - ctm = utils.matrixMultiply(ctm, translation) - - return self.mergeTransformedPage( - page2, - [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], - expand, - ) - - def addTransformation(self, ctm): - """ - Applies a transformation matrix to the page. - - :param tuple ctm: A 6-element tuple containing the operands of the - transformation matrix. - """ - original_content = self.getContents() - if original_content is not None: - new_content = PageObject._addTransformationMatrix( - original_content, self.pdf, ctm - ) - new_content = PageObject._pushPopGS(new_content, self.pdf) - self[NameObject("/Contents")] = new_content - - def scale(self, sx, sy): - """ - Scales a page by the given factors by appling a transformation - matrix to its content and updating the page size. - - :param float sx: The scaling factor on horizontal axis. - :param float sy: The scaling factor on vertical axis. 
- """ - self.addTransformation([sx, 0, 0, sy, 0, 0]) - self.mediaBox = RectangleObject( - [ - float(self.mediaBox.getLowerLeft_x()) * sx, - float(self.mediaBox.getLowerLeft_y()) * sy, - float(self.mediaBox.getUpperRight_x()) * sx, - float(self.mediaBox.getUpperRight_y()) * sy, - ] - ) - if "/VP" in self: - viewport = self["/VP"] - if isinstance(viewport, ArrayObject): - bbox = viewport[0]["/BBox"] - else: - bbox = viewport["/BBox"] - scaled_bbox = RectangleObject( - [ - float(bbox[0]) * sx, - float(bbox[1]) * sy, - float(bbox[2]) * sx, - float(bbox[3]) * sy, - ] - ) - if isinstance(viewport, ArrayObject): - self[NameObject("/VP")][NumberObject(0)][ - NameObject("/BBox") - ] = scaled_bbox - else: - self[NameObject("/VP")][NameObject("/BBox")] = scaled_bbox - - def scaleBy(self, factor): - """ - Scales a page by the given factor by appling a transformation - matrix to its content and updating the page size. - - :param float factor: The scaling factor (for both X and Y axis). - """ - self.scale(factor, factor) - - def scaleTo(self, width, height): - """ - Scales a page to the specified dimentions by appling a - transformation matrix to its content and updating the page size. - - :param float width: The new width. - :param float height: The new heigth. - """ - sx = width / float( - self.mediaBox.getUpperRight_x() - self.mediaBox.getLowerLeft_x() - ) - sy = height / float( - self.mediaBox.getUpperRight_y() - self.mediaBox.getLowerLeft_y() - ) - self.scale(sx, sy) - - def compressContentStreams(self): - """ - Compresses the size of this page by joining all content streams and - applying a FlateDecode filter. - - However, it is possible that this function will perform no action if - content stream compression becomes "automatic" for some reason. 
- """ - content = self.getContents() - if content is not None: - if not isinstance(content, ContentStream): - content = ContentStream(content, self.pdf) - self[NameObject("/Contents")] = content.flateEncode() - - def extractText(self, Tj_sep="", TJ_sep=""): - """ - Locate all text drawing commands, in the order they are provided in the - content stream, and extract the text. This works well for some PDF - files, but poorly for others, depending on the generator used. This will - be refined in the future. Do not rely on the order of text coming out of - this function, as it will change if this function is made more - sophisticated. - - :return: a unicode string object. - """ - text = u_("") - content = self["/Contents"].getObject() - if not isinstance(content, ContentStream): - content = ContentStream(content, self.pdf) - # Note: we check all strings are TextStringObjects. ByteStringObjects - # are strings where the byte->string encoding was unknown, so adding - # them to the text here would be gibberish. 
- for operands, operator in content.operations: - if operator == b_("Tj"): - _text = operands[0] - if isinstance(_text, TextStringObject): - text += Tj_sep - text += _text - text += "\n" - elif operator == b_("T*"): - text += "\n" - elif operator == b_("'"): - text += "\n" - _text = operands[0] - if isinstance(_text, TextStringObject): - text += operands[0] - elif operator == b_('"'): - _text = operands[2] - if isinstance(_text, TextStringObject): - text += "\n" - text += _text - elif operator == b_("TJ"): - for i in operands[0]: - if isinstance(i, TextStringObject): - text += TJ_sep - text += i - elif isinstance(i, NumberObject): - # a positive value decreases and the negative value increases - # space - if int(i) < 0: - if len(text) == 0 or text[-1] != " ": - text += " " - else: - if len(text) > 1 and text[-1] == " ": - text = text[:-1] - text += "\n" - return text - - mediaBox = createRectangleAccessor(PG.MEDIABOX, ()) - """ - A :class:`RectangleObject`, expressed in default user space units, - defining the boundaries of the physical medium on which the page is - intended to be displayed or printed. - """ - - cropBox = createRectangleAccessor("/CropBox", (PG.MEDIABOX,)) - """ - A :class:`RectangleObject`, expressed in default user space units, - defining the visible region of default user space. When the page is - displayed or printed, its contents are to be clipped (cropped) to this - rectangle and then imposed on the output medium in some - implementation-defined manner. Default value: same as :attr:`mediaBox`. - """ - - bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", PG.MEDIABOX)) - """ - A :class:`RectangleObject`, expressed in default user space units, - defining the region to which the contents of the page should be clipped - when output in a production enviroment. 
- """ - - trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", PG.MEDIABOX)) - """ - A :class:`RectangleObject`, expressed in default user space units, - defining the intended dimensions of the finished page after trimming. - """ - - artBox = createRectangleAccessor("/ArtBox", ("/CropBox", PG.MEDIABOX)) - """ - A :class:`RectangleObject`, expressed in default user space units, - defining the extent of the page's meaningful content as intended by the - page's creator. - """ - - -class ContentStream(DecodedStreamObject): - def __init__(self, stream, pdf): - self.pdf = pdf - self.operations = [] - # stream may be a StreamObject or an ArrayObject containing - # multiple StreamObjects to be cat'd together. - stream = stream.getObject() - if isinstance(stream, ArrayObject): - data = b_("") - for s in stream: - data += b_(s.getObject().getData()) - stream = BytesIO(b_(data)) - else: - stream = BytesIO(b_(stream.getData())) - self.__parseContentStream(stream) - - def __parseContentStream(self, stream): - # file("f:\\tmp.txt", "w").write(stream.read()) - stream.seek(0, 0) - operands = [] - while True: - peek = readNonWhitespace(stream) - if peek == b_("") or ord_(peek) == 0: - break - stream.seek(-1, 1) - if peek.isalpha() or peek == b_("'") or peek == b_('"'): - operator = utils.readUntilRegex( - stream, NameObject.delimiterPattern, True - ) - if operator == b_("BI"): - # begin inline image - a completely different parsing - # mechanism is required, of course... thanks buddy... - assert operands == [] - ii = self._readInlineImage(stream) - self.operations.append((ii, b_("INLINE IMAGE"))) - else: - self.operations.append((operands, operator)) - operands = [] - elif peek == b_("%"): - # If we encounter a comment in the content stream, we have to - # handle it here. Typically, readObject will handle - # encountering a comment -- but readObject assumes that - # following the comment must be the object we're trying to - # read. 
In this case, it could be an operator instead. - while peek not in (b_("\r"), b_("\n")): - peek = stream.read(1) - else: - operands.append(readObject(stream, None)) - - def _readInlineImage(self, stream): - # begin reading just after the "BI" - begin image - # first read the dictionary of settings. - settings = DictionaryObject() - while True: - tok = readNonWhitespace(stream) - stream.seek(-1, 1) - if tok == b_("I"): - # "ID" - begin of image data - break - key = readObject(stream, self.pdf) - tok = readNonWhitespace(stream) - stream.seek(-1, 1) - value = readObject(stream, self.pdf) - settings[key] = value - # left at beginning of ID - tmp = stream.read(3) - assert tmp[:2] == b_("ID") - data = BytesIO() - # Read the inline image, while checking for EI (End Image) operator. - while True: - # Read 8 kB at a time and check if the chunk contains the E operator. - buf = stream.read(8192) - # We have reached the end of the stream, but haven't found the EI operator. - if not buf: - raise PdfReadError("Unexpected end of stream") - loc = buf.find(b_("E")) - - if loc == -1: - data.write(buf) - else: - # Write out everything before the E. - data.write(buf[0:loc]) - - # Seek back in the stream to read the E next. - stream.seek(loc - len(buf), 1) - tok = stream.read(1) - # Check for End Image - tok2 = stream.read(1) - if tok2 == b_("I"): - # Data can contain EI, so check for the Q operator. - tok3 = stream.read(1) - info = tok + tok2 - # We need to find whitespace between EI and Q. 
- has_q_whitespace = False - while tok3 in utils.WHITESPACES: - has_q_whitespace = True - info += tok3 - tok3 = stream.read(1) - if tok3 == b_("Q") and has_q_whitespace: - stream.seek(-1, 1) - break - else: - stream.seek(-1, 1) - data.write(info) - else: - stream.seek(-1, 1) - data.write(tok) - return {"settings": settings, "data": data.getvalue()} - - def _getData(self): - newdata = BytesIO() - for operands, operator in self.operations: - if operator == b_("INLINE IMAGE"): - newdata.write(b_("BI")) - dicttext = BytesIO() - operands["settings"].writeToStream(dicttext, None) - newdata.write(dicttext.getvalue()[2:-2]) - newdata.write(b_("ID ")) - newdata.write(operands["data"]) - newdata.write(b_("EI")) - else: - for op in operands: - op.writeToStream(newdata, None) - newdata.write(b_(" ")) - newdata.write(b_(operator)) - newdata.write(b_("\n")) - return newdata.getvalue() - - def _setData(self, value): - self.__parseContentStream(BytesIO(b_(value))) - - _data = property(_getData, _setData) - - -class DocumentInformation(DictionaryObject): - """ - A class representing the basic document metadata provided in a PDF File. - This class is accessible through - :meth:`.getDocumentInfo()` - - All text properties of the document metadata have - *two* properties, eg. author and author_raw. The non-raw property will - always return a ``TextStringObject``, making it ideal for a case where - the metadata is being displayed. The raw property can sometimes return - a ``ByteStringObject``, if PyPDF2 was unable to decode the string's - text encoding; this requires additional safety in the caller and - therefore is not as commonly accessed. - """ - - def __init__(self): - DictionaryObject.__init__(self) - - def getText(self, key): - retval = self.get(key, None) - if isinstance(retval, TextStringObject): - return retval - return None - - @property - def title(self): - """Read-only property accessing the document's **title**. 
- Returns a unicode string (``TextStringObject``) or ``None`` - if the title is not specified.""" - return ( - self.getText("/Title") or self.get("/Title").getObject() - if self.get("/Title") - else None - ) - - @property - def title_raw(self): - """The "raw" version of title; can return a ``ByteStringObject``.""" - return self.get("/Title") - - @property - def author(self): - """Read-only property accessing the document's **author**. - Returns a unicode string (``TextStringObject``) or ``None`` - if the author is not specified.""" - return self.getText("/Author") - - @property - def author_raw(self): - """The "raw" version of author; can return a ``ByteStringObject``.""" - return self.get("/Author") - - @property - def subject(self): - """Read-only property accessing the document's **subject**. - Returns a unicode string (``TextStringObject``) or ``None`` - if the subject is not specified.""" - return self.getText("/Subject") - - @property - def subject_raw(self): - """The "raw" version of subject; can return a ``ByteStringObject``.""" - return self.get("/Subject") - - @property - def creator(self): - """Read-only property accessing the document's **creator**. If the - document was converted to PDF from another format, this is the name of the - application (e.g. OpenOffice) that created the original document from - which it was converted. Returns a unicode string (``TextStringObject``) - or ``None`` if the creator is not specified.""" - return self.getText("/Creator") - - @property - def creator_raw(self): - """The "raw" version of creator; can return a ``ByteStringObject``.""" - return self.get("/Creator") - - @property - def producer(self): - """Read-only property accessing the document's **producer**. - If the document was converted to PDF from another format, this is - the name of the application (for example, OSX Quartz) that converted - it to PDF. 
Returns a unicode string (``TextStringObject``) - or ``None`` if the producer is not specified.""" - return self.getText("/Producer") - - @property - def producer_raw(self): - """The "raw" version of producer; can return a ``ByteStringObject``.""" - return self.get("/Producer") - - -def convertToInt(d, size): - if size > 8: - raise PdfReadError("invalid size in convertToInt") - d = b_("\x00\x00\x00\x00\x00\x00\x00\x00") + b_(d) - d = d[-8:] - return struct.unpack(">q", d)[0] - - -# ref: pdf1.8 spec section 3.5.2 algorithm 3.2 -_encryption_padding = ( - b_("\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56") - + b_("\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c") - + b_("\xa9\xfe\x64\x53\x69\x7a") -) - - -# Implementation of algorithm 3.2 of the PDF standard security handler, -# section 3.5.2 of the PDF 1.6 reference. -def _alg32( - password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True -): - # 1. Pad or truncate the password string to exactly 32 bytes. If the - # password string is more than 32 bytes long, use only its first 32 bytes; - # if it is less than 32 bytes long, pad it by appending the required number - # of additional bytes from the beginning of the padding string - # (_encryption_padding). - password = b_((str_(password) + str_(_encryption_padding))[:32]) - # 2. Initialize the MD5 hash function and pass the result of step 1 as - # input to this function. - m = md5(password) - # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash - # function. - m.update(owner_entry.original_bytes) - # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass - # these bytes to the MD5 hash function, low-order byte first. - p_entry = struct.pack("= 3 and not metadata_encrypt: - m.update(b_("\xff\xff\xff\xff")) - # 7. Finish the hash. - md5_hash = m.digest() - # 8. 
(Revision 3 or greater) Do the following 50 times: Take the output - # from the previous MD5 hash and pass the first n bytes of the output as - # input into a new MD5 hash, where n is the number of bytes of the - # encryption key as defined by the value of the encryption dictionary's - # /Length entry. - if rev >= 3: - for _ in range(50): - md5_hash = md5(md5_hash[:keylen]).digest() - # 9. Set the encryption key to the first n bytes of the output from the - # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or - # greater, depends on the value of the encryption dictionary's /Length - # entry. - return md5_hash[:keylen] - - -# Implementation of algorithm 3.3 of the PDF standard security handler, -# section 3.5.2 of the PDF 1.6 reference. -def _alg33(owner_pwd, user_pwd, rev, keylen): - # steps 1 - 4 - key = _alg33_1(owner_pwd, rev, keylen) - # 5. Pad or truncate the user password string as described in step 1 of - # algorithm 3.2. - user_pwd = b_((user_pwd + str_(_encryption_padding))[:32]) - # 6. Encrypt the result of step 5, using an RC4 encryption function with - # the encryption key obtained in step 4. - val = utils.RC4_encrypt(key, user_pwd) - # 7. (Revision 3 or greater) Do the following 19 times: Take the output - # from the previous invocation of the RC4 function and pass it as input to - # a new invocation of the function; use an encryption key generated by - # taking each byte of the encryption key obtained in step 4 and performing - # an XOR operation between that byte and the single-byte value of the - # iteration counter (from 1 to 19). - if rev >= 3: - for i in range(1, 20): - new_key = "" - for l in range(len(key)): - new_key += chr(ord_(key[l]) ^ i) - val = utils.RC4_encrypt(new_key, val) - # 8. Store the output from the final invocation of the RC4 as the value of - # the /O entry in the encryption dictionary. - return val - - -# Steps 1-4 of algorithm 3.3 -def _alg33_1(password, rev, keylen): - # 1. 
Pad or truncate the owner password string as described in step 1 of - # algorithm 3.2. If there is no owner password, use the user password - # instead. - password = b_((password + str_(_encryption_padding))[:32]) - # 2. Initialize the MD5 hash function and pass the result of step 1 as - # input to this function. - m = md5(password) - # 3. (Revision 3 or greater) Do the following 50 times: Take the output - # from the previous MD5 hash and pass it as input into a new MD5 hash. - md5_hash = m.digest() - if rev >= 3: - for _ in range(50): - md5_hash = md5(md5_hash).digest() - # 4. Create an RC4 encryption key using the first n bytes of the output - # from the final MD5 hash, where n is always 5 for revision 2 but, for - # revision 3 or greater, depends on the value of the encryption - # dictionary's /Length entry. - key = md5_hash[:keylen] - return key - - -# Implementation of algorithm 3.4 of the PDF standard security handler, -# section 3.5.2 of the PDF 1.6 reference. -def _alg34(password, owner_entry, p_entry, id1_entry): - # 1. Create an encryption key based on the user password string, as - # described in algorithm 3.2. - key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry) - # 2. Encrypt the 32-byte padding string shown in step 1 of algorithm 3.2, - # using an RC4 encryption function with the encryption key from the - # preceding step. - U = utils.RC4_encrypt(key, _encryption_padding) - # 3. Store the result of step 2 as the value of the /U entry in the - # encryption dictionary. - return U, key - - -# Implementation of algorithm 3.4 of the PDF standard security handler, -# section 3.5.2 of the PDF 1.6 reference. -def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt): - # 1. Create an encryption key based on the user password string, as - # described in Algorithm 3.2. - key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) - # 2. 
Initialize the MD5 hash function and pass the 32-byte padding string - # shown in step 1 of Algorithm 3.2 as input to this function. - m = md5() - m.update(_encryption_padding) - # 3. Pass the first element of the file's file identifier array (the value - # of the ID entry in the document's trailer dictionary; see Table 3.13 on - # page 73) to the hash function and finish the hash. (See implementation - # note 25 in Appendix H.) - m.update(id1_entry.original_bytes) - md5_hash = m.digest() - # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption - # function with the encryption key from step 1. - val = utils.RC4_encrypt(key, md5_hash) - # 5. Do the following 19 times: Take the output from the previous - # invocation of the RC4 function and pass it as input to a new invocation - # of the function; use an encryption key generated by taking each byte of - # the original encryption key (obtained in step 2) and performing an XOR - # operation between that byte and the single-byte value of the iteration - # counter (from 1 to 19). - for i in range(1, 20): - new_key = b_("") - for k in key: - new_key += b_(chr(ord_(k) ^ i)) - val = utils.RC4_encrypt(new_key, val) - # 6. Append 16 bytes of arbitrary padding to the output from the final - # invocation of the RC4 function and store the 32-byte result as the value - # of the U entry in the encryption dictionary. - # (implementator note: I don't know what "arbitrary padding" is supposed to - # mean, so I have used null bytes. 
This seems to match a few other - # people's implementations) - return val + (b_("\x00") * 16), key diff --git a/Tests/test_basic_features.py b/Tests/test_basic_features.py index 7504772bb..6931aacf3 100644 --- a/Tests/test_basic_features.py +++ b/Tests/test_basic_features.py @@ -3,8 +3,8 @@ import pytest from PyPDF2 import PdfFileReader, PdfFileWriter +from PyPDF2._reader import convertToInt from PyPDF2.errors import PdfReadError -from PyPDF2.pdf import convertToInt TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) diff --git a/Tests/test_page.py b/Tests/test_page.py index 8c4edcd14..d547ea91f 100644 --- a/Tests/test_page.py +++ b/Tests/test_page.py @@ -104,6 +104,9 @@ def test_page_properties(): assert page.trimBox == RectangleObject([0, 0, 612, 792]) assert page.artBox == RectangleObject([0, 0, 612, 792]) + page.bleedBox = RectangleObject([0, 1, 100, 101]) + assert page.bleedBox == RectangleObject([0, 1, 100, 101]) + def test_page_rotation_non90(): reader = PdfFileReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf")) diff --git a/Tests/test_workflows.py b/Tests/test_workflows.py index f607d1059..05a839609 100644 --- a/Tests/test_workflows.py +++ b/Tests/test_workflows.py @@ -75,7 +75,7 @@ def test_decrypt(): reader = PdfFileReader(inputfile) assert reader.isEncrypted == True reader.decrypt("openpassword") - assert reader.getNumPages() == 1 + assert reader.numPages == 1 assert reader.isEncrypted == True metadict = reader.getDocumentInfo() assert dict(metadict) == { From b4f43a83ab518b8f1249fee88ea4e38c361bae0d Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 28 Apr 2022 13:38:30 +0200 Subject: [PATCH 2/2] Copyright notices --- PyPDF2/_page.py | 31 +++++++++++++++++++++++++++++++ PyPDF2/_reader.py | 31 +++++++++++++++++++++++++++++++ PyPDF2/_security.py | 31 +++++++++++++++++++++++++++++++ PyPDF2/_writer.py | 31 +++++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+) diff --git a/PyPDF2/_page.py 
b/PyPDF2/_page.py index 0be1528a3..7bd158353 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1,3 +1,34 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + import math import uuid diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index d73977039..b3c50f387 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1,3 +1,34 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + import struct import sys import warnings diff --git a/PyPDF2/_security.py b/PyPDF2/_security.py index d493b4991..ae83b938b 100644 --- a/PyPDF2/_security.py +++ b/PyPDF2/_security.py @@ -1,3 +1,34 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + """Anything related to encryption / decryption.""" import struct diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index c9f9aec63..d5d0a6f45 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -1,3 +1,34 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + import codecs import logging import struct