Merge branch 'main' into bugs

py-pdf · Apr 16, 2022 · 73b2bba · 73b2bba
2 parents 820798f + d5a5eea
commit 73b2bba
Show file tree

Hide file tree

Showing 27 changed files with 581 additions and 239 deletions.
diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml
@@ -40,7 +40,7 @@ jobs:
         pip install .
     - name: Test with flake8
       run: |
-        flake8 . --ignore=E203,W503,W504,E,F403,F405
+        flake8 . --ignore=E203,W503,W504,E,F403,F405 --exclude build
       if: matrix.python-version != '2.7'
     - name: Test with pytest
       run: |

diff --git a/Makefile b/Makefile
@@ -14,7 +14,7 @@ clean:
 	rm -rf Tests/__pycache__ PyPDF2/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf PyPDF2.egg-info PyPDF2_pdfLocation.txt
 
 test:
-	pytest Tests --cov --cov-report term-missing -vv --cov-report html
+	pytest Tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30
 
 mutation-test:
 	mutmut run

diff --git a/PyPDF2/__init__.py b/PyPDF2/__init__.py
@@ -1,7 +1,7 @@
-from .pdf import PdfFileReader, PdfFileWriter
+from ._version import __version__
 from .merger import PdfFileMerger
 from .pagerange import PageRange, parse_filename_page_ranges
-from ._version import __version__
+from .pdf import PdfFileReader, PdfFileWriter
 
 __all__ = [
     "__version__",

diff --git a/PyPDF2/constants.py b/PyPDF2/constants.py
@@ -0,0 +1,186 @@
+"""
+See Portable Document Format Reference Manual, 1993. ISBN 0-201-62628-4.
+
+See https://ia802202.us.archive.org/8/items/pdfy-0vt8s-egqFwDl7L2/PDF%20Reference%201.0.pdf
+
+PDF Reference, third edition, Version 1.4, 2001. ISBN 0-201-75839-3.
+
+PDF Reference, sixth edition, Version 1.7, 2006.
+"""
+
+
+class PagesAttributes:
+    """Page Attributes, Table 6.2, Page 52"""
+
+    TYPE = "/Type"  # name, required; must be /Pages
+    KIDS = "/Kids"  # array, required; List of indirect references
+    COUNT = "/Count"  # integer, required; the number of all nodes under this node
+    PARENT = "/Parent"  # dictionary, required; indirect reference to pages object
+
+
+class PageAttributes:
+    """Page attributes, Table 6.3, Page 53"""
+
+    TYPE = "/Type"  # name, required; must be /Page
+    MEDIABOX = "/MediaBox"  # array, required; rectangle specifying page size
+    PARENT = "/Parent"  # dictionary, required; a pages object
+    RESOURCES = "/Resources"  # dictionary, required if there are any
+    CONTENTS = "/Contents"  # stream or array, optional
+    CROPBOX = "/CropBox"  # array, optional; rectangle
+    ROTATE = "/Rotate"  # integer, optional; page rotation in degrees
+    THUMB = "/Thumb"  # stream, optional; indirect reference to image of the page
+    ANNOTS = "/Annots"  # array, optional; an array of annotations
+
+
+class Ressources:
+    PROCSET = "/ProcSet"  # Chapter 6.8.1
+    FONT = "/Font"  # Chapter 6.8.2
+    # encoding
+    # font descriptors : 6.8.4
+    COLOR_SPACE = "/ColorSpace"  # Chapter 6.8.5
+    XOBJECT = "/XObject"  # Chapter 6.8.6
+
+
+class StreamAttributes:
+    """Table 4.2"""
+
+    LENGTH = "/Length"  # integer, required
+    FILTER = "/Filter"  # name or array of names, optional
+    DECODE_PARMS = "/DecodeParms"  # variable, optional -- the spelling "DecodeParams" is wrong
+
+
+class FilterTypes:
+    """
+    Table 4.3 of the 1.4 Manual
+
+    Page 354 of the 1.7 Manual
+    """
+
+    ASCII_HEX_DECODE = "/ASCIIHexDecode"  # abbreviation: AHx
+    ASCII_85_DECODE = "/ASCII85Decode"  # abbreviation: A85
+    LZW_DECODE = "/LZWDecode"  # abbreviation: LZW
+    FLATE_DECODE = "/FlateDecode"  # abbreviation: Fl, PDF 1.2
+    RUN_LENGTH_DECODE = "/RunLengthDecode"  # abbreviation: RL
+    CCITT_FAX_DECODE = "/CCITTFaxDecode"  # abbreviation: CCF
+    DCT_DECODE = "/DCTDecode"  # abbreviation: DCT
+
+
+class FilterTypeAbbreviations:
+    """
+    Table 4.44 of the 1.7 Manual (page 353ff)
+    """
+
+    AHx = "/AHx"
+    A85 = "/A85"
+    LZW = "/LZW"
+    FL = "/Fl"  # FlateDecode
+    RL = "/RL"
+    CCF = "/CCF"
+    DCT = "/DCT"
+
+
+class LzwFilterParameters:
+    """Table 4.4"""
+
+    PREDICTOR = "/Predictor"  # integer
+    COLUMNS = "/Columns"  # integer
+    COLORS = "/Colors"  # integer
+    BITS_PER_COMPONENT = "/BitsPerComponent"  # integer
+    EARLY_CHANGE = "/EarlyChange"  # integer
+
+
+class CcittFaxDecodeParameters:
+    """Table 4.5"""
+
+    K = "/K"  # integer
+    END_OF_LINE = "/EndOfLine"  # boolean
+    ENCODED_BYTE_ALIGN = "/EncodedByteAlign"  # boolean
+    COLUMNS = "/Columns"  # integer
+    ROWS = "/Rows"  # integer
+    END_OF_BLOCK = "/EndOfBlock"  # boolean
+    BLACK_IS_1 = "/BlackIs1"  # boolean
+    DAMAGED_ROWS_BEFORE_ERROR = "/DamagedRowsBeforeError"  # integer
+
+
+class ImageAttributes:
+    """Table 6.20."""
+
+    TYPE = "/Type"  # name, required; must be /XObject
+    SUBTYPE = "/Subtype"  # name, required; must be /Image
+    NAME = "/Name"  # name, required
+    WIDTH = "/Width"  # integer, required
+    HEIGHT = "/Height"  # integer, required
+    BITS_PER_COMPONENT = "/BitsPerComponent"  # integer, required
+    COLOR_SPACE = "/ColorSpace"  # name, required
+    DECODE = "/Decode"  # array, optional
+    INTERPOLATE = "/Interpolate"  # boolean, optional
+    IMAGE_MASK = "/ImageMask"  # boolean, optional
+
+
+class ColorSpaces:
+    DEVICE_RGB = "/DeviceRGB"
+    DEVICE_CMYK = "/DeviceCMYK"
+    DEVICE_GRAY = "/DeviceGray"
+
+
+class TypArguments:
+    """Table 8.2 of the PDF 1.7 reference"""
+
+    LEFT = "/Left"
+    RIGHT = "/Right"
+    BOTTOM = "/Bottom"
+    TOP = "/Top"
+
+
+class TypFitArguments:
+    """Table 8.2 of the PDF 1.7 reference"""
+
+    FIT = "/Fit"
+    FIT_V = "/FitV"
+    FIT_BV = "/FitBV"
+    FIT_B = "/FitB"
+    FIT_H = "/FitH"
+    FIT_BH = "/FitBH"
+    FIT_R = "/FitR"
+
+
+class PageLayouts:
+    """Page 84, PDF 1.4 reference"""
+
+    SINGLE_PAGE = "/SinglePage"
+    ONE_COLUMN = "/OneColumn"
+    TWO_COLUMN_LEFT = "/TwoColumnLeft"
+    TWO_COLUMN_RIGHT = "/TwoColumnRight"
+
+
+class GraphicsStateParameters:
+    """Table 4.8 of the 1.7 reference"""
+
+    TYPE = "/Type"  # name, optional
+    LW = "/LW"  # number, optional
+    # TODO: Many more!
+    FONT = "/Font"  # array, optional
+    S_MASK = "/SMask"  # dictionary or name, optional
+
+
+class CatalogDictionary:
+    """Table 3.25 in the 1.7 reference"""
+
+    TYPE = "/Type"  # name, required; must be /Catalog
+    # TODO: Many more!
+
+
+PDF_KEYS = [
+    PagesAttributes,
+    PageAttributes,
+    Ressources,
+    ImageAttributes,
+    StreamAttributes,
+    FilterTypes,
+    LzwFilterParameters,
+    TypArguments,
+    TypFitArguments,
+    PageLayouts,
+    GraphicsStateParameters,
+    CatalogDictionary,
+]
diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
@@ -31,13 +31,23 @@
 __author_email__ = "[email protected]"
 
 import math
+from sys import version_info
+
+from PyPDF2.constants import CcittFaxDecodeParameters as CCITT
+from PyPDF2.constants import ColorSpaces
+from PyPDF2.constants import FilterTypeAbbreviations as FTA
+from PyPDF2.constants import FilterTypes as FT
+from PyPDF2.constants import ImageAttributes as IA
+from PyPDF2.constants import LzwFilterParameters as LZW
+from PyPDF2.constants import StreamAttributes as SA
 
 from .utils import PdfReadError, ord_, paethPredictor
-from sys import version_info
+
 if version_info < ( 3, 0 ):
     from cStringIO import StringIO
 else:
     from io import StringIO
+
 import struct
 
 try:
@@ -110,13 +120,13 @@ def decode(data, decodeParms):
         predictor = 1
         if decodeParms:
             try:
-                predictor = decodeParms.get("/Predictor", 1)
+                predictor = decodeParms.get(LZW.PREDICTOR, 1)
             except AttributeError:
                 pass    # usually an array with a null object was read
 
         # predictor 1 == no predictor
         if predictor != 1:
-            columns = decodeParms["/Columns"]
+            columns = decodeParms[LZW.COLUMNS]
             # PNG prediction:
             if predictor >= 10 and predictor <= 15:
                 output = StringIO()
@@ -261,7 +271,7 @@ def decode(self):
             return baos
 
     @staticmethod
-    def decode(data,decodeParams=None):
+    def decode(data, decodeParms=None):
         return LZWDecode.decoder(data).decode()
 
 
@@ -363,7 +373,7 @@ def decode(data, decodeParms=None, height=0):
             else:
                 CCITTgroup = 3
 
-        width = decodeParms["/Columns"]
+        width = decodeParms[CCITT.COLUMNS]
         imgSize = len(data)
         tiff_header_struct = '<2shlh' + 'hhll' * 8 + 'h'
         tiffHeader = struct.pack(tiff_header_struct,
@@ -388,7 +398,7 @@ def decode(data, decodeParms=None, height=0):
 
 def decodeStreamData(stream):
     from .generic import NameObject
-    filters = stream.get("/Filter", ())
+    filters = stream.get(SA.FILTER, ())
 
     if len(filters) and not isinstance(filters[0], NameObject):
         # we have a single filter instance
@@ -397,24 +407,24 @@ def decodeStreamData(stream):
     # If there is not data to decode we should not try to decode the data.
     if data:
         for filterType in filters:
-            if filterType == "/FlateDecode" or filterType == "/Fl":
-                data = FlateDecode.decode(data, stream.get("/DecodeParms"))
-            elif filterType == "/ASCIIHexDecode" or filterType == "/AHx":
+            if filterType == FT.FLATE_DECODE or filterType == FTA.FL:
+                data = FlateDecode.decode(data, stream.get(SA.DECODE_PARMS))
+            elif filterType == FT.ASCII_HEX_DECODE or filterType == FTA.AHx:
                 data = ASCIIHexDecode.decode(data)
-            elif filterType == "/LZWDecode" or filterType == "/LZW":
-                data = LZWDecode.decode(data, stream.get("/DecodeParms"))
-            elif filterType == "/ASCII85Decode" or filterType == "/A85":
+            elif filterType == FT.LZW_DECODE or filterType == FTA.LZW:
+                data = LZWDecode.decode(data, stream.get(SA.DECODE_PARMS))
+            elif filterType == FT.ASCII_85_DECODE or filterType == FTA.A85:
                 data = ASCII85Decode.decode(data)
-            elif filterType == "/DCTDecode":
+            elif filterType == FT.DCT_DECODE:
                 data = DCTDecode.decode(data)
             elif filterType == "/JPXDecode":
                 data = JPXDecode.decode(data)
-            elif filterType == "/CCITTFaxDecode":
-                height = stream.get("/Height", ())
-                data = CCITTFaxDecode.decode(data, stream.get("/DecodeParms"), height)
+            elif filterType == FT.CCITT_FAX_DECODE:
+                height = stream.get(IA.HEIGHT, ())
+                data = CCITTFaxDecode.decode(data, stream.get(SA.DECODE_PARMS), height)
             elif filterType == "/Crypt":
-                decodeParams = stream.get("/DecodeParams", {})
-                if "/Name" not in decodeParams and "/Type" not in decodeParams:
+                decodeParms = stream.get(SA.DECODE_PARMS, {})
+                if "/Name" not in decodeParms and "/Type" not in decodeParms:
                     pass
                 else:
                     raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet")
@@ -434,34 +444,37 @@ def _xobj_to_image(x_object_obj):
     :return: Tuple[file extension, bytes]
     """
     import io
+
     from PIL import Image
 
-    size = (x_object_obj["/Width"], x_object_obj["/Height"])
+    from PyPDF2.constants import GraphicsStateParameters as G
+
+    size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
     data = x_object_obj.getData()
-    if x_object_obj["/ColorSpace"] == "/DeviceRGB":
+    if x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB:
         mode = "RGB"
     else:
         mode = "P"
     extension = None
-    if "/Filter" in x_object_obj:
-        if x_object_obj["/Filter"] == "/FlateDecode":
+    if SA.FILTER in x_object_obj:
+        if x_object_obj[SA.FILTER] == FT.FLATE_DECODE:
             extension = ".png"
             img = Image.frombytes(mode, size, data)
-            if "/SMask" in x_object_obj:  # add alpha channel
-                alpha = Image.frombytes("L", size, x_object_obj["/SMask"].getData())
+            if G.S_MASK in x_object_obj:  # add alpha channel
+                alpha = Image.frombytes("L", size, x_object_obj[G.S_MASK].getData())
                 img.putalpha(alpha)
             img_byte_arr = io.BytesIO()
             img.save(img_byte_arr, format="PNG")
             data = img_byte_arr.getvalue()
-        elif x_object_obj["/Filter"] in (["/LZWDecode"], ['/ASCII85Decode'], ['/CCITTFaxDecode']):
+        elif x_object_obj[SA.FILTER] in ([FT.LZW_DECODE], [FT.ASCII_85_DECODE], [FT.CCITT_FAX_DECODE]):
             from PyPDF2.utils import b_
             extension = ".png"
             data = b_(data)
-        elif x_object_obj["/Filter"] == "/DCTDecode":
+        elif x_object_obj[SA.FILTER] == FT.DCT_DECODE:
             extension = ".jpg"
-        elif x_object_obj["/Filter"] == "/JPXDecode":
+        elif x_object_obj[SA.FILTER] == "/JPXDecode":
             extension = ".jp2"
-        elif x_object_obj["/Filter"] == "/CCITTFaxDecode":
+        elif x_object_obj[SA.FILTER] == FT.CCITT_FAX_DECODE:
             extension = ".tiff"
     else:
         extension = ".png"