Skip to content

Commit

Permalink
MAINT: Use grouped constants instead of string literals (#745)
Browse files Browse the repository at this point in the history
Grouping the PDF name constants into classes allows us to leverage the IDE:

* Documentation: We can now document what the constants are good for and give background information around them
* Homographs: We can distinguish literals which have the same name, but different contexts
* Typos: We can hopefully avoid typos like decodeParams -> decodeParms.

For users of PyPDF2, this doesn't change anything. We still use string literals. For documentation we should also keep doing that.
  • Loading branch information
MartinThoma authored Apr 16, 2022
1 parent 87aafd6 commit d5a5eea
Show file tree
Hide file tree
Showing 15 changed files with 393 additions and 170 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/github-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
pip install .
- name: Test with flake8
run: |
flake8 . --ignore=E203,W503,W504,E,F403,F405
flake8 . --ignore=E203,W503,W504,E,F403,F405 --exclude build
if: matrix.python-version != '2.7'
- name: Test with pytest
run: |
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ clean:
rm -rf Tests/__pycache__ PyPDF2/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf PyPDF2.egg-info PyPDF2_pdfLocation.txt

test:
pytest Tests --cov --cov-report term-missing -vv --cov-report html
pytest Tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30

mutation-test:
mutmut run
Expand Down
4 changes: 2 additions & 2 deletions PyPDF2/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .pdf import PdfFileReader, PdfFileWriter
from ._version import __version__
from .merger import PdfFileMerger
from .pagerange import PageRange, parse_filename_page_ranges
from ._version import __version__
from .pdf import PdfFileReader, PdfFileWriter

__all__ = [
"__version__",
Expand Down
186 changes: 186 additions & 0 deletions PyPDF2/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
"""
See Portable Document Format Reference Manual, 1993. ISBN 0-201-62628-4.
See https://ia802202.us.archive.org/8/items/pdfy-0vt8s-egqFwDl7L2/PDF%20Reference%201.0.pdf
PDF Reference, third edition, Version 1.4, 2001. ISBN 0-201-75839-3.
PDF Reference, sixth edition, Version 1.7, 2006.
"""


class PagesAttributes:
    """Pages attributes, Table 6.2, Page 52."""

    TYPE = "/Type"  # name, required; must be /Pages
    KIDS = "/Kids"  # array, required; list of indirect references
    COUNT = "/Count"  # integer, required; the number of all nodes under this node
    PARENT = "/Parent"  # dictionary, required; indirect reference to pages object


class PageAttributes:
    """Page attributes, Table 6.3, Page 53."""

    TYPE = "/Type"  # name, required; must be /Page
    MEDIABOX = "/MediaBox"  # array, required; rectangle specifying page size
    PARENT = "/Parent"  # dictionary, required; a pages object
    RESOURCES = "/Resources"  # dictionary, required if there are any
    CONTENTS = "/Contents"  # stream or array, optional
    CROPBOX = "/CropBox"  # array, optional; rectangle
    ROTATE = "/Rotate"  # integer, optional; page rotation in degrees
    THUMB = "/Thumb"  # stream, optional; indirect reference to image of the page
    ANNOTS = "/Annots"  # array, optional; an array of annotations


class Ressources:
    """Resource names; see the chapter noted next to each entry.

    The class name keeps its existing spelling because PDF_KEYS refers to it
    under this name.
    """

    PROCSET = "/ProcSet"  # Chapter 6.8.1
    FONT = "/Font"  # Chapter 6.8.2
    # encoding
    # font descriptors : 6.8.4
    COLOR_SPACE = "/ColorSpace"  # Chapter 6.8.5
    XOBJECT = "/XObject"  # Chapter 6.8.6


class StreamAttributes:
    """Stream attributes, Table 4.2."""

    LENGTH = "/Length"  # integer, required
    FILTER = "/Filter"  # name or array of names, optional
    # variable, optional; mind the spelling: "/DecodeParams" is wrong
    DECODE_PARMS = "/DecodeParms"


class FilterTypes:
    """
    Full names of the stream filters.

    Table 4.3 of the 1.4 Manual
    Page 354 of the 1.7 Manual
    """

    ASCII_HEX_DECODE = "/ASCIIHexDecode"  # abbreviation: AHx
    ASCII_85_DECODE = "/ASCII85Decode"  # abbreviation: A85
    LZW_DECODE = "/LZWDecode"  # abbreviation: LZW
    FLATE_DECODE = "/FlateDecode"  # abbreviation: Fl, PDF 1.2
    RUN_LENGTH_DECODE = "/RunLengthDecode"  # abbreviation: RL
    CCITT_FAX_DECODE = "/CCITTFaxDecode"  # abbreviation: CCF
    DCT_DECODE = "/DCTDecode"  # abbreviation: DCT


class FilterTypeAbbreviations:
    """
    Abbreviated filter names.

    Table 4.44 of the 1.7 Manual (page 353ff)
    """

    AHx = "/AHx"  # ASCIIHexDecode
    A85 = "/A85"  # ASCII85Decode
    LZW = "/LZW"  # LZWDecode
    FL = "/Fl"  # FlateDecode
    RL = "/RL"  # RunLengthDecode
    CCF = "/CCF"  # CCITTFaxDecode
    DCT = "/DCT"  # DCTDecode


class LzwFilterParameters:
    """LZW filter parameters, Table 4.4."""

    PREDICTOR = "/Predictor"  # integer
    COLUMNS = "/Columns"  # integer
    COLORS = "/Colors"  # integer
    BITS_PER_COMPONENT = "/BitsPerComponent"  # integer
    EARLY_CHANGE = "/EarlyChange"  # integer


class CcittFaxDecodeParameters:
    """CCITTFaxDecode filter parameters, Table 4.5."""

    K = "/K"  # integer
    END_OF_LINE = "/EndOfLine"  # boolean
    ENCODED_BYTE_ALIGN = "/EncodedByteAlign"  # boolean
    COLUMNS = "/Columns"  # integer
    ROWS = "/Rows"  # integer
    END_OF_BLOCK = "/EndOfBlock"  # boolean
    BLACK_IS_1 = "/BlackIs1"  # boolean
    DAMAGED_ROWS_BEFORE_ERROR = "/DamagedRowsBeforeError"  # integer


class ImageAttributes:
    """Image attributes, Table 6.20."""

    TYPE = "/Type"  # name, required; must be /XObject
    SUBTYPE = "/Subtype"  # name, required; must be /Image
    NAME = "/Name"  # name, required
    WIDTH = "/Width"  # integer, required
    HEIGHT = "/Height"  # integer, required
    BITS_PER_COMPONENT = "/BitsPerComponent"  # integer, required
    COLOR_SPACE = "/ColorSpace"  # name, required
    DECODE = "/Decode"  # array, optional
    INTERPOLATE = "/Interpolate"  # boolean, optional
    IMAGE_MASK = "/ImageMask"  # boolean, optional


class ColorSpaces:
    """Color space names, as compared against an object's /ColorSpace entry."""

    DEVICE_RGB = "/DeviceRGB"
    DEVICE_CMYK = "/DeviceCMYK"
    DEVICE_GRAY = "/DeviceGray"


class TypArguments:
    """Table 8.2 of the PDF 1.7 reference."""

    LEFT = "/Left"
    RIGHT = "/Right"
    BOTTOM = "/Bottom"
    TOP = "/Top"


class TypFitArguments:
    """Fit names, Table 8.2 of the PDF 1.7 reference."""

    FIT = "/Fit"
    FIT_V = "/FitV"
    FIT_BV = "/FitBV"
    FIT_B = "/FitB"
    FIT_H = "/FitH"
    FIT_BH = "/FitBH"
    FIT_R = "/FitR"


class PageLayouts:
    """Page layout names, Page 84, PDF 1.4 reference."""

    SINGLE_PAGE = "/SinglePage"
    ONE_COLUMN = "/OneColumn"
    TWO_COLUMN_LEFT = "/TwoColumnLeft"
    TWO_COLUMN_RIGHT = "/TwoColumnRight"


class GraphicsStateParameters:
    """Graphics state parameters, Table 4.8 of the 1.7 reference."""

    TYPE = "/Type"  # name, optional
    LW = "/LW"  # number, optional
    # TODO: Many more!
    FONT = "/Font"  # array, optional
    S_MASK = "/SMask"  # dictionary or name, optional


class CatalogDictionary:
    """Catalog dictionary entries, Table 3.25 in the 1.7 reference."""

    TYPE = "/Type"  # name, required; must be /Catalog
    # TODO: Many more!


# Constant-grouping classes defined in this module.
# NOTE(review): FilterTypeAbbreviations, CcittFaxDecodeParameters and
# ColorSpaces are defined in this module but not listed here -- confirm
# whether that omission is intentional.
PDF_KEYS = [
    PagesAttributes,
    PageAttributes,
    Ressources,
    ImageAttributes,
    StreamAttributes,
    FilterTypes,
    LzwFilterParameters,
    TypArguments,
    TypFitArguments,
    PageLayouts,
    GraphicsStateParameters,
    CatalogDictionary,
]
69 changes: 41 additions & 28 deletions PyPDF2/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,23 @@
__author_email__ = "[email protected]"

import math
from sys import version_info

from PyPDF2.constants import CcittFaxDecodeParameters as CCITT
from PyPDF2.constants import ColorSpaces
from PyPDF2.constants import FilterTypeAbbreviations as FTA
from PyPDF2.constants import FilterTypes as FT
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import LzwFilterParameters as LZW
from PyPDF2.constants import StreamAttributes as SA

from .utils import PdfReadError, ord_, paethPredictor
from sys import version_info

if version_info < ( 3, 0 ):
from cStringIO import StringIO
else:
from io import StringIO

import struct

try:
Expand Down Expand Up @@ -110,13 +120,13 @@ def decode(data, decodeParms):
predictor = 1
if decodeParms:
try:
predictor = decodeParms.get("/Predictor", 1)
predictor = decodeParms.get(LZW.PREDICTOR, 1)
except AttributeError:
pass # usually an array with a null object was read

# predictor 1 == no predictor
if predictor != 1:
columns = decodeParms["/Columns"]
columns = decodeParms[LZW.COLUMNS]
# PNG prediction:
if predictor >= 10 and predictor <= 15:
output = StringIO()
Expand Down Expand Up @@ -261,7 +271,7 @@ def decode(self):
return baos

@staticmethod
def decode(data,decodeParams=None):
def decode(data, decodeParms=None):
return LZWDecode.decoder(data).decode()


Expand Down Expand Up @@ -363,7 +373,7 @@ def decode(data, decodeParms=None, height=0):
else:
CCITTgroup = 3

width = decodeParms["/Columns"]
width = decodeParms[CCITT.COLUMNS]
imgSize = len(data)
tiff_header_struct = '<2shlh' + 'hhll' * 8 + 'h'
tiffHeader = struct.pack(tiff_header_struct,
Expand All @@ -388,7 +398,7 @@ def decode(data, decodeParms=None, height=0):

def decodeStreamData(stream):
from .generic import NameObject
filters = stream.get("/Filter", ())
filters = stream.get(SA.FILTER, ())

if len(filters) and not isinstance(filters[0], NameObject):
# we have a single filter instance
Expand All @@ -397,24 +407,24 @@ def decodeStreamData(stream):
# If there is not data to decode we should not try to decode the data.
if data:
for filterType in filters:
if filterType == "/FlateDecode" or filterType == "/Fl":
data = FlateDecode.decode(data, stream.get("/DecodeParms"))
elif filterType == "/ASCIIHexDecode" or filterType == "/AHx":
if filterType == FT.FLATE_DECODE or filterType == FTA.FL:
data = FlateDecode.decode(data, stream.get(SA.DECODE_PARMS))
elif filterType == FT.ASCII_HEX_DECODE or filterType == FTA.AHx:
data = ASCIIHexDecode.decode(data)
elif filterType == "/LZWDecode" or filterType == "/LZW":
data = LZWDecode.decode(data, stream.get("/DecodeParms"))
elif filterType == "/ASCII85Decode" or filterType == "/A85":
elif filterType == FT.LZW_DECODE or filterType == FTA.LZW:
data = LZWDecode.decode(data, stream.get(SA.DECODE_PARMS))
elif filterType == FT.ASCII_85_DECODE or filterType == FTA.A85:
data = ASCII85Decode.decode(data)
elif filterType == "/DCTDecode":
elif filterType == FT.DCT_DECODE:
data = DCTDecode.decode(data)
elif filterType == "/JPXDecode":
data = JPXDecode.decode(data)
elif filterType == "/CCITTFaxDecode":
height = stream.get("/Height", ())
data = CCITTFaxDecode.decode(data, stream.get("/DecodeParms"), height)
elif filterType == FT.CCITT_FAX_DECODE:
height = stream.get(IA.HEIGHT, ())
data = CCITTFaxDecode.decode(data, stream.get(SA.DECODE_PARMS), height)
elif filterType == "/Crypt":
decodeParams = stream.get("/DecodeParams", {})
if "/Name" not in decodeParams and "/Type" not in decodeParams:
decodeParms = stream.get(SA.DECODE_PARMS, {})
if "/Name" not in decodeParms and "/Type" not in decodeParms:
pass
else:
raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet")
Expand All @@ -434,34 +444,37 @@ def _xobj_to_image(x_object_obj):
:return: Tuple[file extension, bytes]
"""
import io

from PIL import Image

size = (x_object_obj["/Width"], x_object_obj["/Height"])
from PyPDF2.constants import GraphicsStateParameters as G

size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
data = x_object_obj.getData()
if x_object_obj["/ColorSpace"] == "/DeviceRGB":
if x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB:
mode = "RGB"
else:
mode = "P"
extension = None
if "/Filter" in x_object_obj:
if x_object_obj["/Filter"] == "/FlateDecode":
if SA.FILTER in x_object_obj:
if x_object_obj[SA.FILTER] == FT.FLATE_DECODE:
extension = ".png"
img = Image.frombytes(mode, size, data)
if "/SMask" in x_object_obj: # add alpha channel
alpha = Image.frombytes("L", size, x_object_obj["/SMask"].getData())
if G.S_MASK in x_object_obj: # add alpha channel
alpha = Image.frombytes("L", size, x_object_obj[G.S_MASK].getData())
img.putalpha(alpha)
img_byte_arr = io.BytesIO()
img.save(img_byte_arr, format="PNG")
data = img_byte_arr.getvalue()
elif x_object_obj["/Filter"] in (["/LZWDecode"], ['/ASCII85Decode'], ['/CCITTFaxDecode']):
elif x_object_obj[SA.FILTER] in ([FT.LZW_DECODE], [FT.ASCII_85_DECODE], [FT.CCITT_FAX_DECODE]):
from PyPDF2.utils import b_
extension = ".png"
data = b_(data)
elif x_object_obj["/Filter"] == "/DCTDecode":
elif x_object_obj[SA.FILTER] == FT.DCT_DECODE:
extension = ".jpg"
elif x_object_obj["/Filter"] == "/JPXDecode":
elif x_object_obj[SA.FILTER] == "/JPXDecode":
extension = ".jp2"
elif x_object_obj["/Filter"] == "/CCITTFaxDecode":
elif x_object_obj[SA.FILTER] == FT.CCITT_FAX_DECODE:
extension = ".tiff"
else:
extension = ".png"
Expand Down
Loading

0 comments on commit d5a5eea

Please sign in to comment.