-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
27 changed files
with
581 additions
and
239 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
""" | ||
See Portable Document Format Reference Manual, 1993. ISBN 0-201-62628-4. | ||
See https://ia802202.us.archive.org/8/items/pdfy-0vt8s-egqFwDl7L2/PDF%20Reference%201.0.pdf | ||
PDF Reference, third edition, Version 1.4, 2001. ISBN 0-201-75839-3. | ||
PDF Reference, sixth edition, Version 1.7, 2006. | ||
""" | ||
|
||
|
||
class PagesAttributes: | ||
"""Pages attributes, Table 6.2, Page 52""" | ||
|
||
TYPE = "/Type" # name, required; must be /Pages | ||
KIDS = "/Kids" # array, required; List of indirect references | ||
COUNT = "/Count" # integer, required; the number of all nodes under this node | ||
PARENT = "/Parent" # dictionary, required; indirect reference to pages object | ||
|
||
|
||
class PageAttributes: | ||
"""Page attributes, Table 6.3, Page 53""" | ||
|
||
TYPE = "/Type" # name, required; must be /Page | ||
MEDIABOX = "/MediaBox" # array, required; rectangle specifying page size | ||
PARENT = "/Parent" # dictionary, required; a pages object | ||
RESOURCES = "/Resources" # dictionary, required if there are any | ||
CONTENTS = "/Contents" # stream or array, optional | ||
CROPBOX = "/CropBox" # array, optional; rectangle | ||
ROTATE = "/Rotate" # integer, optional; page rotation in degrees | ||
THUMB = "/Thumb" # stream, optional; indirect reference to image of the page | ||
ANNOTS = "/Annots" # array, optional; an array of annotations | ||
|
||
|
||
class Ressources: | ||
PROCSET = "/ProcSet" # Chapter 6.8.1 | ||
FONT = "/Font" # Chapter 6.8.2 | ||
# encoding | ||
# font descriptors : 6.8.4 | ||
COLOR_SPACE = "/ColorSpace" # Chapter 6.8.5 | ||
XOBJECT = "/XObject" # Chapter 6.8.6 | ||
|
||
|
||
class StreamAttributes: | ||
"""Table 4.2""" | ||
|
||
LENGTH = "/Length" # integer, required | ||
FILTER = "/Filter" # name or array of names, optional | ||
DECODE_PARMS = "/DecodeParms" # variable, optional -- note: the spelling '/DecodeParams' is wrong | ||
|
||
|
||
class FilterTypes: | ||
""" | ||
Table 4.3 of the 1.4 Manual | ||
Page 354 of the 1.7 Manual | ||
""" | ||
|
||
ASCII_HEX_DECODE = "/ASCIIHexDecode" # abbreviation: AHx | ||
ASCII_85_DECODE = "/ASCII85Decode" # abbreviation: A85 | ||
LZW_DECODE = "/LZWDecode" # abbreviation: LZW | ||
FLATE_DECODE = "/FlateDecode" # abbreviation: Fl, PDF 1.2 | ||
RUN_LENGTH_DECODE = "/RunLengthDecode" # abbreviation: RL | ||
CCITT_FAX_DECODE = "/CCITTFaxDecode" # abbreviation: CCF | ||
DCT_DECODE = "/DCTDecode" # abbreviation: DCT | ||
|
||
|
||
class FilterTypeAbbreviations: | ||
""" | ||
Table 4.44 of the 1.7 Manual (page 353ff) | ||
""" | ||
|
||
AHx = "/AHx" | ||
A85 = "/A85" | ||
LZW = "/LZW" | ||
FL = "/Fl" # FlateDecode | ||
RL = "/RL" | ||
CCF = "/CCF" | ||
DCT = "/DCT" | ||
|
||
|
||
class LzwFilterParameters: | ||
"""Table 4.4""" | ||
|
||
PREDICTOR = "/Predictor" # integer | ||
COLUMNS = "/Columns" # integer | ||
COLORS = "/Colors" # integer | ||
BITS_PER_COMPONENT = "/BitsPerComponent" # integer | ||
EARLY_CHANGE = "/EarlyChange" # integer | ||
|
||
|
||
class CcittFaxDecodeParameters: | ||
"""Table 4.5""" | ||
|
||
K = "/K" # integer | ||
END_OF_LINE = "/EndOfLine" # boolean | ||
ENCODED_BYTE_ALIGN = "/EncodedByteAlign" # boolean | ||
COLUMNS = "/Columns" # integer | ||
ROWS = "/Rows" # integer | ||
END_OF_BLOCK = "/EndOfBlock" # boolean | ||
BLACK_IS_1 = "/BlackIs1" # boolean | ||
DAMAGED_ROWS_BEFORE_ERROR = "/DamagedRowsBeforeError" # integer | ||
|
||
|
||
class ImageAttributes: | ||
"""Table 6.20.""" | ||
|
||
TYPE = "/Type" # name, required; must be /XObject | ||
SUBTYPE = "/Subtype" # name, required; must be /Image | ||
NAME = "/Name" # name, required | ||
WIDTH = "/Width" # integer, required | ||
HEIGHT = "/Height" # integer, required | ||
BITS_PER_COMPONENT = "/BitsPerComponent" # integer, required | ||
COLOR_SPACE = "/ColorSpace" # name, required | ||
DECODE = "/Decode" # array, optional | ||
INTERPOLATE = "/Interpolate" # boolean, optional | ||
IMAGE_MASK = "/ImageMask" # boolean, optional | ||
|
||
|
||
class ColorSpaces: | ||
DEVICE_RGB = "/DeviceRGB" | ||
DEVICE_CMYK = "/DeviceCMYK" | ||
DEVICE_GRAY = "/DeviceGray" | ||
|
||
|
||
class TypArguments: | ||
"""Table 8.2 of the PDF 1.7 reference""" | ||
|
||
LEFT = "/Left" | ||
RIGHT = "/Right" | ||
BOTTOM = "/Bottom" | ||
TOP = "/Top" | ||
|
||
|
||
class TypFitArguments: | ||
"""Table 8.2 of the PDF 1.7 reference""" | ||
|
||
FIT = "/Fit" | ||
FIT_V = "/FitV" | ||
FIT_BV = "/FitBV" | ||
FIT_B = "/FitB" | ||
FIT_H = "/FitH" | ||
FIT_BH = "/FitBH" | ||
FIT_R = "/FitR" | ||
|
||
|
||
class PageLayouts: | ||
"""Page 84, PDF 1.4 reference""" | ||
|
||
SINGLE_PAGE = "/SinglePage" | ||
ONE_COLUMN = "/OneColumn" | ||
TWO_COLUMN_LEFT = "/TwoColumnLeft" | ||
TWO_COLUMN_RIGHT = "/TwoColumnRight" | ||
|
||
|
||
class GraphicsStateParameters: | ||
"""Table 4.8 of the 1.7 reference""" | ||
|
||
TYPE = "/Type" # name, optional | ||
LW = "/LW" # number, optional | ||
# TODO: Many more! | ||
FONT = "/Font" # array, optional | ||
S_MASK = "/SMask" # dictionary or name, optional | ||
|
||
|
||
class CatalogDictionary: | ||
"""Table 3.25 in the 1.7 reference""" | ||
|
||
TYPE = "/Type" # name, required; must be /Catalog | ||
# TODO: Many more! | ||
|
||
|
||
PDF_KEYS = [ | ||
PagesAttributes, | ||
PageAttributes, | ||
Ressources, | ||
ImageAttributes, | ||
StreamAttributes, | ||
FilterTypes, | ||
LzwFilterParameters, | ||
TypArguments, | ||
TypFitArguments, | ||
PageLayouts, | ||
GraphicsStateParameters, | ||
CatalogDictionary, | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,13 +31,23 @@ | |
__author_email__ = "[email protected]" | ||
|
||
import math | ||
from sys import version_info | ||
|
||
from PyPDF2.constants import CcittFaxDecodeParameters as CCITT | ||
from PyPDF2.constants import ColorSpaces | ||
from PyPDF2.constants import FilterTypeAbbreviations as FTA | ||
from PyPDF2.constants import FilterTypes as FT | ||
from PyPDF2.constants import ImageAttributes as IA | ||
from PyPDF2.constants import LzwFilterParameters as LZW | ||
from PyPDF2.constants import StreamAttributes as SA | ||
|
||
from .utils import PdfReadError, ord_, paethPredictor | ||
from sys import version_info | ||
|
||
if version_info < ( 3, 0 ): | ||
from cStringIO import StringIO | ||
else: | ||
from io import StringIO | ||
|
||
import struct | ||
|
||
try: | ||
|
@@ -110,13 +120,13 @@ def decode(data, decodeParms): | |
predictor = 1 | ||
if decodeParms: | ||
try: | ||
predictor = decodeParms.get("/Predictor", 1) | ||
predictor = decodeParms.get(LZW.PREDICTOR, 1) | ||
except AttributeError: | ||
pass # usually an array with a null object was read | ||
|
||
# predictor 1 == no predictor | ||
if predictor != 1: | ||
columns = decodeParms["/Columns"] | ||
columns = decodeParms[LZW.COLUMNS] | ||
# PNG prediction: | ||
if predictor >= 10 and predictor <= 15: | ||
output = StringIO() | ||
|
@@ -261,7 +271,7 @@ def decode(self): | |
return baos | ||
|
||
@staticmethod | ||
def decode(data,decodeParams=None): | ||
def decode(data, decodeParms=None): | ||
return LZWDecode.decoder(data).decode() | ||
|
||
|
||
|
@@ -363,7 +373,7 @@ def decode(data, decodeParms=None, height=0): | |
else: | ||
CCITTgroup = 3 | ||
|
||
width = decodeParms["/Columns"] | ||
width = decodeParms[CCITT.COLUMNS] | ||
imgSize = len(data) | ||
tiff_header_struct = '<2shlh' + 'hhll' * 8 + 'h' | ||
tiffHeader = struct.pack(tiff_header_struct, | ||
|
@@ -388,7 +398,7 @@ def decode(data, decodeParms=None, height=0): | |
|
||
def decodeStreamData(stream): | ||
from .generic import NameObject | ||
filters = stream.get("/Filter", ()) | ||
filters = stream.get(SA.FILTER, ()) | ||
|
||
if len(filters) and not isinstance(filters[0], NameObject): | ||
# we have a single filter instance | ||
|
@@ -397,24 +407,24 @@ def decodeStreamData(stream): | |
# If there is not data to decode we should not try to decode the data. | ||
if data: | ||
for filterType in filters: | ||
if filterType == "/FlateDecode" or filterType == "/Fl": | ||
data = FlateDecode.decode(data, stream.get("/DecodeParms")) | ||
elif filterType == "/ASCIIHexDecode" or filterType == "/AHx": | ||
if filterType == FT.FLATE_DECODE or filterType == FTA.FL: | ||
data = FlateDecode.decode(data, stream.get(SA.DECODE_PARMS)) | ||
elif filterType == FT.ASCII_HEX_DECODE or filterType == FTA.AHx: | ||
data = ASCIIHexDecode.decode(data) | ||
elif filterType == "/LZWDecode" or filterType == "/LZW": | ||
data = LZWDecode.decode(data, stream.get("/DecodeParms")) | ||
elif filterType == "/ASCII85Decode" or filterType == "/A85": | ||
elif filterType == FT.LZW_DECODE or filterType == FTA.LZW: | ||
data = LZWDecode.decode(data, stream.get(SA.DECODE_PARMS)) | ||
elif filterType == FT.ASCII_85_DECODE or filterType == FTA.A85: | ||
data = ASCII85Decode.decode(data) | ||
elif filterType == "/DCTDecode": | ||
elif filterType == FT.DCT_DECODE: | ||
data = DCTDecode.decode(data) | ||
elif filterType == "/JPXDecode": | ||
data = JPXDecode.decode(data) | ||
elif filterType == "/CCITTFaxDecode": | ||
height = stream.get("/Height", ()) | ||
data = CCITTFaxDecode.decode(data, stream.get("/DecodeParms"), height) | ||
elif filterType == FT.CCITT_FAX_DECODE: | ||
height = stream.get(IA.HEIGHT, ()) | ||
data = CCITTFaxDecode.decode(data, stream.get(SA.DECODE_PARMS), height) | ||
elif filterType == "/Crypt": | ||
decodeParams = stream.get("/DecodeParams", {}) | ||
if "/Name" not in decodeParams and "/Type" not in decodeParams: | ||
decodeParms = stream.get(SA.DECODE_PARMS, {}) | ||
if "/Name" not in decodeParms and "/Type" not in decodeParms: | ||
pass | ||
else: | ||
raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet") | ||
|
@@ -434,34 +444,37 @@ def _xobj_to_image(x_object_obj): | |
:return: Tuple[file extension, bytes] | ||
""" | ||
import io | ||
|
||
from PIL import Image | ||
|
||
size = (x_object_obj["/Width"], x_object_obj["/Height"]) | ||
from PyPDF2.constants import GraphicsStateParameters as G | ||
|
||
size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT]) | ||
data = x_object_obj.getData() | ||
if x_object_obj["/ColorSpace"] == "/DeviceRGB": | ||
if x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB: | ||
mode = "RGB" | ||
else: | ||
mode = "P" | ||
extension = None | ||
if "/Filter" in x_object_obj: | ||
if x_object_obj["/Filter"] == "/FlateDecode": | ||
if SA.FILTER in x_object_obj: | ||
if x_object_obj[SA.FILTER] == FT.FLATE_DECODE: | ||
extension = ".png" | ||
img = Image.frombytes(mode, size, data) | ||
if "/SMask" in x_object_obj: # add alpha channel | ||
alpha = Image.frombytes("L", size, x_object_obj["/SMask"].getData()) | ||
if G.S_MASK in x_object_obj: # add alpha channel | ||
alpha = Image.frombytes("L", size, x_object_obj[G.S_MASK].getData()) | ||
img.putalpha(alpha) | ||
img_byte_arr = io.BytesIO() | ||
img.save(img_byte_arr, format="PNG") | ||
data = img_byte_arr.getvalue() | ||
elif x_object_obj["/Filter"] in (["/LZWDecode"], ['/ASCII85Decode'], ['/CCITTFaxDecode']): | ||
elif x_object_obj[SA.FILTER] in ([FT.LZW_DECODE], [FT.ASCII_85_DECODE], [FT.CCITT_FAX_DECODE]): | ||
from PyPDF2.utils import b_ | ||
extension = ".png" | ||
data = b_(data) | ||
elif x_object_obj["/Filter"] == "/DCTDecode": | ||
elif x_object_obj[SA.FILTER] == FT.DCT_DECODE: | ||
extension = ".jpg" | ||
elif x_object_obj["/Filter"] == "/JPXDecode": | ||
elif x_object_obj[SA.FILTER] == "/JPXDecode": | ||
extension = ".jp2" | ||
elif x_object_obj["/Filter"] == "/CCITTFaxDecode": | ||
elif x_object_obj[SA.FILTER] == FT.CCITT_FAX_DECODE: | ||
extension = ".tiff" | ||
else: | ||
extension = ".png" | ||
|
Oops, something went wrong.