Skip to content

Commit

Permalink
MAINT: Separated CCITTFax param parsing/decoding (#841)
Browse files Browse the repository at this point in the history
* BUG: Changed default /K to conform with the PDF 1.7 standard
* TST: Add test for CCITTFax
* TST: Add test for TextStringObject

STY:
* Group Python 2.7 imports
* camelCase variables to snake_case
* Apply black formatter
  • Loading branch information
MartinThoma authored May 1, 2022
1 parent 444fca2 commit d2ed8e5
Show file tree
Hide file tree
Showing 8 changed files with 135 additions and 48 deletions.
16 changes: 7 additions & 9 deletions PyPDF2/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,10 @@

if version_info < (3, 0):
from cStringIO import StringIO
else:
from io import StringIO
if version_info < (3, 0):

BytesIO = StringIO
else:
from io import BytesIO
from io import BytesIO, StringIO


def convertToInt(d, size):
Expand Down Expand Up @@ -567,7 +565,7 @@ def _getPageNumberByIndirect(self, indirectRef):
self._pageId2Num = id2num

if isinstance(indirectRef, NullObject):
return -1
return -1
if isinstance(indirectRef, int):
idnum = indirectRef
else:
Expand Down Expand Up @@ -613,10 +611,10 @@ def _buildDestination(self, title, array):
if self.strict:
raise
else:
#create a link to first Page
return Destination(title, self.getPage(0).indirectRef,
TextStringObject("/Fit"))

# create a link to first Page
return Destination(
title, self.getPage(0).indirectRef, TextStringObject("/Fit")
)

def _buildOutline(self, node):
dest, title, outline = None, None, None
Expand Down
86 changes: 60 additions & 26 deletions PyPDF2/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,21 +176,21 @@ def _decode_png_prediction(data, columns):
rowdata = [
ord_(x) for x in data[(row * rowlength) : ((row + 1) * rowlength)]
]
filterByte = rowdata[0]
if filterByte == 0:
filter_byte = rowdata[0]
if filter_byte == 0:
pass
elif filterByte == 1:
elif filter_byte == 1:
for i in range(2, rowlength):
rowdata[i] = (rowdata[i] + rowdata[i - 1]) % 256
elif filterByte == 2:
elif filter_byte == 2:
for i in range(1, rowlength):
rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
elif filterByte == 3:
elif filter_byte == 3:
for i in range(1, rowlength):
left = rowdata[i - 1] if i > 1 else 0
floor = math.floor(left + prev_rowdata[i]) / 2
rowdata[i] = (rowdata[i] + int(floor)) % 256
elif filterByte == 4:
elif filter_byte == 4:
for i in range(1, rowlength):
left = rowdata[i - 1] if i > 1 else 0
up = prev_rowdata[i]
Expand All @@ -199,7 +199,7 @@ def _decode_png_prediction(data, columns):
rowdata[i] = (rowdata[i] + paeth) % 256
else:
# unsupported PNG filter
raise PdfReadError("Unsupported PNG filter %r" % filterByte)
raise PdfReadError("Unsupported PNG filter %r" % filter_byte)
prev_rowdata = rowdata
output.write("".join([chr(x) for x in rowdata[1:]]))
return output.getvalue()
Expand Down Expand Up @@ -438,31 +438,65 @@ def decode(data, decodeParms=None):
return data


class CCITParameters(object):
    """
    Container for the optional CCITTFaxDecode filter parameters.

    See: TABLE 3.9 Optional parameters for the CCITTFaxDecode filter.

    :param K: encoding scheme selector; defaults to 0.
    :param columns: image width; defaults to 0.
    :param rows: image height; defaults to 0.
    """

    def __init__(self, K=0, columns=0, rows=0):
        # Values supplied by the caller.
        self.K = K
        self.columns = columns  # width
        self.rows = rows  # height

        # Remaining entries are not populated by the constructor.
        self.EndOfBlock = None
        self.EndOfLine = None
        self.EncodedByteAlign = None
        self.DamagedRowsBeforeError = None

    @property
    def group(self):
        """
        Return the CCITT group number derived from ``K``.

        K < 0:  Group 4
        K == 0: Pure one-dimensional encoding (Group 3, 1-D)
        K > 0:  Mixed one- and two-dimensional encoding (Group 3, 2-D)
        """
        return 4 if self.K < 0 else 3


class CCITTFaxDecode(object):
"""
See 3.3.5 CCITTFaxDecode Filter (PDF 1.7 Standard).
Either Group 3 or Group 4 CCITT facsimile (fax) encoding.
CCITT encoding is bit-oriented, not byte-oriented.
See: TABLE 3.9 Optional parameters for the CCITTFaxDecode filter
"""

@staticmethod
def decode(data, decodeParms=None, height=0):
k = 1
width = 0
if decodeParms:
def _get_parameters(parameters, rows):
k = 0
columns = 0
if parameters:
from PyPDF2.generic import ArrayObject

if isinstance(decodeParms, ArrayObject):
for decodeParm in decodeParms:
if isinstance(parameters, ArrayObject):
for decodeParm in parameters:
if CCITT.COLUMNS in decodeParm:
width = decodeParm[CCITT.COLUMNS]
columns = decodeParm[CCITT.COLUMNS]
if CCITT.K in decodeParm:
k = decodeParm[CCITT.K]
else:
width = decodeParms[CCITT.COLUMNS]
k = decodeParms[CCITT.K]
if k == -1:
CCITTgroup = 4
else:
CCITTgroup = 3
columns = parameters[CCITT.COLUMNS]
k = parameters[CCITT.K]

return CCITParameters(k, columns, rows)

@staticmethod
def decode(data, decodeParms=None, height=0):
parms = CCITTFaxDecode._get_parameters(decodeParms, height)

img_size = len(data)
tiff_header_struct = "<2shlh" + "hhll" * 8 + "h"
tiffHeader = struct.pack(
tiff_header = struct.pack(
tiff_header_struct,
b"II", # Byte order indication: Little endian
42, # Version number (always 42)
Expand All @@ -471,19 +505,19 @@ def decode(data, decodeParms=None, height=0):
256,
4,
1,
width, # ImageWidth, LONG, 1, width
parms.columns, # ImageWidth, LONG, 1, width
257,
4,
1,
height, # ImageLength, LONG, 1, length
parms.rows, # ImageLength, LONG, 1, length
258,
3,
1,
1, # BitsPerSample, SHORT, 1, 1
259,
3,
1,
CCITTgroup, # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
parms.group, # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
262,
3,
1,
Expand All @@ -497,15 +531,15 @@ def decode(data, decodeParms=None, height=0):
278,
4,
1,
height, # RowsPerStrip, LONG, 1, length
parms.rows, # RowsPerStrip, LONG, 1, length
279,
4,
1,
img_size, # StripByteCounts, LONG, 1, size of image
0, # last IFD
)

return tiffHeader + data
return tiff_header + data


def decodeStreamData(stream):
Expand Down
6 changes: 2 additions & 4 deletions PyPDF2/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,10 @@

if version_info < (3, 0):
from cStringIO import StringIO
else:
from io import StringIO
if version_info < (3, 0):

BytesIO = StringIO
else:
from io import BytesIO
from io import BytesIO, StringIO

logger = logging.getLogger(__name__)
ObjectPrefix = b_("/<[tf(n%")
Expand Down
5 changes: 1 addition & 4 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,10 @@

if version_info < (3, 0):
from cStringIO import StringIO
else:
from io import StringIO

if version_info < (3, 0):
BytesIO = StringIO
else:
from io import BytesIO # noqa: F401
from io import StringIO, BytesIO # noqa: F401

import codecs # noqa: F401
import warnings # noqa: F401
Expand Down
46 changes: 45 additions & 1 deletion Tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,14 @@
import pytest

from PyPDF2.errors import PdfReadError, PdfStreamError
from PyPDF2.filters import ASCII85Decode, ASCIIHexDecode, FlateDecode
from PyPDF2.filters import (
ASCII85Decode,
ASCIIHexDecode,
CCITParameters,
CCITTFaxDecode,
FlateDecode,
)
from PyPDF2.generic import ArrayObject, DictionaryObject, NumberObject

filter_inputs = (
# "", '', """""",
Expand Down Expand Up @@ -140,3 +147,40 @@ def test_ASCII85Decode_five_zero_bytes():

for expected, i in zip(exp_outputs, inputs):
assert ASCII85Decode.decode(i) == expected


def test_CCITParameters():
    """A default-constructed CCITParameters has K == 0 and reports group 3."""
    defaults = CCITParameters()
    # zero is the default according to page 78
    assert defaults.K == 0
    assert defaults.group == 3


@pytest.mark.parametrize(
    ("parameters", "expected_k"),
    [
        (None, 0),
        (ArrayObject([{"/K": 1}, {"/Columns": 13}]), 1),
    ],
)
def test_CCIT_get_parameters(parameters, expected_k):
    """Check the K value that _get_parameters extracts for each input."""
    parsed = CCITTFaxDecode._get_parameters(parameters=parameters, rows=0)
    assert parsed.K == expected_k


def test_CCITTFaxDecode():
    """Pin the bytes returned for empty data with /K = -1 and /Columns = 17."""
    parameters = DictionaryObject(
        {"/K": NumberObject(-1), "/Columns": NumberObject(17)}
    )

    # Regression value only: this is what PyPDF2 1.27.9 returned. It has not
    # been independently verified to be correct.
    expected = (
        b"II*\x00\x08\x00\x00\x00\x08\x00\x00\x01\x04\x00\x01\x00\x00\x00\x11\x00"
        b"\x00\x00\x01\x01\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x01"
        b"\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00\x03\x01\x03\x00\x01\x00"
        b"\x00\x00\x04\x00\x00\x00\x06\x01\x03\x00\x01\x00\x00\x00\x00\x00"
        b"\x00\x00\x11\x01\x04\x00\x01\x00\x00\x00l\x00\x00\x00\x16\x01"
        b"\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x17\x01\x04\x00\x01\x00"
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    )

    assert CCITTFaxDecode.decode(b"", parameters) == expected
14 changes: 14 additions & 0 deletions Tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
NumberObject,
PdfObject,
RectangleObject,
TextStringObject,
createStringObject,
encode_pdfdocencoding,
readHexStringFromStream,
Expand Down Expand Up @@ -327,3 +328,16 @@ def test_RectangleObject():

ro.upperRight = (13, 17)
assert ro.upperRight == (13, 17)


def test_TextStringObject_exc():
    """get_original_bytes() on a plain TextStringObject raises with a fixed message."""
    text = TextStringObject("foo")
    with pytest.raises(Exception) as excinfo:
        text.get_original_bytes()
    message = excinfo.value.args[0]
    assert message == "no information about original bytes"


def test_TextStringObject_autodetect_utf16():
    """With autodetect_utf16 set, "foo" comes back as FE FF followed by 2-byte units."""
    text = TextStringObject("foo")
    text.autodetect_utf16 = True
    expected = b"\xfe\xff\x00f\x00o\x00o"
    assert text.get_original_bytes() == expected
8 changes: 5 additions & 3 deletions Tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,7 @@ def test_reader_properties():
assert reader.pageMode is None
assert reader.isEncrypted is False


@pytest.mark.parametrize(
"strict",
[(True), (False)],
Expand All @@ -564,14 +565,14 @@ def test_issue604(strict):
bookmarks = pdf.getOutlines()
if "Unknown Destination" not in exc.value.args[0]:
raise Exception("Expected exception not raised")
return # bookmarks not correct
return # bookmarks not correct
else:
pdf = PdfFileReader(f, strict=strict)
bookmarks = pdf.getOutlines()

def getDestPages(x):
# print(x)
if isinstance(x,list):
if isinstance(x, list):
r = [getDestPages(y) for y in x]
return r
else:
Expand All @@ -582,7 +583,8 @@ def getDestPages(x):
b
) in bookmarks: # b can be destination or a list:preferred to just print them
out.append(getDestPages(b))
#print(out)
# print(out)


def test_decode_permissions():
reader = PdfFileReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf"))
Expand Down
2 changes: 1 addition & 1 deletion sample-files

0 comments on commit d2ed8e5

Please sign in to comment.