Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT: Separated CCITTFax param parsing/decoding #841

Merged
merged 1 commit into from
May 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 7 additions & 9 deletions PyPDF2/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,10 @@

if version_info < (3, 0):
from cStringIO import StringIO
else:
from io import StringIO
if version_info < (3, 0):

BytesIO = StringIO
else:
from io import BytesIO
from io import BytesIO, StringIO


def convertToInt(d, size):
Expand Down Expand Up @@ -567,7 +565,7 @@ def _getPageNumberByIndirect(self, indirectRef):
self._pageId2Num = id2num

if isinstance(indirectRef, NullObject):
return -1
return -1
if isinstance(indirectRef, int):
idnum = indirectRef
else:
Expand Down Expand Up @@ -613,10 +611,10 @@ def _buildDestination(self, title, array):
if self.strict:
raise
else:
#create a link to first Page
return Destination(title, self.getPage(0).indirectRef,
TextStringObject("/Fit"))

# create a link to first Page
return Destination(
title, self.getPage(0).indirectRef, TextStringObject("/Fit")
)

def _buildOutline(self, node):
dest, title, outline = None, None, None
Expand Down
86 changes: 60 additions & 26 deletions PyPDF2/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,21 +176,21 @@ def _decode_png_prediction(data, columns):
rowdata = [
ord_(x) for x in data[(row * rowlength) : ((row + 1) * rowlength)]
]
filterByte = rowdata[0]
if filterByte == 0:
filter_byte = rowdata[0]
if filter_byte == 0:
pass
elif filterByte == 1:
elif filter_byte == 1:
for i in range(2, rowlength):
rowdata[i] = (rowdata[i] + rowdata[i - 1]) % 256
elif filterByte == 2:
elif filter_byte == 2:
for i in range(1, rowlength):
rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
elif filterByte == 3:
elif filter_byte == 3:
for i in range(1, rowlength):
left = rowdata[i - 1] if i > 1 else 0
floor = math.floor(left + prev_rowdata[i]) / 2
rowdata[i] = (rowdata[i] + int(floor)) % 256
elif filterByte == 4:
elif filter_byte == 4:
for i in range(1, rowlength):
left = rowdata[i - 1] if i > 1 else 0
up = prev_rowdata[i]
Expand All @@ -199,7 +199,7 @@ def _decode_png_prediction(data, columns):
rowdata[i] = (rowdata[i] + paeth) % 256
else:
# unsupported PNG filter
raise PdfReadError("Unsupported PNG filter %r" % filterByte)
raise PdfReadError("Unsupported PNG filter %r" % filter_byte)
prev_rowdata = rowdata
output.write("".join([chr(x) for x in rowdata[1:]]))
return output.getvalue()
Expand Down Expand Up @@ -438,31 +438,65 @@ def decode(data, decodeParms=None):
return data


class CCITParameters(object):
    """Optional parameters for the CCITTFaxDecode filter (TABLE 3.9).

    Holds the decode parameters of a CCITT fax encoded stream.

    :param K: encoding scheme selector; defaults to 0.
    :param columns: image width in pixels; defaults to 0.
    :param rows: image height in pixels; defaults to 0.

    ``EndOfBlock``, ``EndOfLine``, ``EncodedByteAlign`` and
    ``DamagedRowsBeforeError`` are initialised to ``None`` and are not
    interpreted by this class.
    """

    def __init__(self, K=0, columns=0, rows=0):
        self.K = K
        self.EndOfBlock = None
        self.EndOfLine = None
        self.EncodedByteAlign = None
        self.columns = columns  # width
        self.rows = rows  # height
        self.DamagedRowsBeforeError = None

    @property
    def group(self):
        """Return the CCITT group (3 or 4) selected by ``K``.

        * ``K < 0``: Group 4
        * ``K == 0``: pure one-dimensional encoding (Group 3, 1-D)
        * ``K > 0``: mixed one- and two-dimensional encoding (Group 3, 2-D)
        """
        return 4 if self.K < 0 else 3


class CCITTFaxDecode(object):
"""
See 3.3.5 CCITTFaxDecode Filter (PDF 1.7 Standard).

Either Group 3 or Group 4 CCITT facsimile (fax) encoding.
CCITT encoding is bit-oriented, not byte-oriented.

See: TABLE 3.9 Optional parameters for the CCITTFaxDecode filter
"""

@staticmethod
def decode(data, decodeParms=None, height=0):
k = 1
width = 0
if decodeParms:
def _get_parameters(parameters, rows):
k = 0
columns = 0
if parameters:
from PyPDF2.generic import ArrayObject

if isinstance(decodeParms, ArrayObject):
for decodeParm in decodeParms:
if isinstance(parameters, ArrayObject):
for decodeParm in parameters:
if CCITT.COLUMNS in decodeParm:
width = decodeParm[CCITT.COLUMNS]
columns = decodeParm[CCITT.COLUMNS]
if CCITT.K in decodeParm:
k = decodeParm[CCITT.K]
else:
width = decodeParms[CCITT.COLUMNS]
k = decodeParms[CCITT.K]
if k == -1:
CCITTgroup = 4
else:
CCITTgroup = 3
columns = parameters[CCITT.COLUMNS]
k = parameters[CCITT.K]

return CCITParameters(k, columns, rows)

@staticmethod
def decode(data, decodeParms=None, height=0):
parms = CCITTFaxDecode._get_parameters(decodeParms, height)

img_size = len(data)
tiff_header_struct = "<2shlh" + "hhll" * 8 + "h"
tiffHeader = struct.pack(
tiff_header = struct.pack(
tiff_header_struct,
b"II", # Byte order indication: Little endian
42, # Version number (always 42)
Expand All @@ -471,19 +505,19 @@ def decode(data, decodeParms=None, height=0):
256,
4,
1,
width, # ImageWidth, LONG, 1, width
parms.columns, # ImageWidth, LONG, 1, width
257,
4,
1,
height, # ImageLength, LONG, 1, length
parms.rows, # ImageLength, LONG, 1, length
258,
3,
1,
1, # BitsPerSample, SHORT, 1, 1
259,
3,
1,
CCITTgroup, # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
parms.group, # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
262,
3,
1,
Expand All @@ -497,15 +531,15 @@ def decode(data, decodeParms=None, height=0):
278,
4,
1,
height, # RowsPerStrip, LONG, 1, length
parms.rows, # RowsPerStrip, LONG, 1, length
279,
4,
1,
img_size, # StripByteCounts, LONG, 1, size of image
0, # last IFD
)

return tiffHeader + data
return tiff_header + data


def decodeStreamData(stream):
Expand Down
6 changes: 2 additions & 4 deletions PyPDF2/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,10 @@

if version_info < (3, 0):
from cStringIO import StringIO
else:
from io import StringIO
if version_info < (3, 0):

BytesIO = StringIO
else:
from io import BytesIO
from io import BytesIO, StringIO

logger = logging.getLogger(__name__)
ObjectPrefix = b_("/<[tf(n%")
Expand Down
5 changes: 1 addition & 4 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,10 @@

if version_info < (3, 0):
from cStringIO import StringIO
else:
from io import StringIO

if version_info < (3, 0):
BytesIO = StringIO
else:
from io import BytesIO # noqa: F401
from io import StringIO, BytesIO # noqa: F401

import codecs # noqa: F401
import warnings # noqa: F401
Expand Down
46 changes: 45 additions & 1 deletion Tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,14 @@
import pytest

from PyPDF2.errors import PdfReadError, PdfStreamError
from PyPDF2.filters import ASCII85Decode, ASCIIHexDecode, FlateDecode
from PyPDF2.filters import (
ASCII85Decode,
ASCIIHexDecode,
CCITParameters,
CCITTFaxDecode,
FlateDecode,
)
from PyPDF2.generic import ArrayObject, DictionaryObject, NumberObject

filter_inputs = (
# "", '', """""",
Expand Down Expand Up @@ -140,3 +147,40 @@ def test_ASCII85Decode_five_zero_bytes():

for expected, i in zip(exp_outputs, inputs):
assert ASCII85Decode.decode(i) == expected


def test_CCITParameters():
    """Default-constructed parameters use K == 0 and report group 3."""
    defaults = CCITParameters()
    # zero is the default according to page 78
    assert defaults.K == 0
    assert defaults.group == 3


@pytest.mark.parametrize(
    ("parameters", "expected_k"),
    [
        (None, 0),
        (ArrayObject([{"/K": 1}, {"/Columns": 13}]), 1),
    ],
)
def test_CCIT_get_parameters(parameters, expected_k):
    """``_get_parameters`` yields the expected K for each parametrized input."""
    parsed = CCITTFaxDecode._get_parameters(parameters=parameters, rows=0)
    assert parsed.K == expected_k


def test_CCITTFaxDecode():
    """Decoding empty data with K=-1 and 17 columns yields the pinned bytes."""
    data = b""
    parameters = DictionaryObject(
        {"/K": NumberObject(-1), "/Columns": NumberObject(17)}
    )

    # This was just the result PyPDF2 1.27.9 returned.
    # It would be awesome if we could check if that is actually correct.
    expected = (
        b"II*\x00\x08\x00\x00\x00\x08\x00\x00\x01\x04\x00\x01\x00\x00\x00\x11\x00"
        b"\x00\x00\x01\x01\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x01"
        b"\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00\x03\x01\x03\x00\x01\x00"
        b"\x00\x00\x04\x00\x00\x00\x06\x01\x03\x00\x01\x00\x00\x00\x00\x00"
        b"\x00\x00\x11\x01\x04\x00\x01\x00\x00\x00l\x00\x00\x00\x16\x01"
        b"\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x17\x01\x04\x00\x01\x00"
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    )

    assert CCITTFaxDecode.decode(data, parameters) == expected
14 changes: 14 additions & 0 deletions Tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
NumberObject,
PdfObject,
RectangleObject,
TextStringObject,
createStringObject,
encode_pdfdocencoding,
readHexStringFromStream,
Expand Down Expand Up @@ -327,3 +328,16 @@ def test_RectangleObject():

ro.upperRight = (13, 17)
assert ro.upperRight == (13, 17)


def test_TextStringObject_exc():
    """``get_original_bytes`` raises on a plain ``TextStringObject("foo")``."""
    text = TextStringObject("foo")
    with pytest.raises(Exception) as excinfo:
        text.get_original_bytes()
    # Checked after the ``with`` block: inside it, this line would never run
    # because the call above raises.
    assert excinfo.value.args[0] == "no information about original bytes"


def test_TextStringObject_autodetect_utf16():
    """With ``autodetect_utf16`` set, the original bytes are BOM-prefixed UTF-16."""
    text = TextStringObject("foo")
    text.autodetect_utf16 = True
    expected = b"\xfe\xff\x00f\x00o\x00o"
    assert text.get_original_bytes() == expected
8 changes: 5 additions & 3 deletions Tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,7 @@ def test_reader_properties():
assert reader.pageMode is None
assert reader.isEncrypted is False


@pytest.mark.parametrize(
"strict",
[(True), (False)],
Expand All @@ -564,14 +565,14 @@ def test_issue604(strict):
bookmarks = pdf.getOutlines()
if "Unknown Destination" not in exc.value.args[0]:
raise Exception("Expected exception not raised")
return # bookmarks not correct
return # bookmarks not correct
else:
pdf = PdfFileReader(f, strict=strict)
bookmarks = pdf.getOutlines()

def getDestPages(x):
# print(x)
if isinstance(x,list):
if isinstance(x, list):
r = [getDestPages(y) for y in x]
return r
else:
Expand All @@ -582,7 +583,8 @@ def getDestPages(x):
b
) in bookmarks: # b can be destination or a list:preferred to just print them
out.append(getDestPages(b))
#print(out)
# print(out)


def test_decode_permissions():
reader = PdfFileReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf"))
Expand Down
2 changes: 1 addition & 1 deletion sample-files