diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 3659dd714..41162a421 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -74,12 +74,10 @@ if version_info < (3, 0): from cStringIO import StringIO -else: - from io import StringIO -if version_info < (3, 0): + BytesIO = StringIO else: - from io import BytesIO + from io import BytesIO, StringIO def convertToInt(d, size): @@ -567,7 +565,7 @@ def _getPageNumberByIndirect(self, indirectRef): self._pageId2Num = id2num if isinstance(indirectRef, NullObject): - return -1 + return -1 if isinstance(indirectRef, int): idnum = indirectRef else: @@ -613,10 +611,10 @@ def _buildDestination(self, title, array): if self.strict: raise else: - #create a link to first Page - return Destination(title, self.getPage(0).indirectRef, - TextStringObject("/Fit")) - + # create a link to first Page + return Destination( + title, self.getPage(0).indirectRef, TextStringObject("/Fit") + ) def _buildOutline(self, node): dest, title, outline = None, None, None diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 1d1d37b21..16713c28c 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -176,21 +176,21 @@ def _decode_png_prediction(data, columns): rowdata = [ ord_(x) for x in data[(row * rowlength) : ((row + 1) * rowlength)] ] - filterByte = rowdata[0] - if filterByte == 0: + filter_byte = rowdata[0] + if filter_byte == 0: pass - elif filterByte == 1: + elif filter_byte == 1: for i in range(2, rowlength): rowdata[i] = (rowdata[i] + rowdata[i - 1]) % 256 - elif filterByte == 2: + elif filter_byte == 2: for i in range(1, rowlength): rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256 - elif filterByte == 3: + elif filter_byte == 3: for i in range(1, rowlength): left = rowdata[i - 1] if i > 1 else 0 floor = math.floor(left + prev_rowdata[i]) / 2 rowdata[i] = (rowdata[i] + int(floor)) % 256 - elif filterByte == 4: + elif filter_byte == 4: for i in range(1, rowlength): left = rowdata[i - 1] if i > 1 else 0 up = prev_rowdata[i] @@ -199,7 +199,7 @@ def _decode_png_prediction(data, columns): rowdata[i] = (rowdata[i] + paeth) % 256 else: # unsupported PNG filter - raise PdfReadError("Unsupported PNG filter %r" % filterByte) + raise PdfReadError("Unsupported PNG filter %r" % filter_byte) prev_rowdata = rowdata output.write("".join([chr(x) for x in rowdata[1:]])) return output.getvalue() @@ -438,31 +438,65 @@ def decode(data, decodeParms=None): return data +class CCITParameters(object): + """TABLE 3.9 Optional parameters for the CCITTFaxDecode filter""" + + def __init__(self, K=0, columns=0, rows=0): + self.K = K + self.EndOfBlock = None + self.EndOfLine = None + self.EncodedByteAlign = None + self.columns = columns # width + self.rows = rows # height + self.DamagedRowsBeforeError = None + + @property + def group(self): + if self.K < 0: + CCITTgroup = 4 + else: + # k == 0: Pure one-dimensional encoding (Group 3, 1-D) + # k > 0: Mixed one- and two-dimensional encoding (Group 3, 2-D) + CCITTgroup = 3 + return CCITTgroup + + class CCITTFaxDecode(object): + """ + See 3.3.5 CCITTFaxDecode Filter (PDF 1.7 Standard). + + Either Group 3 or Group 4 CCITT facsimile (fax) encoding. + CCITT encoding is bit-oriented, not byte-oriented. + + See: TABLE 3.9 Optional parameters for the CCITTFaxDecode filter + """ + @staticmethod - def decode(data, decodeParms=None, height=0): - k = 1 - width = 0 - if decodeParms: + def _get_parameters(parameters, rows): + k = 0 + columns = 0 + if parameters: from PyPDF2.generic import ArrayObject - if isinstance(decodeParms, ArrayObject): - for decodeParm in decodeParms: + if isinstance(parameters, ArrayObject): + for decodeParm in parameters: if CCITT.COLUMNS in decodeParm: - width = decodeParm[CCITT.COLUMNS] + columns = decodeParm[CCITT.COLUMNS] if CCITT.K in decodeParm: k = decodeParm[CCITT.K] else: - width = decodeParms[CCITT.COLUMNS] - k = decodeParms[CCITT.K] - if k == -1: - CCITTgroup = 4 - else: - CCITTgroup = 3 + columns = parameters[CCITT.COLUMNS] + k = parameters[CCITT.K] + + return CCITParameters(k, columns, rows) + + @staticmethod + def decode(data, decodeParms=None, height=0): + parms = CCITTFaxDecode._get_parameters(decodeParms, height) img_size = len(data) tiff_header_struct = "<2shlh" + "hhll" * 8 + "h" - tiffHeader = struct.pack( + tiff_header = struct.pack( tiff_header_struct, b"II", # Byte order indication: Little endian 42, # Version number (always 42) @@ -471,11 +505,11 @@ def decode(data, decodeParms=None, height=0): 256, 4, 1, - width, # ImageWidth, LONG, 1, width + parms.columns, # ImageWidth, LONG, 1, width 257, 4, 1, - height, # ImageLength, LONG, 1, length + parms.rows, # ImageLength, LONG, 1, length 258, 3, 1, @@ -483,7 +517,7 @@ def decode(data, decodeParms=None, height=0): 259, 3, 1, - CCITTgroup, # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding + parms.group, # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding 262, 3, 1, @@ -497,7 +531,7 @@ def decode(data, decodeParms=None, height=0): 278, 4, 1, - height, # RowsPerStrip, LONG, 1, length + parms.rows, # RowsPerStrip, LONG, 1, length 279, 4, 1, @@ -505,7 +539,7 @@ def decode(data, decodeParms=None, height=0): 0, # last IFD ) - return tiffHeader + data + return tiff_header + data def decodeStreamData(stream): diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index a505f04f1..c080aeab3 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -63,12 +63,10 @@ if version_info < (3, 0): from cStringIO import StringIO -else: - from io import StringIO -if version_info < (3, 0): + BytesIO = StringIO else: - from io import BytesIO + from io import BytesIO, StringIO logger = logging.getLogger(__name__) ObjectPrefix = b_("/<[tf(n%") diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index 9523a16ea..7cfa25c10 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -42,13 +42,10 @@ if version_info < (3, 0): from cStringIO import StringIO -else: - from io import StringIO -if version_info < (3, 0): BytesIO = StringIO else: - from io import BytesIO # noqa: F401 + from io import StringIO, BytesIO # noqa: F401 import codecs # noqa: F401 import warnings # noqa: F401 diff --git a/Tests/test_filters.py b/Tests/test_filters.py index 7ccbb5e8e..6a135e894 100644 --- a/Tests/test_filters.py +++ b/Tests/test_filters.py @@ -5,7 +5,14 @@ import pytest from PyPDF2.errors import PdfReadError, PdfStreamError -from PyPDF2.filters import ASCII85Decode, ASCIIHexDecode, FlateDecode +from PyPDF2.filters import ( + ASCII85Decode, + ASCIIHexDecode, + CCITParameters, + CCITTFaxDecode, + FlateDecode, +) +from PyPDF2.generic import ArrayObject, DictionaryObject, NumberObject filter_inputs = ( # "", '', """""", @@ -140,3 +147,40 @@ def test_ASCII85Decode_five_zero_bytes(): for expected, i in zip(exp_outputs, inputs): assert ASCII85Decode.decode(i) == expected + + +def test_CCITParameters(): + parms = CCITParameters() + assert parms.K == 0 # zero is the default according to page 78 + assert parms.group == 3 + + +@pytest.mark.parametrize( + ("parameters", "expected_k"), + [ + (None, 0), + (ArrayObject([{"/K": 1}, {"/Columns": 13}]), 1), + ], +) +def test_CCIT_get_parameters(parameters, expected_k): + parmeters = CCITTFaxDecode._get_parameters(parameters=parameters, rows=0) + assert parmeters.K == expected_k + + +def test_CCITTFaxDecode(): + data = b"" + parameters = DictionaryObject( + {"/K": NumberObject(-1), "/Columns": NumberObject(17)} + ) + + # This was just the result PyPDF2 1.27.9 returned. + # It would be awesome if we could check if that is actually correct. + assert CCITTFaxDecode.decode(data, parameters) == ( + b"II*\x00\x08\x00\x00\x00\x08\x00\x00\x01\x04\x00\x01\x00\x00\x00\x11\x00" + b"\x00\x00\x01\x01\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x01" + b"\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00\x03\x01\x03\x00\x01\x00" + b"\x00\x00\x04\x00\x00\x00\x06\x01\x03\x00\x01\x00\x00\x00\x00\x00" + b"\x00\x00\x11\x01\x04\x00\x01\x00\x00\x00l\x00\x00\x00\x16\x01" + b"\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x17\x01\x04\x00\x01\x00" + b"\x00\x00\x00\x00\x00\x00\x00\x00" + ) diff --git a/Tests/test_generic.py b/Tests/test_generic.py index 18cad22db..0b3956eeb 100644 --- a/Tests/test_generic.py +++ b/Tests/test_generic.py @@ -19,6 +19,7 @@ NumberObject, PdfObject, RectangleObject, + TextStringObject, createStringObject, encode_pdfdocencoding, readHexStringFromStream, @@ -327,3 +328,16 @@ def test_RectangleObject(): ro.upperRight = (13, 17) assert ro.upperRight == (13, 17) + + +def test_TextStringObject_exc(): + tso = TextStringObject("foo") + with pytest.raises(Exception) as exc: + tso.get_original_bytes() + assert exc.value.args[0] == "no information about original bytes" + + +def test_TextStringObject_autodetect_utf16(): + tso = TextStringObject("foo") + tso.autodetect_utf16 = True + assert tso.get_original_bytes() == b"\xfe\xff\x00f\x00o\x00o" diff --git a/Tests/test_reader.py b/Tests/test_reader.py index ece58598c..0736a72fa 100644 --- a/Tests/test_reader.py +++ b/Tests/test_reader.py @@ -547,6 +547,7 @@ def test_reader_properties(): assert reader.pageMode is None assert reader.isEncrypted is False + @pytest.mark.parametrize( "strict", [(True), (False)], @@ -564,14 +565,14 @@ def test_issue604(strict): bookmarks = pdf.getOutlines() if "Unknown Destination" not in exc.value.args[0]: raise Exception("Expected exception not raised") - return # bookmarks not correct + return # bookmarks not correct else: pdf = PdfFileReader(f, strict=strict) bookmarks = pdf.getOutlines() def getDestPages(x): # print(x) - if isinstance(x,list): + if isinstance(x, list): r = [getDestPages(y) for y in x] return r else: @@ -582,7 +583,8 @@ def getDestPages(x): b ) in bookmarks: # b can be destination or a list:preferred to just print them out.append(getDestPages(b)) - #print(out) + # print(out) + def test_decode_permissions(): reader = PdfFileReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf")) diff --git a/sample-files b/sample-files index 99e32fff7..41b5cd4f7 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit 99e32fff7002b5308e84696547010c249e5cfb0a +Subproject commit 41b5cd4f774f8fbd8ac42d93b9962f0376352a15