MAINT: Separated CCITTFax param parsing/decoding

* BUG: Changed default /K to conform with the PDF 1.7 standard
* TST: Add test for CCITTFax
* TST: Add test for TextStringObject

STY:
* Group Python 2.7 imports
* camelCase variables to snake_case
* Apply black formatter
py-pdf · May 1, 2022 · 73d6f7a · 73d6f7a
1 parent 444fca2
commit 73d6f7a
Show file tree

Hide file tree

Showing 8 changed files with 135 additions and 48 deletions.
diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py
@@ -74,12 +74,10 @@
 
 if version_info < (3, 0):
     from cStringIO import StringIO
-else:
-    from io import StringIO
-if version_info < (3, 0):
+
     BytesIO = StringIO
 else:
-    from io import BytesIO
+    from io import BytesIO, StringIO
 
 
 def convertToInt(d, size):
@@ -567,7 +565,7 @@ def _getPageNumberByIndirect(self, indirectRef):
             self._pageId2Num = id2num
 
         if isinstance(indirectRef, NullObject):
-             return -1
+            return -1
         if isinstance(indirectRef, int):
             idnum = indirectRef
         else:
@@ -613,10 +611,10 @@ def _buildDestination(self, title, array):
             if self.strict:
                 raise
             else:
-                #create a link to first Page
-                return Destination(title, self.getPage(0).indirectRef,
-                                   TextStringObject("/Fit"))
-
+                # create a link to first Page
+                return Destination(
+                    title, self.getPage(0).indirectRef, TextStringObject("/Fit")
+                )
 
     def _buildOutline(self, node):
         dest, title, outline = None, None, None

diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
@@ -176,21 +176,21 @@ def _decode_png_prediction(data, columns):
             rowdata = [
                 ord_(x) for x in data[(row * rowlength) : ((row + 1) * rowlength)]
             ]
-            filterByte = rowdata[0]
-            if filterByte == 0:
+            filter_byte = rowdata[0]
+            if filter_byte == 0:
                 pass
-            elif filterByte == 1:
+            elif filter_byte == 1:
                 for i in range(2, rowlength):
                     rowdata[i] = (rowdata[i] + rowdata[i - 1]) % 256
-            elif filterByte == 2:
+            elif filter_byte == 2:
                 for i in range(1, rowlength):
                     rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
-            elif filterByte == 3:
+            elif filter_byte == 3:
                 for i in range(1, rowlength):
                     left = rowdata[i - 1] if i > 1 else 0
                     floor = math.floor(left + prev_rowdata[i]) / 2
                     rowdata[i] = (rowdata[i] + int(floor)) % 256
-            elif filterByte == 4:
+            elif filter_byte == 4:
                 for i in range(1, rowlength):
                     left = rowdata[i - 1] if i > 1 else 0
                     up = prev_rowdata[i]
@@ -199,7 +199,7 @@ def _decode_png_prediction(data, columns):
                     rowdata[i] = (rowdata[i] + paeth) % 256
             else:
                 # unsupported PNG filter
-                raise PdfReadError("Unsupported PNG filter %r" % filterByte)
+                raise PdfReadError("Unsupported PNG filter %r" % filter_byte)
             prev_rowdata = rowdata
             output.write("".join([chr(x) for x in rowdata[1:]]))
         return output.getvalue()
@@ -438,31 +438,65 @@ def decode(data, decodeParms=None):
         return data
 
 
+class CCITParameters(object):
+    """TABLE 3.9 Optional parameters for the CCITTFaxDecode filter"""
+
+    def __init__(self, K=0, columns=0, rows=0):
+        self.K = K
+        self.EndOfBlock = None
+        self.EndOfLine = None
+        self.EncodedByteAlign = None
+        self.columns = columns  # width
+        self.rows = rows  # height
+        self.DamagedRowsBeforeError = None
+
+    @property
+    def group(self):
+        if self.K < 0:
+            CCITTgroup = 4
+        else:
+            # k == 0: Pure one-dimensional encoding (Group 3, 1-D)
+            # k > 0: Mixed one- and two-dimensional encoding (Group 3, 2-D)
+            CCITTgroup = 3
+        return CCITTgroup
+
+
 class CCITTFaxDecode(object):
+    """
+    See 3.3.5 CCITTFaxDecode Filter (PDF 1.7 Standard).
+
+    Either Group 3 or Group 4 CCITT facsimile (fax) encoding.
+    CCITT encoding is bit-oriented, not byte-oriented.
+
+    See: TABLE 3.9 Optional parameters for the CCITTFaxDecode filter
+    """
+
     @staticmethod
-    def decode(data, decodeParms=None, height=0):
-        k = 1
-        width = 0
-        if decodeParms:
+    def _get_parameters(parameters, rows):
+        k = 0
+        columns = 0
+        if parameters:
             from PyPDF2.generic import ArrayObject
 
-            if isinstance(decodeParms, ArrayObject):
-                for decodeParm in decodeParms:
+            if isinstance(parameters, ArrayObject):
+                for decodeParm in parameters:
                     if CCITT.COLUMNS in decodeParm:
-                        width = decodeParm[CCITT.COLUMNS]
+                        columns = decodeParm[CCITT.COLUMNS]
                     if CCITT.K in decodeParm:
                         k = decodeParm[CCITT.K]
             else:
-                width = decodeParms[CCITT.COLUMNS]
-                k = decodeParms[CCITT.K]
-            if k == -1:
-                CCITTgroup = 4
-            else:
-                CCITTgroup = 3
+                columns = parameters[CCITT.COLUMNS]
+                k = parameters[CCITT.K]
+
+        return CCITParameters(k, columns, rows)
+
+    @staticmethod
+    def decode(data, decodeParms=None, height=0):
+        parms = CCITTFaxDecode._get_parameters(decodeParms, height)
 
         img_size = len(data)
         tiff_header_struct = "<2shlh" + "hhll" * 8 + "h"
-        tiffHeader = struct.pack(
+        tiff_header = struct.pack(
             tiff_header_struct,
             b"II",  # Byte order indication: Little endian
             42,  # Version number (always 42)
@@ -471,19 +505,19 @@ def decode(data, decodeParms=None, height=0):
             256,
             4,
             1,
-            width,  # ImageWidth, LONG, 1, width
+            parms.columns,  # ImageWidth, LONG, 1, width
             257,
             4,
             1,
-            height,  # ImageLength, LONG, 1, length
+            parms.rows,  # ImageLength, LONG, 1, length
             258,
             3,
             1,
             1,  # BitsPerSample, SHORT, 1, 1
             259,
             3,
             1,
-            CCITTgroup,  # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
+            parms.group,  # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
             262,
             3,
             1,
@@ -497,15 +531,15 @@ def decode(data, decodeParms=None, height=0):
             278,
             4,
             1,
-            height,  # RowsPerStrip, LONG, 1, length
+            parms.rows,  # RowsPerStrip, LONG, 1, length
             279,
             4,
             1,
             img_size,  # StripByteCounts, LONG, 1, size of image
             0,  # last IFD
         )
 
-        return tiffHeader + data
+        return tiff_header + data
 
 
 def decodeStreamData(stream):

diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py
@@ -63,12 +63,10 @@
 
 if version_info < (3, 0):
     from cStringIO import StringIO
-else:
-    from io import StringIO
-if version_info < (3, 0):
+
     BytesIO = StringIO
 else:
-    from io import BytesIO
+    from io import BytesIO, StringIO
 
 logger = logging.getLogger(__name__)
 ObjectPrefix = b_("/<[tf(n%")

diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
@@ -42,13 +42,10 @@
 
 if version_info < (3, 0):
     from cStringIO import StringIO
-else:
-    from io import StringIO
 
-if version_info < (3, 0):
     BytesIO = StringIO
 else:
-    from io import BytesIO  # noqa: F401
+    from io import StringIO, BytesIO  # noqa: F401
 
 import codecs  # noqa: F401
 import warnings  # noqa: F401

diff --git a/Tests/test_filters.py b/Tests/test_filters.py
@@ -5,7 +5,14 @@
 import pytest
 
 from PyPDF2.errors import PdfReadError, PdfStreamError
-from PyPDF2.filters import ASCII85Decode, ASCIIHexDecode, FlateDecode
+from PyPDF2.filters import (
+    ASCII85Decode,
+    ASCIIHexDecode,
+    CCITParameters,
+    CCITTFaxDecode,
+    FlateDecode,
+)
+from PyPDF2.generic import ArrayObject, DictionaryObject, NumberObject
 
 filter_inputs = (
     # "", '', """""",
@@ -140,3 +147,40 @@ def test_ASCII85Decode_five_zero_bytes():
 
     for expected, i in zip(exp_outputs, inputs):
         assert ASCII85Decode.decode(i) == expected
+
+
+def test_CCITParameters():
+    parms = CCITParameters()
+    assert parms.K == 0  # zero is the default according to page 78
+    assert parms.group == 3
+
+
+@pytest.mark.parametrize(
+    ("parameters", "expected_k"),
+    [
+        (None, 0),
+        (ArrayObject([{"/K": 1}, {"/Columns": 13}]), 1),
+    ],
+)
+def test_CCIT_get_parameters(parameters, expected_k):
+    parmeters = CCITTFaxDecode._get_parameters(parameters=parameters, rows=0)
+    assert parmeters.K == expected_k
+
+
+def test_CCITTFaxDecode():
+    data = b""
+    parameters = DictionaryObject(
+        {"/K": NumberObject(-1), "/Columns": NumberObject(17)}
+    )
+
+    # This was just the result PyPDF2 1.27.9 returned.
+    # It would be awesome if we could check if that is actually correct.
+    assert CCITTFaxDecode.decode(data, parameters) == (
+        b"II*\x00\x08\x00\x00\x00\x08\x00\x00\x01\x04\x00\x01\x00\x00\x00\x11\x00"
+        b"\x00\x00\x01\x01\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x01"
+        b"\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00\x03\x01\x03\x00\x01\x00"
+        b"\x00\x00\x04\x00\x00\x00\x06\x01\x03\x00\x01\x00\x00\x00\x00\x00"
+        b"\x00\x00\x11\x01\x04\x00\x01\x00\x00\x00l\x00\x00\x00\x16\x01"
+        b"\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x17\x01\x04\x00\x01\x00"
+        b"\x00\x00\x00\x00\x00\x00\x00\x00"
+    )
diff --git a/Tests/test_generic.py b/Tests/test_generic.py
@@ -19,6 +19,7 @@
     NumberObject,
     PdfObject,
     RectangleObject,
+    TextStringObject,
     createStringObject,
     encode_pdfdocencoding,
     readHexStringFromStream,
@@ -327,3 +328,16 @@ def test_RectangleObject():
 
     ro.upperRight = (13, 17)
     assert ro.upperRight == (13, 17)
+
+
+def test_TextStringObject_exc():
+    tso = TextStringObject("foo")
+    with pytest.raises(Exception) as exc:
+        tso.get_original_bytes()
+    assert exc.value.args[0] == "no information about original bytes"
+
+
+def test_TextStringObject_autodetect_utf16():
+    tso = TextStringObject("foo")
+    tso.autodetect_utf16 = True
+    assert tso.get_original_bytes() == b"\xfe\xff\x00f\x00o\x00o"
diff --git a/Tests/test_reader.py b/Tests/test_reader.py
@@ -547,6 +547,7 @@ def test_reader_properties():
     assert reader.pageMode is None
     assert reader.isEncrypted is False
 
+
 @pytest.mark.parametrize(
     "strict",
     [(True), (False)],
@@ -564,14 +565,14 @@ def test_issue604(strict):
                 bookmarks = pdf.getOutlines()
             if "Unknown Destination" not in exc.value.args[0]:
                 raise Exception("Expected exception not raised")
-            return # bookmarks not correct
+            return  # bookmarks not correct
         else:
             pdf = PdfFileReader(f, strict=strict)
             bookmarks = pdf.getOutlines()
 
         def getDestPages(x):
             # print(x)
-            if isinstance(x,list):
+            if isinstance(x, list):
                 r = [getDestPages(y) for y in x]
                 return r
             else:
@@ -582,7 +583,8 @@ def getDestPages(x):
             b
         ) in bookmarks:  # b can be destination or a list:preferred to just print them
             out.append(getDestPages(b))
-    #print(out)
+    # print(out)
+
 
 def test_decode_permissions():
     reader = PdfFileReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf"))

diff --git a/sample-files b/sample-files
+4 −1		.github/workflows/json_consistency.py
+402 −29		LICENSE
+6 −0		README.md