Add test cases for filters.ASCIIHexDecode.

py-pdf · Aug 12, 2018 · 5c74416 · 5c74416
1 parent 9f628b3
commit 5c74416
Show file tree

Hide file tree

Showing 6 changed files with 113 additions and 25 deletions.
diff --git a/PyPDF4/filters.py b/PyPDF4/filters.py
@@ -38,7 +38,7 @@
 import math
 from sys import version_info
 
-from .utils import PdfReadError, ord_, paethPredictor
+from .utils import PdfReadError, ord_, paethPredictor, PdfStreamError
 
 if version_info < (3, 0):
     from cStringIO import StringIO
@@ -205,31 +205,53 @@ def encode(data):
 
 
 class ASCIIHexDecode(object):
+    """
+        The ASCIIHexDecode filter decodes data that has been encoded in ASCII
+        hexadecimal form, turning each pair of hex digits into one character.
+    """
     def decode(data, decodeParms=None):
+        """
+        :param data: a str sequence of hexadecimal-encoded values, terminated
+            by the EOD marker ">", to be converted into a character string
+        :param decodeParms: unused by this filter
+        :return: the decoded string, one character per pair of hex digits, so
+            each of its values v is such that 0 <= ord(v) <= 255.
+        """
         retval = ""
-        char = ""
-        x = 0
-
-        while True:
-            c = data[x]
+        hex_pair = ""
+        eod_found = False
 
+        for c in data:
             if c == ">":
+                # If the filter encounters the EOD marker after reading an odd
+                # number of hexadecimal digits, it shall behave as if a 0
+                # (zero) followed the last digit - from ISO 32000 specification
+                if len(hex_pair) == 1:
+                    hex_pair += "0"
+                    retval += chr(int(hex_pair, base=16))
+                    hex_pair = ""
+
+                eod_found = True
                 break
             elif c.isspace():
-                x += 1
                 continue
 
-            char += c
+            hex_pair += c
+
+            if len(hex_pair) == 2:
+                retval += chr(int(hex_pair, base=16))
+                hex_pair = ""
 
-            if len(char) == 2:
-                retval += chr(int(char, base=16))
-                char = ""
+        if not eod_found:
+            raise PdfStreamError("Ending character '>' not found in stream")
 
-            x += 1
+        assert hex_pair == ""
 
-        assert char == ""
         return retval
 
+    def encode(data):
+        pass
+
     decode = staticmethod(decode)
 
 

diff --git a/PyPDF4/generic.py b/PyPDF4/generic.py
@@ -416,7 +416,7 @@ def writeToStream(self, stream, encryption_key):
         if encryption_key:
             bytearr = RC4_encrypt(encryption_key, bytearr)
         stream.write(b_("<"))
-        stream.write(utils.hexencode(bytearr))
+        stream.write(b_(utils.hexencode(bytearr)))
         stream.write(b_(">"))
 
 

diff --git a/PyPDF4/pagerange.py b/PyPDF4/pagerange.py
@@ -4,7 +4,7 @@
 
 Copyright (c) 2014, Steve Witham <[email protected]>.
 All rights reserved. This software is available under a BSD license;
-see https://github.com/claird/PyPDF4/blob/master/LICENSE
+see https://github.com/claird/PyPDF4/blob/master/LICENSE.md
 """
 
 import re

diff --git a/PyPDF4/pdf.py b/PyPDF4/pdf.py
@@ -1234,10 +1234,11 @@ def getPage(self, pageNumber):
         :return: a :class:`PageObject<pdf.PageObject>` instance.
         :rtype: :class:`PageObject<pdf.PageObject>`
         """
-        ## ensure that we're not trying to access an encrypted PDF
-        #assert not self.trailer.has_key("/Encrypt")
-        if self.flattenedPages == None:
+        # Ensure that we're not trying to access an encrypted PDF
+        # assert not self.trailer.has_key("/Encrypt")
+        if self.flattenedPages is None:
             self._flatten()
+
         return self.flattenedPages[pageNumber]
 
     namedDestinations = property(lambda self:

diff --git a/PyPDF4/utils.py b/PyPDF4/utils.py
@@ -279,13 +279,21 @@ def barray(b):
         return bytearray(b)
 
 
-def hexencode(b):
+def hexencode(s):
+    """
+    Converts a string s to a hexadecimal string representation.
+
+    :param s: a string to convert from UTF-8 characters to a hexadecimal string
+        representation.
+    :return: a hex-encoded string, e.g. hexencode("AA") == "4141".
+    """
     if sys.version_info[0] < 3:
-        return b.encode('hex')
+        return s.encode('hex')
     else:
         import codecs
-        coder = codecs.getencoder('hex_codec')
-        return coder(b)[0]
+        e = codecs.getencoder('hex_codec')
+
+        return e(s.encode("utf-8"))[0].decode("utf-8")
 
 
 def hexStr(num):

diff --git a/Tests/test_filters.py b/Tests/test_filters.py
@@ -8,9 +8,10 @@
 import unittest
 
 from itertools import product as cartesian_product
+from unittest import skip
 
-from PyPDF4.filters import FlateDecode
-from PyPDF4.utils import PdfReadError
+from PyPDF4.filters import FlateDecode, ASCIIHexDecode
+from PyPDF4.utils import PdfReadError, PdfStreamError
 
 
 class FlateDecodeTestCase(unittest.TestCase):
@@ -20,7 +21,7 @@ class FlateDecodeTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.filter_inputs = (
-            # "", '', """""",
+            "", '', """""",
             string.ascii_lowercase, string.ascii_uppercase,
             string.ascii_letters, string.digits, string.hexdigits,
             string.punctuation, string.whitespace,  # Add more...
@@ -63,5 +64,61 @@ def test_unsupported_predictor(self):
                 codec.decode(codec.encode(s), {"/Predictor": predictor})
 
 
+class ASCIIHexDecodeTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.filter_inputs = (
+            "", '', """""",
+            ">", ">>", ">>>",
+            string.ascii_lowercase, string.ascii_uppercase,
+            string.ascii_letters, string.digits, string.hexdigits,
+            string.punctuation, string.whitespace,  # Add more...
+        )
+
+    def test_expected_results(self):
+        """
+        Feeds a bunch of values to ASCIIHexDecode.decode() and ensures the
+        correct output is returned.
+
+        TODO: What is decode() supposed to do for such inputs as ">>", ">>>",
+        or any input not terminated by ">"? (For the latter case, an exception
+        is currently raised.)
+        """
+        inputs = (
+            ">", "6162636465666768696a6b6c6d6e6f707172737475767778797a>",
+            "4142434445464748494a4b4c4d4e4f505152535455565758595a>",
+            "6162636465666768696a6b6c6d6e6f707172737475767778797a4142434445464"
+            "748494a4b4c4d4e4f505152535455565758595a>",
+            "30313233343536373839>",
+            "3  031323334353637   3839>",  # Same as previous, but whitespaced
+            "30313233343536373839616263646566414243444546>", "20090a0d0b0c>",
+        )
+        expected_outputs = (
+            "", string.ascii_lowercase, string.ascii_uppercase,
+            string.ascii_letters, string.digits, string.digits,
+            string.hexdigits, string.whitespace
+        )
+
+        for i, o in zip(inputs, expected_outputs):
+            self.assertEqual(
+                ASCIIHexDecode.decode(i), o,
+                msg="i = %s" % i
+            )
+            # print(
+            #     "ASCIIHexDecode.decode(%s) == %s" % (i, ASCIIHexDecode.decode(i))
+            # )
+
+
+    def test_no_eod(self):
+        """
+        Tests when no EOD character is present, ensuring an exception is raised
+        """
+        inputs = ("", '', """""", '''''')
+
+        for i in inputs:
+            with self.assertRaises(PdfStreamError):
+                ASCIIHexDecode.decode(i)
+
+
 if __name__ == "__main__":
     unittest.main(FlateDecodeTestCase)