From 5c74416e6cb9675628975f12528ca908c554bb63 Mon Sep 17 00:00:00 2001 From: Acsor Date: Sun, 12 Aug 2018 20:20:16 +0200 Subject: [PATCH] Add test cases for `filters.ASCIIHexDecode`. --- PyPDF4/filters.py | 48 ++++++++++++++++++++++++--------- PyPDF4/generic.py | 2 +- PyPDF4/pagerange.py | 2 +- PyPDF4/pdf.py | 7 ++--- PyPDF4/utils.py | 16 ++++++++--- Tests/test_filters.py | 63 ++++++++++++++++++++++++++++++++++++++++--- 6 files changed, 113 insertions(+), 25 deletions(-) diff --git a/PyPDF4/filters.py b/PyPDF4/filters.py index e129a1c7d..49a0e2ecb 100644 --- a/PyPDF4/filters.py +++ b/PyPDF4/filters.py @@ -38,7 +38,7 @@ import math from sys import version_info -from .utils import PdfReadError, ord_, paethPredictor +from .utils import PdfReadError, ord_, paethPredictor, PdfStreamError if version_info < (3, 0): from cStringIO import StringIO @@ -205,31 +205,53 @@ def encode(data): class ASCIIHexDecode(object): + """ + The ASCIIHexDecode filter decodes data that has been encoded in ASCII + hexadecimal form into a 7-bit ASCII format. + """ def decode(data, decodeParms=None): + """ + :param data: a str sequence of hexadecimal-encoded values to be + converted into a 7-bit ASCII string + :param decodeParms: + :return: a string conversion in 7-bit ASCII, where each of its values + v is such that 0 <= ord(v) <= 127. 
+ """ retval = "" - char = "" - x = 0 - - while True: - c = data[x] + hex_pair = "" + eod_found = False + for c in data: if c == ">": + # If the filter encounters the EOD marker after reading an odd + # number of hexadecimal digits, it shall behave as if a 0 + # (zero) followed the last digit - from ISO 32000 specification + if len(hex_pair) == 1: + hex_pair += "0" + retval += chr(int(hex_pair, base=16)) + hex_pair = "" + + eod_found = True break elif c.isspace(): - x += 1 continue - char += c + hex_pair += c + + if len(hex_pair) == 2: + retval += chr(int(hex_pair, base=16)) + hex_pair = "" - if len(char) == 2: - retval += chr(int(char, base=16)) - char = "" + if not eod_found: + raise PdfStreamError("Ending character '>' not found in stream") - x += 1 + assert hex_pair == "" - assert char == "" return retval + def encode(data): + pass + decode = staticmethod(decode) diff --git a/PyPDF4/generic.py b/PyPDF4/generic.py index 11c490baa..33876f2ca 100644 --- a/PyPDF4/generic.py +++ b/PyPDF4/generic.py @@ -416,7 +416,7 @@ def writeToStream(self, stream, encryption_key): if encryption_key: bytearr = RC4_encrypt(encryption_key, bytearr) stream.write(b_("<")) - stream.write(utils.hexencode(bytearr)) + stream.write(b_(utils.hexencode(bytearr))) stream.write(b_(">")) diff --git a/PyPDF4/pagerange.py b/PyPDF4/pagerange.py index b3d410fe7..ba56d6794 100644 --- a/PyPDF4/pagerange.py +++ b/PyPDF4/pagerange.py @@ -4,7 +4,7 @@ Copyright (c) 2014, Steve Witham . All rights reserved. This software is available under a BSD license; -see https://github.com/claird/PyPDF4/blob/master/LICENSE +see https://github.com/claird/PyPDF4/blob/master/LICENSE.md """ import re diff --git a/PyPDF4/pdf.py b/PyPDF4/pdf.py index 8bfd3125f..348980ec8 100644 --- a/PyPDF4/pdf.py +++ b/PyPDF4/pdf.py @@ -1234,10 +1234,11 @@ def getPage(self, pageNumber): :return: a :class:`PageObject` instance. 
:rtype: :class:`PageObject` """ - ## ensure that we're not trying to access an encrypted PDF - #assert not self.trailer.has_key("/Encrypt") - if self.flattenedPages == None: + # Ensure that we're not trying to access an encrypted PDF + # assert not self.trailer.has_key("/Encrypt") + if self.flattenedPages is None: self._flatten() + return self.flattenedPages[pageNumber] namedDestinations = property(lambda self: diff --git a/PyPDF4/utils.py b/PyPDF4/utils.py index 5378a9949..a7bd1793a 100644 --- a/PyPDF4/utils.py +++ b/PyPDF4/utils.py @@ -279,13 +279,21 @@ def barray(b): return bytearray(b) -def hexencode(b): +def hexencode(s): + """ + Converts a string s to a hexadecimal string representation. + + :param s: a string to convert from UTF-8 characters to a hexadecimal string + representation. + :return: a hex-encoded string, e.g. hexencode("AA") == "4141". + """ if sys.version_info[0] < 3: - return b.encode('hex') + return s.encode('hex') else: import codecs - coder = codecs.getencoder('hex_codec') - return coder(b)[0] + e = codecs.getencoder('hex_codec') + + return e(s.encode("utf-8"))[0].decode("utf-8") def hexStr(num): diff --git a/Tests/test_filters.py b/Tests/test_filters.py index 9681ef948..05db24b97 100644 --- a/Tests/test_filters.py +++ b/Tests/test_filters.py @@ -8,9 +8,10 @@ import unittest from itertools import product as cartesian_product +from unittest import skip -from PyPDF4.filters import FlateDecode -from PyPDF4.utils import PdfReadError +from PyPDF4.filters import FlateDecode, ASCIIHexDecode +from PyPDF4.utils import PdfReadError, PdfStreamError class FlateDecodeTestCase(unittest.TestCase): @@ -20,7 +21,7 @@ class FlateDecodeTestCase(unittest.TestCase): @classmethod def setUpClass(cls): cls.filter_inputs = ( - # "", '', """""", + "", '', """""", string.ascii_lowercase, string.ascii_uppercase, string.ascii_letters, string.digits, string.hexdigits, string.punctuation, string.whitespace, # Add more... 
@@ -63,5 +64,61 @@ def test_unsupported_predictor(self): codec.decode(codec.encode(s), {"/Predictor": predictor}) +class ASCIIHexDecodeTestCase(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.filter_inputs = ( + "", '', """""", + ">", ">>", ">>>", + string.ascii_lowercase, string.ascii_uppercase, + string.ascii_letters, string.digits, string.hexdigits, + string.punctuation, string.whitespace, # Add more... + ) + + def test_expected_results(self): + """ + Feeds a bunch of values to ASCIIHexDecode.decode() and ensures the + correct output is returned. + + TO-DO What is decode() supposed to do for such inputs as ">>", ">>>" or + any other not terminated by ">"? (For the latter case, an exception + is currently raised.) + """ + inputs = ( + ">", "6162636465666768696a6b6c6d6e6f707172737475767778797a>", + "4142434445464748494a4b4c4d4e4f505152535455565758595a>", + "6162636465666768696a6b6c6d6e6f707172737475767778797a4142434445464" + "748494a4b4c4d4e4f505152535455565758595a>", + "30313233343536373839>", + "3 031323334353637 3839>", # Same as previous, but whitespaced + "30313233343536373839616263646566414243444546>", "20090a0d0b0c>", + ) + expected_outputs = ( + "", string.ascii_lowercase, string.ascii_uppercase, + string.ascii_letters, string.digits, string.digits, + string.hexdigits, string.whitespace + ) + + for i, o in zip(inputs, expected_outputs): + self.assertEqual( + ASCIIHexDecode.decode(i), o, + msg="i = %s" % i + ) + # print( + # "ASCIIHexDecode.decode(%s) == %s" % (i, ASCIIHexDecode.decode(i)) + # ) + + + def test_no_eod(self): + """ + Tests when no EOD character is present, ensuring an exception is raised + """ + inputs = ("", '', """""", '''''') + + for i in inputs: + with self.assertRaises(PdfStreamError): + ASCIIHexDecode.decode(i) + + if __name__ == "__main__": unittest.main(FlateDecodeTestCase)