Skip to content

Commit

Permalink
Add test cases for filters.ASCIIHexDecode.
Browse files Browse the repository at this point in the history
  • Loading branch information
acsor committed Aug 12, 2018
1 parent 9f628b3 commit 5c74416
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 25 deletions.
48 changes: 35 additions & 13 deletions PyPDF4/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
import math
from sys import version_info

from .utils import PdfReadError, ord_, paethPredictor
from .utils import PdfReadError, ord_, paethPredictor, PdfStreamError

if version_info < (3, 0):
from cStringIO import StringIO
Expand Down Expand Up @@ -205,31 +205,53 @@ def encode(data):


class ASCIIHexDecode(object):
    """
    The ASCIIHexDecode filter decodes data that has been encoded in ASCII
    hexadecimal form (two hexadecimal digits per output character,
    terminated by the ``>`` end-of-data marker) back into a string.
    """
    def decode(data, decodeParms=None):
        """
        Decodes an ASCII-hexadecimal encoded stream.

        :param data: a str (or bytes/bytearray) sequence of hexadecimal
            digits, optionally interleaved with whitespace, terminated by
            the EOD marker ``>``. Anything after the first ``>`` is ignored.
        :param decodeParms: not used by this filter.
        :return: a str with one character per pair of hexadecimal digits,
            where each of its values v is such that 0 <= ord(v) <= 255.
        :raises PdfStreamError: if no EOD marker ``>`` is present in data.
        :raises ValueError: if a non-hexadecimal, non-whitespace character
            is met before the EOD marker (propagated from ``int()``).
        """
        # On Python 3 iterating over bytes yields ints, which would break the
        # character comparisons below; latin-1 maps every byte value to the
        # character with the same code point. On Python 2 ``bytes is str``,
        # so plain str input is left untouched.
        if isinstance(data, (bytes, bytearray)) and not isinstance(data, str):
            data = data.decode("latin-1")

        decoded = []    # Joined once at the end instead of repeated +=
        hex_pair = ""
        eod_found = False

        for c in data:
            if c == ">":
                # If the filter encounters the EOD marker after reading an odd
                # number of hexadecimal digits, it shall behave as if a 0
                # (zero) followed the last digit - from ISO 32000 specification
                if hex_pair:
                    decoded.append(chr(int(hex_pair + "0", base=16)))
                    hex_pair = ""

                eod_found = True
                break

            if c.isspace():
                continue

            hex_pair += c

            if len(hex_pair) == 2:
                decoded.append(chr(int(hex_pair, base=16)))
                hex_pair = ""

        if not eod_found:
            raise PdfStreamError("Ending character '>' not found in stream")

        return "".join(decoded)

    def encode(data):
        # Encoding is not implemented: this stub returns None.
        pass

    decode = staticmethod(decode)
    encode = staticmethod(encode)


Expand Down
2 changes: 1 addition & 1 deletion PyPDF4/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ def writeToStream(self, stream, encryption_key):
if encryption_key:
bytearr = RC4_encrypt(encryption_key, bytearr)
stream.write(b_("<"))
stream.write(utils.hexencode(bytearr))
stream.write(b_(utils.hexencode(bytearr)))
stream.write(b_(">"))


Expand Down
2 changes: 1 addition & 1 deletion PyPDF4/pagerange.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Copyright (c) 2014, Steve Witham <[email protected]>.
All rights reserved. This software is available under a BSD license;
see https://github.com/claird/PyPDF4/blob/master/LICENSE
see https://github.com/claird/PyPDF4/blob/master/LICENSE.md
"""

import re
Expand Down
7 changes: 4 additions & 3 deletions PyPDF4/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1234,10 +1234,11 @@ def getPage(self, pageNumber):
:return: a :class:`PageObject<pdf.PageObject>` instance.
:rtype: :class:`PageObject<pdf.PageObject>`
"""
## ensure that we're not trying to access an encrypted PDF
#assert not self.trailer.has_key("/Encrypt")
if self.flattenedPages == None:
# Ensure that we're not trying to access an encrypted PDF
# assert not self.trailer.has_key("/Encrypt")
if self.flattenedPages is None:
self._flatten()

return self.flattenedPages[pageNumber]

namedDestinations = property(lambda self:
Expand Down
16 changes: 12 additions & 4 deletions PyPDF4/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,13 +279,21 @@ def barray(b):
return bytearray(b)


def hexencode(s):
    """
    Converts a string (or a bytes-like value) to its hexadecimal string
    representation.

    :param s: the value to convert. On Python 3 a text string is encoded as
        UTF-8 first, while bytes and bytearray values are hex-encoded as
        they are.
    :return: a hex-encoded string, e.g. hexencode("AA") == "4141".
    """
    if sys.version_info[0] < 3:
        return s.encode('hex')

    import binascii

    # bytes/bytearray have no .encode() on Python 3, so only text is
    # converted to bytes before being hex-encoded.
    if not isinstance(s, (bytes, bytearray)):
        s = s.encode("utf-8")

    # hexlify() returns bytes made of ASCII hex digits only.
    return binascii.hexlify(s).decode("ascii")


def hexStr(num):
Expand Down
63 changes: 60 additions & 3 deletions Tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
import unittest

from itertools import product as cartesian_product
from unittest import skip

from PyPDF4.filters import FlateDecode
from PyPDF4.utils import PdfReadError
from PyPDF4.filters import FlateDecode, ASCIIHexDecode
from PyPDF4.utils import PdfReadError, PdfStreamError


class FlateDecodeTestCase(unittest.TestCase):
Expand All @@ -20,7 +21,7 @@ class FlateDecodeTestCase(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.filter_inputs = (
# "", '', """""",
"", '', """""",
string.ascii_lowercase, string.ascii_uppercase,
string.ascii_letters, string.digits, string.hexdigits,
string.punctuation, string.whitespace, # Add more...
Expand Down Expand Up @@ -63,5 +64,61 @@ def test_unsupported_predictor(self):
codec.decode(codec.encode(s), {"/Predictor": predictor})


class ASCIIHexDecodeTestCase(unittest.TestCase):
    """
    Test cases for the ASCIIHexDecode filter.
    """
    @classmethod
    def setUpClass(cls):
        # NOTE: not consumed by any test method in this class yet.
        cls.filter_inputs = (
            "", '', """""",
            ">", ">>", ">>>",
            string.ascii_lowercase, string.ascii_uppercase,
            string.ascii_letters, string.digits, string.hexdigits,
            string.punctuation, string.whitespace,  # Add more...
        )

    def test_expected_results(self):
        """
        Feeds a bunch of values to ASCIIHexDecode.decode() and ensures the
        correct output is returned.

        TODO: What is decode() supposed to do for such inputs as ">>", ">>>"
        or any other not terminated by ">"? (For the latter case, an
        exception is currently raised.)
        """
        inputs = (
            ">", "6162636465666768696a6b6c6d6e6f707172737475767778797a>",
            "4142434445464748494a4b4c4d4e4f505152535455565758595a>",
            "6162636465666768696a6b6c6d6e6f707172737475767778797a4142434445464"
            "748494a4b4c4d4e4f505152535455565758595a>",
            "30313233343536373839>",
            "3 031323334353637 3839>",  # Same as previous, but whitespaced
            "30313233343536373839616263646566414243444546>", "20090a0d0b0c>",
            "7>",       # Odd number of digits: behaves as if "70" was given
            "616>",     # Odd number of digits after a complete pair
            "41>42",    # Data following the EOD marker is ignored
        )
        expected_outputs = (
            "", string.ascii_lowercase, string.ascii_uppercase,
            string.ascii_letters, string.digits, string.digits,
            string.hexdigits, string.whitespace,
            "p", "a`", "A",
        )

        # zip() silently truncates, so make sure no case is dropped
        self.assertEqual(len(inputs), len(expected_outputs))

        for i, o in zip(inputs, expected_outputs):
            self.assertEqual(
                ASCIIHexDecode.decode(i), o,
                msg="i = %s" % i
            )

    def test_no_eod(self):
        """
        Tests when no EOD character is present, ensuring an exception is raised
        """
        inputs = (
            "",                 # Empty stream
            "6", "61", "6162",  # Hexadecimal digits without a trailing ">"
            "6 1", string.whitespace,
        )

        for i in inputs:
            with self.assertRaises(PdfStreamError):
                ASCIIHexDecode.decode(i)


if __name__ == "__main__":
    # The first positional argument of unittest.main() is the *module* to
    # load tests from. Handing it a TestCase class makes the loader look for
    # TestCase subclasses among that class's attributes, so nothing runs.
    # With no argument, every test case defined in this module is run.
    unittest.main()

0 comments on commit 5c74416

Please sign in to comment.