Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial support for CMap character translation #201

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 43 additions & 16 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import math
import struct
import sys
import re
from sys import version_info
if version_info < ( 3, 0 ):
from cStringIO import StringIO
Expand Down Expand Up @@ -1860,6 +1861,17 @@ def createRectangleAccessor(name, fallback):
lambda self: deleteRectangle(self, name)
)

def parseCMap(cstr):
    """
    Parse the ``bfchar`` section of a ToUnicode CMap into a lookup table.

    The text between ``begincmap`` and ``endcmap`` is searched for one
    ``beginbfchar`` ... ``endbfchar`` section, and every line of the form
    ``<srcHex> <dstHex>`` inside it becomes one table entry. Lines that do
    not have that form are ignored. ``bfrange`` sections are not handled.

    :param str cstr: text of the CMap stream, with ``\\n`` line endings.
    :return: dictionary mapping each source character code (``int``) to the
        Unicode text it stands for, or ``None`` when no bfchar section is
        found.
    :rtype: dict or None
    """
    # Local import: only this function needs it.
    import binascii

    # The file supports both Python 2 and 3; ``unichr`` exists only on 2.
    try:
        toChar = unichr
    except NameError:
        toChar = chr

    section = re.search("\nbegincmap\n(?:.*?\n)?[0-9]* beginbfchar\n(.*?)\nendbfchar\n(?:.*?\n)?endcmap\n", cstr, re.DOTALL)
    if section is None:
        return None

    entryPattern = re.compile("\\s*<([0-9a-fA-F]+)>\\s+<([0-9a-fA-F]+)>\\s*")
    result = {}
    for entry in section.group(1).split("\n"):
        pair = entryPattern.match(entry)
        if pair is None:
            continue
        code = int(pair.group(1), 16)
        destination = pair.group(2)
        if len(destination) <= 4:
            # A single code unit: the hex value is the code point itself.
            result[code] = toChar(int(destination, 16))
        else:
            # Longer destinations are UTF-16BE sequences (ligatures,
            # surrogate pairs); reading them as one integer would give a
            # wrong or out-of-range code point. Left-pad to whole 16-bit
            # units and replace malformed sequences rather than raising.
            padded = destination.zfill(len(destination) + (-len(destination)) % 4)
            raw = binascii.unhexlify(padded.encode("ascii"))
            result[code] = raw.decode("utf-16-be", "replace")
    return result


class PageObject(DictionaryObject):
"""
Expand Down Expand Up @@ -2368,30 +2380,45 @@ def extractText(self):
content = self["/Contents"].getObject()
if not isinstance(content, ContentStream):
content = ContentStream(content, self.pdf)
# Note: we check all strings are TextStringObjects. ByteStringObjects
# are strings where the byte->string encoding was unknown, so adding
# them to the text here would be gibberish.

cmap = None
cmaps = {}
firstParagraph = True
# Concatenate TextStringObjects, and translate ByteStringObjects when the
# current font has a CMap. Without a CMap the byte->string encoding is
# unknown, so adding those bytes to the text here would be gibberish.
def translate(text):
if isinstance(text, TextStringObject):
return text
if isinstance(text, ByteStringObject) and cmap != None:
newText = ""
for c in text:
newText += cmap.get(ord_(c),"?")
return newText
return ""

for operands, operator in content.operations:
if operator == b_("Tj"):
_text = operands[0]
if isinstance(_text, TextStringObject):
text += _text
if operator == b_("Tf"):
try:
font = operands[0]
cmap = cmaps.get(font)
if (cmap == None):
cmap = parseCMap(str_(self["/Resources"]["/Font"][font]["/ToUnicode"].getData()))
cmaps[font] = cmap
except KeyError:
cmap = None
elif operator == b_("Tj"):
text += translate(operands[0])
elif operator == b_("T*"):
text += "\n"
elif operator == b_("'"):
text += "\n"
_text = operands[0]
if isinstance(_text, TextStringObject):
text += operands[0]
text += translate(operands[0])
elif operator == b_('"'):
_text = operands[2]
if isinstance(_text, TextStringObject):
text += "\n"
text += _text
text += translate(operands[2])
elif operator == b_("TJ"):
for i in operands[0]:
if isinstance(i, TextStringObject):
text += i
text += translate(i)
return text

mediaBox = createRectangleAccessor("/MediaBox", ())
Expand Down