forked from Git-Lit/git-lit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
alto.py
341 lines (313 loc) · 15.3 KB
/
alto.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on Jan 20, 2016
@author: Tom Morris <[email protected]>
@license: Apache License 2.0
'''
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from array import array
from collections import Counter
from lxml import etree as ET
import re
import sys
PY3 = sys.version_info[0] == 3

# TODO These can be tagged semantically with visual attributes decided later
LOW_QUALITY_STYLE = '[maroon]#%s#'
MED_QUALITY_STYLE = '[grey]#%s#'
# FIXME: This interacts poorly with other processing. Turn off for now.
LOW_QUALITY_THRESHOLD = 0.0 # 0.45
MED_QUALITY_THRESHOLD = 0.0 # 0.65 # Should be higher but generates too much noise in source
# This only matches very basic signatures (lower right page marks)
# BUGFIX: raw string -- '\-' in a non-raw literal is an invalid escape
# sequence (DeprecationWarning/SyntaxWarning on modern Python)
SIGNATURE_REGEX = re.compile(r'^[0-9\-—].$')
class Alto(object):
    '''
    Class to read the ALTO XML format, as used by the British Library, for encoding OCR text.
    http://www.bl.uk/schemas/alto/alto-1-4.xsd
    Canonical ALTO info (current version is v3):
    https://github.com/altoxml/schema
    http://www.loc.gov/standards/alto/
    Interesting fields:
    /alto/Description/sourceImageInformation/fileName
    /alto/Styles
    TextStyle @ID FONTSIZE="10" FONTFAMILY="Courier New"
    ParagraphStyle @ID @ALIGN=Center|Left
    /alto/Layout/Page
    Attributes: @ID @PHYSICAL_IMG_NR HEIGHT WIDTH POSITION=Cover/Left/Right/Single/Foldout QUALITY=OK ACCURACY=float% PC=1.0 (page confidence 0-1.0)
    TopMargin @HPOS VPOS WIDTH HEIGHT
    LeftMargin
    RightMargin
    BottomMargin
    PrintSpace
    ComposedBlock @TYPE=Illustration
    GraphicalElement
    TextBlock @STYLE_REFS="TXT_0 PAR_CENTER" (ids from Styles sections)
    TextLine
    String @HPOS VPOS WIDTH HEIGHT WC="0.68" CC="00459" CONTENT="Myword" SUBS_TYPE="HypPart1|HypPart2"
    SP space
    HYP hyphen
    '''
    # Number of buckets in the per-word confidence histogram (WC runs 0-1.0)
    WORD_CONFIDENCE_HISTOGRAM = 20

    def __init__(self, xmlfile, continuation):
        '''
        Constructor.

        xmlfile -- path or file-like object containing one ALTO document;
                   it is parsed immediately
        continuation -- trailing hyphenated word fragment carried over from
                        the previous page (or None); re-emitted at the start
                        of the first text line of this page
        '''
        self.xmlfile = xmlfile
        self.word_count = 0
        self.avg_word_confidence = None  # 0 - 1.0, None until words are seen
        # array typecode must be unicode on Python 3 and bytes on Python 2
        if PY3:
            self.char_confidence = array(u'L', [0]*10)  # 0=Good to 9=Bad
            self.word_confidence = array(u'L', [0]*Alto.WORD_CONFIDENCE_HISTOGRAM)
        else:
            self.char_confidence = array(b'L', [0]*10)  # 0=Good to 9=Bad
            self.word_confidence = array(b'L', [0]*Alto.WORD_CONFIDENCE_HISTOGRAM)
        self.hyphen1_count = 0  # words tagged HypPart1 (first half of hyphenation)
        self.hyphen2_count = 0  # words tagged HypPart2 (second half; sometimes missing)
        self.text = ''          # accumulated ASCIIDOC-flavored text
        self.page_accuracy = [] # per-page ACCURACY attribute values (percent)
        self.pages = 0
        self.styles = Counter() # counts of STYLEREFS ids seen on TextBlocks
        self.continuation = continuation
        self.parse_file()

    def parseTextBlock(self, block, pageStart, pageMargin):
        """
        Parse the lines, words, spaces, hyphens in a single text block.

        block -- the <TextBlock> element
        pageStart -- True if this is the first block on the page
        pageMargin -- HPOS of the page's <PrintSpace>, used to detect indented blocks

        Returns (words, confidence, text) where confidence is the SUM of the
        per-word WC values (divide by words for an average).

        TODO: Do we need a parameter for paragraph indent, avg jitter, etc?
        """
        # TODO: Setting the threshold is key. We should see a bimodal distribution
        # with lots of values near the left margin, a number near the nominal indent, and few in the middle
        # Perhaps pre-scan all TextBlocks/TextLines on page or in document?
        PARA_INDENT_THRESHOLD = 25  # This value is for experimentation ONLY!
        PARA_INDENT_THRESHOLD2 = 100
        BLOCK_INDENT_THRESHOLD = PARA_INDENT_THRESHOLD  # This value is for experimentation ONLY!
        words = 0
        confidence = 0
        lines = []
        if not pageStart and not self.continuation:
            lines.extend(['', ''])
        lmargin = None
        centered = False
        if 'STYLEREFS' in block.attrib:
            for s in block.attrib['STYLEREFS'].split():
                self.styles[s] += 1
                if s == 'PAR_CENTER':
                    centered = True
        if 'HPOS' in block.attrib:
            lmargin = int(block.attrib['HPOS'])
        else:
            raise Exception("Block with no HPOS, can't continue")
        firstLine = False
        # Anything centered is automatically a new paragraph to deal with chapter heads, etc.
        # Ditto for indented text blocks since they could be blockquote, verse, etc.
        # FIXME: centered/indented blocks continued from the previous page don't count
        paraStart = centered or ((lmargin - pageMargin) > BLOCK_INDENT_THRESHOLD)
        for tl in block:
            if tl.tag == 'TextLine':
                # Start with any hyphenated piece left over
                if self.continuation:
                    lines.append(self.continuation.rstrip('-'))
                    self.continuation = None
                elif paraStart:
                    lines.extend(['[verse]', ''])
                    paraStart = False
                else:
                    lines.append('')
                if 'HPOS' in tl.attrib:
                    indent = int(tl.attrib['HPOS']) - lmargin
                    # TODO: Pages ending with a single line paragraph beginning can have them
                    # split into a separate block. Perhaps we should work off print area margin?
                    if indent > PARA_INDENT_THRESHOLD and indent < PARA_INDENT_THRESHOLD2:
                        lines.append('')  # new paragraph
                    elif indent > PARA_INDENT_THRESHOLD/4 and indent <= PARA_INDENT_THRESHOLD:
                        # BUGFIX: original read 'indent <- PARA_INDENT_THRESHOLD'
                        # (i.e. indent < -25), which can never hold together with
                        # the first conjunct, making this branch unreachable.
                        print(' **Indent in the middle ', indent)
                    elif indent > PARA_INDENT_THRESHOLD2:
                        #print(' ** Unexpected indent -- too far right: ', indent)
                        pass
                else:
                    raise Exception('Something bad happened - no HPOS in TextLine - aborting')
                for elem in tl:
                    if elem.tag == 'String':  # <String> element is just a single word
                        words += 1
                        wc = float(elem.attrib['WC'])
                        confidence += wc
                        # Tag low quality words visually for ASCIIDOC
                        # TODO: Coalesce runs of same attributes
                        s = elem.attrib['CONTENT']
                        if wc < LOW_QUALITY_THRESHOLD:
                            lines[-1] += (LOW_QUALITY_STYLE % s)
                        elif wc < MED_QUALITY_THRESHOLD:
                            lines[-1] += (MED_QUALITY_STYLE % s)
                        else:
                            lines[-1] += s
                        # Update word & character confidence histogram counts.
                        # BUGFIX: original used int(wc*100/HISTOGRAM), which only
                        # ever reached buckets 0-5 of 20; scale by the bucket
                        # count instead, clamping wc == 1.0 into the top bucket.
                        bucket = min(int(wc * Alto.WORD_CONFIDENCE_HISTOGRAM),
                                     Alto.WORD_CONFIDENCE_HISTOGRAM - 1)
                        self.word_confidence[bucket] += 1
                        for c in elem.attrib['CC']:
                            self.char_confidence[int(c)] += 1
                        # Tally counts for hyphenation pieces
                        if 'SUBS_TYPE' in elem.attrib:
                            hy = elem.attrib['SUBS_TYPE']
                            if hy == 'HypPart1':
                                self.hyphen1_count += 1
                            elif hy == 'HypPart2':  # These are sometimes missing
                                self.hyphen2_count += 1
                            else:
                                print('Unrecognized SUBS_TYPE ' + hy, file=sys.stderr)
                    elif elem.tag == 'SP':
                        lines[-1] += ' '
                    elif elem.tag == 'HYP':
                        pass
                    else:
                        print('Unknown tag ' + elem.tag, file=sys.stderr)
                # End of block hyphenation not handled correctly by OCR.
                # BUGFIX: guard against an empty line / childless TextLine before
                # peeking at the last character and last child element.
                if (lines[-1] and lines[-1][-1] == '-') or (len(tl) and tl[-1].tag == 'HYP'):
                    w = lines[-1].split(' ')
                    lines[-1] = ' '.join(w[0:-1])
                    self.continuation = w[-1]
                firstLine = False
        # Postprocess text
        # TODO: Escape ASCIIDOC syntax: ''', <<<, leading space on line, list markers
        for (i, l) in enumerate(lines):
            if len(l) > 1:
                if l[0] == '=' or l[0] == '.':  # Heading & block title markers
                    l = '{empty}'+l
                elif l[1] == '.':  # list marker
                    # Lines starting M. Girardeu get interpreted as lists
                    l = '{empty}' + l  # Escape to prevent bad list processing
                elif l[0:2] == '" ':
                    # Open quote followed by extraneous space
                    # This happens more than just at beginning of line, but it's the most common case
                    # (and the others are more ambiguous and need more sophistication to repair
                    l = '\\"' + l[2:]
                # directional quotes are never ambiguous - clean them all up
                l = l.replace(u'“ ', u'\\“').replace(u' ”', u'\\”')
                # Escape bold/emphasis/monospace markers
                l = l.replace('*', '\\*').replace('_', '\\_').replace('+', '\\+')
                # Extra space before punctuation is not uncommon.
                # BUGFIX: ' .' was replaced with '' (silently deleting the
                # period); replace it with '.' like the other punctuation.
                l = l.replace(' ;', ';').replace(' ,', ',').replace(' .', '.')
                l = l.strip()  # Leading (especially) and trailing whitespace is problematic
                lines[i] = l
        # Move signature marks out of line to a footnote (need better markup)
        # TODO: handle catchwords too, if present/common in the corpus
        if lines and SIGNATURE_REGEX.match(lines[-1]):
            lines[-1] = 'footnote:[Possible signature: "%s"]' % lines[-1]
        # TODO: Fix this crude chapter detector - chapter head can be in multiple blocks, among other things
        # TODO: doesn't handle mid-page chapter heads like doc 000000206
        if pageStart and lines and lines[0].find('CHAPTER') >= 0:
            head = lines[0]
            # Concatenate all following upper case lines into the heading.
            # BUGFIX: original tested the unbound method 'lines[i].isupper'
            # (always truthy, so the break never fired) and popped while
            # iterating a fixed range, skipping entries and risking IndexError.
            while len(lines) > 1 and lines[1].isupper():
                head += ' ' + lines.pop(1)
            lines[0] = head
            lines.insert(1, '-'*len(head))
            lines.insert(2, '')
            lines.insert(0, '')  # Make sure we have a blank line before
        return (words, confidence, '\n'.join(lines))

    def parse_file(self):
        """
        Parse the whole ALTO file, accumulating text, per-page accuracy,
        confidence histograms, and style counts over all <Page> elements.
        """
        confidence = 0
        words = 0
        # TODO: Count pages, extract <Page @ACCURACY>
        # TODO: Analyze <TextBlock @STYLEREFS @ROTATION
        # TODO: Analyze <TextLine
        context = ET.iterparse(self.xmlfile, tag='Page')
        for event, page in context:  # @UnusedVariable
            self.pages += 1
            if 'ACCURACY' in page.attrib:
                self.page_accuracy.append(float(page.attrib['ACCURACY']))
            leaf = page.attrib['PHYSICAL_IMG_NR']
            pageno = ''
            if 'PRINTED_IMG_NR' in page.attrib:
                pageno = ', Page: %s' % page.attrib['PRINTED_IMG_NR']
            self.text += '\n// Leaf %s' % leaf + pageno + '\n'
            pageStart = True
            for ps in page:
                # Note: Body text can also live in the margins TopMargin, BottomMargin, etc
                # if the layout analysis messes up, although normally they only contain
                # header/footer text
                if ps.tag == 'PrintSpace':
                    pageMargin = int(ps.attrib['HPOS'])
                    # TODO: check for indented text blocks (block quote, etc)
                    for tb in ps:
                        if tb.tag == 'TextBlock':
                            (w, c, t) = self.parseTextBlock(tb, pageStart, pageMargin)
                            words += w
                            confidence += c
                            self.text += t
                        elif tb.tag == 'ComposedBlock':
                            # TODO: Can this be anything other than a picture?
                            self.text += ('\n// ComposedBlock (picture?) skipped here %s\n' % tb.attrib['ID'])
                        else:
                            print('Unknown tag in <PrintSpace> ' + tb.tag, file=sys.stderr)
                    pageStart = False
                elif ps.tag in ['TopMargin', 'LeftMargin', 'RightMargin', 'BottomMargin']:
                    pass
                else:
                    print('Unknown tag on <Page> ', ps.tag)
            page.clear()  # Clear the page now that we're done with it
        if words:
            self.avg_word_confidence = confidence / words
            self.word_count = words
        if self.pages > 1:
            # TODO: We kind of assume one page per file now because that's the BL use case
            print('WARNING: Multi-page file: %d pages' % self.pages)
def test():
    """
    Smoke-test driver: parse every ALTO page in a set of British Library
    scanned-book zip files and print aggregate word counts, average word
    confidence, text size, and style usage per book.

    Requires the listed zip files to exist under data/ -- this is a manual
    driver, not an automated unit test.
    """
    # Run through a whole bunch of pages
    files = [
        'data/000000037_0_1-42pgs__944211_dat.zip',
        'data/000000196_0_1-164pgs__1031646_dat.zip',
        'data/000000206_0_1-256pgs__594984_dat.zip',
        'data/000000216_1_1-318pgs__632698_dat.zip',
        'data/000000216_2_1-286pgs__638718_dat.zip',
        'data/000000218_1_1-324pgs__630262_dat.zip',
        'data/000000218_2_1-330pgs__630265_dat.zip',
        'data/000000218_3_1-306pgs__634403_dat.zip',
        'data/000000428_0_1-206pgs__1025980_dat.zip',
        'data/000000472_0_1-178pgs__999442_dat.zip',
    ]
    from zipfile import ZipFile
    print(' File', 'Words', 'Word Confidence (0-1.0)', 'CharCount', 'Styles')
    for f in files:
        words = 0
        confidence = 0
        text = ''
        cc = array('L', [0]*10)  # aggregate character-confidence histogram
        styles = Counter()
        with ZipFile(f) as zf:
            continuation = None  # hyphenated fragment carried across pages
            for name in zf.namelist():
                if name.startswith('ALTO/0'):
                    with zf.open(name) as entry:
                        a = Alto(entry, continuation)
                        if a.word_count:
                            text += a.text
                            words += a.word_count
                            for i in range(10):
                                cc[i] += a.char_confidence[i]
                            confidence += (a.avg_word_confidence * a.word_count)
                            styles.update(a.styles)
                            if a.hyphen1_count != a.hyphen2_count:
                                print('Mismatched hyphenation count ', name, a.hyphen1_count, a.hyphen2_count, file=sys.stderr)
                            # BUGFIX: also require page_accuracy to be non-empty --
                            # a page without an ACCURACY attribute would IndexError
                            if a.avg_word_confidence and a.page_accuracy and abs(a.avg_word_confidence * 100.0 - a.page_accuracy[0]) > 2.0:  # epsilon = 2%
                                print('Inaccurate page accuracy %2.2f %2.2f' % (a.page_accuracy[0], a.avg_word_confidence), file=sys.stderr)
                            continuation = a.continuation
    # BUGFIX: guard against ZeroDivisionError when a zip yields no words
        avg = confidence/words if words else 0.0
        print(f.split('/')[1], words, avg, len(text), styles.most_common())


if __name__ == '__main__':
    test()