From ba74b3e4591bce64f50e3f97c2dff89ed9daf3bd Mon Sep 17 00:00:00 2001 From: JKamlah Date: Fri, 26 Jul 2019 11:53:46 +0200 Subject: [PATCH 01/15] ADD script to create a simplified version of hocr-files. --- hocr-simplify | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100755 hocr-simplify diff --git a/hocr-simplify b/hocr-simplify new file mode 100755 index 0000000..56acc4a --- /dev/null +++ b/hocr-simplify @@ -0,0 +1,64 @@ +#!/usr/bin/env python + +# change level of typesetting and/or remove properties to create a simplified hocr-version + +from __future__ import print_function +import argparse +import re +import sys +import os + +from lxml import etree, html + +parser = argparse.ArgumentParser( + description=('change level of typesetting and/or' + 'remove properties to create a simplified hocr-version') +) +properties = ['baseline','bbox','cflow','cuts','hardbreak','image','imagemd5','lpageno','ppageno','nlp','order','poly','scan_res','textangle','x_booxes','x_font','x_fsize','x_confs','x_scanner','x_source','x_wconf'] +parser.add_argument('file', nargs='?', default=sys.stdin) +parser.add_argument('-t','--typesetting', type=str, choices=['glyph','word','line','par','carea','page'], help='Maximum level of typesetting') +parser.add_argument('-r','--remove-properties', nargs='+', help='List of properties: {}'.format(', '.join(properties))) +parser.add_argument('fileout', nargs='?', help="Outputpath, default: print to terminal") +parser.add_argument('-v', '--verbose', action='store_true', help='Verbose, default: %(default)s') + +args = parser.parse_args() + +doc = html.parse(args.file) +# change level of typesetting +if args.typesetting: + # set maximum level of typesetting + if args.typesetting in ["word"]: + args.typesetting = "ocrx_"+args.typesetting + else: + args.typesetting = "ocr_"+args.typesetting + + # apply new level of typesetting + for node in doc.xpath("//*[@class='{}']".format(args.typesetting)): + 
if args.verbose: + print(re.sub(r'\s+', '\x20', node.text_content()).strip()) + node.text = node.text_content().strip() + for child in list(node): + node.remove(child) + +# remove properties +if args.remove_properties: + for node in doc.xpath("//*[@title]"): + title = node.get("title") + for prop in title.split(";"): + (key, args) = prop.strip().split(None, 1) + if key in args.remove_properties: + if args.verbose: + print("Replaced :{}".format(title)) + title=title.replace(prop+";","").strip() + +# if no outputpath is given, print to terminal +if args.fileout is None: + print(etree.tostring(doc, pretty_print=True).decode('UTF-8')) +else: + # create output path if needed + if not os.path.isdir(os.path.dirname(args.fileout)): + os.makedirs(os.path.dirname(args.fileout)) + + # write new hocr-files + with open(args.fileout, "w") as f: + f.writelines(etree.tostring(doc, pretty_print=True).decode('UTF-8')) From 7385e5ad9c79d2a566ad3f4a2c88cdc5fc7fbf26 Mon Sep 17 00:00:00 2001 From: JKamlah Date: Fri, 26 Jul 2019 12:33:43 +0200 Subject: [PATCH 02/15] Refactored code. 
--- hocr-simplify | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/hocr-simplify b/hocr-simplify index 56acc4a..3f51147 100755 --- a/hocr-simplify +++ b/hocr-simplify @@ -11,13 +11,16 @@ import os from lxml import etree, html parser = argparse.ArgumentParser( - description=('change level of typesetting and/or' - 'remove properties to create a simplified hocr-version') + description=('change level of typesetting and/or' + 'remove properties to create a simplified hocr-version') ) -properties = ['baseline','bbox','cflow','cuts','hardbreak','image','imagemd5','lpageno','ppageno','nlp','order','poly','scan_res','textangle','x_booxes','x_font','x_fsize','x_confs','x_scanner','x_source','x_wconf'] +properties = ['baseline', 'bbox', 'cflow', 'cuts', 'hardbreak', 'image', 'imagemd5', 'lpageno', 'ppageno', 'nlp', + 'order', 'poly', 'scan_res', 'textangle', 'x_booxes', 'x_font', 'x_fsize', 'x_confs', 'x_scanner', + 'x_source', 'x_wconf'] parser.add_argument('file', nargs='?', default=sys.stdin) -parser.add_argument('-t','--typesetting', type=str, choices=['glyph','word','line','par','carea','page'], help='Maximum level of typesetting') -parser.add_argument('-r','--remove-properties', nargs='+', help='List of properties: {}'.format(', '.join(properties))) +parser.add_argument('-t', '--typesetting', type=str, choices=['glyph', 'word', 'line', 'par', 'carea', 'page'], + help='Maximum level of typesetting') +parser.add_argument('-r', '--remove-properties', nargs='+', help='List of properties: {}'.format(', '.join(properties))) parser.add_argument('fileout', nargs='?', help="Outputpath, default: print to terminal") parser.add_argument('-v', '--verbose', action='store_true', help='Verbose, default: %(default)s') @@ -28,9 +31,9 @@ doc = html.parse(args.file) if args.typesetting: # set maximum level of typesetting if args.typesetting in ["word"]: - args.typesetting = "ocrx_"+args.typesetting + args.typesetting = "ocrx_" + args.typesetting 
else: - args.typesetting = "ocr_"+args.typesetting + args.typesetting = "ocr_" + args.typesetting # apply new level of typesetting for node in doc.xpath("//*[@class='{}']".format(args.typesetting)): @@ -38,7 +41,7 @@ if args.typesetting: print(re.sub(r'\s+', '\x20', node.text_content()).strip()) node.text = node.text_content().strip() for child in list(node): - node.remove(child) + node.remove(child) # remove properties if args.remove_properties: @@ -49,7 +52,7 @@ if args.remove_properties: if key in args.remove_properties: if args.verbose: print("Replaced :{}".format(title)) - title=title.replace(prop+";","").strip() + title = title.replace(prop + ";", "").strip() # if no outputpath is given, print to terminal if args.fileout is None: From 4f0a27198d35cf10f9081e2725f894e50bec02e2 Mon Sep 17 00:00:00 2001 From: JKamlah Date: Fri, 26 Jul 2019 12:41:21 +0200 Subject: [PATCH 03/15] Style fixes. --- hocr-simplify | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/hocr-simplify b/hocr-simplify index 3f51147..fd2022c 100755 --- a/hocr-simplify +++ b/hocr-simplify @@ -1,6 +1,7 @@ #!/usr/bin/env python -# change level of typesetting and/or remove properties to create a simplified hocr-version +# change level of typesetting and/or remove properties +# to create a simplified hocr-version from __future__ import print_function import argparse @@ -12,17 +13,23 @@ from lxml import etree, html parser = argparse.ArgumentParser( description=('change level of typesetting and/or' - 'remove properties to create a simplified hocr-version') -) -properties = ['baseline', 'bbox', 'cflow', 'cuts', 'hardbreak', 'image', 'imagemd5', 'lpageno', 'ppageno', 'nlp', - 'order', 'poly', 'scan_res', 'textangle', 'x_booxes', 'x_font', 'x_fsize', 'x_confs', 'x_scanner', - 'x_source', 'x_wconf'] + 'remove properties to create' + 'a simplified hocr-version')) +properties = ['baseline', 'bbox', 'cflow', 'cuts', 'hardbreak', 'image', + 'imagemd5', 'lpageno', 
'ppageno', 'nlp','order', 'poly', + 'scan_res', 'textangle', 'x_booxes', 'x_font', 'x_fsize', + 'x_confs', 'x_scanner', 'x_source', 'x_wconf'] + parser.add_argument('file', nargs='?', default=sys.stdin) -parser.add_argument('-t', '--typesetting', type=str, choices=['glyph', 'word', 'line', 'par', 'carea', 'page'], +parser.add_argument('-t', '--typesetting', type=str, + choices=['glyph', 'word', 'line', 'par', 'carea', 'page'], help='Maximum level of typesetting') -parser.add_argument('-r', '--remove-properties', nargs='+', help='List of properties: {}'.format(', '.join(properties))) -parser.add_argument('fileout', nargs='?', help="Outputpath, default: print to terminal") -parser.add_argument('-v', '--verbose', action='store_true', help='Verbose, default: %(default)s') +parser.add_argument('-r', '--remove-properties', nargs='+', + help='List of properties: {}'.format(','.join(properties))) +parser.add_argument('fileout', nargs='?', + help="Outputpath, default: print to terminal") +parser.add_argument('-v', '--verbose', + action='store_true', help='Verbose, default: %(default)s') args = parser.parse_args() @@ -58,7 +65,7 @@ if args.remove_properties: if args.fileout is None: print(etree.tostring(doc, pretty_print=True).decode('UTF-8')) else: - # create output path if needed + # create output path if needed if not os.path.isdir(os.path.dirname(args.fileout)): os.makedirs(os.path.dirname(args.fileout)) From 9160877aee182190bb1cdc9e4f2f22f20cc1b48c Mon Sep 17 00:00:00 2001 From: JKamlah Date: Fri, 26 Jul 2019 12:49:59 +0200 Subject: [PATCH 04/15] Added ws and removed another. 
--- hocr-simplify | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hocr-simplify b/hocr-simplify index fd2022c..0cc8094 100755 --- a/hocr-simplify +++ b/hocr-simplify @@ -13,10 +13,10 @@ from lxml import etree, html parser = argparse.ArgumentParser( description=('change level of typesetting and/or' - 'remove properties to create' + 'remove properties to create' 'a simplified hocr-version')) properties = ['baseline', 'bbox', 'cflow', 'cuts', 'hardbreak', 'image', - 'imagemd5', 'lpageno', 'ppageno', 'nlp','order', 'poly', + 'imagemd5', 'lpageno', 'ppageno', 'nlp', 'order', 'poly', 'scan_res', 'textangle', 'x_booxes', 'x_font', 'x_fsize', 'x_confs', 'x_scanner', 'x_source', 'x_wconf'] From 50f48552ff40ff14a4fc549bd60d7a23b304165a Mon Sep 17 00:00:00 2001 From: JKamlah Date: Fri, 26 Jul 2019 13:30:42 +0200 Subject: [PATCH 05/15] ADD test case for hocr-simplify --- test/hocr-simplify/hocr-simplify.tsht | 11 +++++++++++ test/smoke.tsht | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 test/hocr-simplify/hocr-simplify.tsht diff --git a/test/hocr-simplify/hocr-simplify.tsht b/test/hocr-simplify/hocr-simplify.tsht new file mode 100644 index 0000000..7987a09 --- /dev/null +++ b/test/hocr-simplify/hocr-simplify.tsht @@ -0,0 +1,11 @@ +#!/usr/bin/env tsht +TESTDATA="../testdata" +SIMPLEFILE="./tess.simple.hocr" + +plan 5 + +after () { + rm -f "$SIMPLEFILE" +} +hocr-simplify "$TESTDATA/tess.hocr" -t page > "$SIMPLEFILE" || fail 'hocr-simplify' +equals 3870 $(ls -l "$SIMPLEFILE" | cut -d " " -f5 ) 'filesize == 3870' diff --git a/test/smoke.tsht b/test/smoke.tsht index ba7bdeb..8659de5 100644 --- a/test/smoke.tsht +++ b/test/smoke.tsht @@ -1,6 +1,6 @@ #!/usr/bin/env tsht -for f in check combine eval eval-geom eval-lines extract-g1000 extract-images lines merge-dc pdf split;do +for f in check combine eval eval-geom eval-lines extract-g1000 extract-images lines merge-dc pdf split simplify;do exec_ok "hocr-$f" "--help" exec_ok "hocr-$f" 
"-h" done From e264c2fe1903e7e3c24e1f41a189197e6feaeb52 Mon Sep 17 00:00:00 2001 From: JKamlah Date: Mon, 5 Aug 2019 15:44:24 +0200 Subject: [PATCH 06/15] FIX remove properties, ADD meta information correction, ADD remove choices (only for tesseract output atm), ADD remove attributes, e.g. id, title. --- hocr-simplify | 49 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/hocr-simplify b/hocr-simplify index 0cc8094..0807ca2 100755 --- a/hocr-simplify +++ b/hocr-simplify @@ -24,16 +24,21 @@ parser.add_argument('file', nargs='?', default=sys.stdin) parser.add_argument('-t', '--typesetting', type=str, choices=['glyph', 'word', 'line', 'par', 'carea', 'page'], help='Maximum level of typesetting') -parser.add_argument('-r', '--remove-properties', nargs='+', +parser.add_argument('-a', '--remove-attributes', nargs='+', + help='Removes attributes, e.g. id') +parser.add_argument('-p', '--remove-properties', nargs='+', help='List of properties: {}'.format(','.join(properties))) +parser.add_argument('-c', '--remove-choices', action='store_true', + help='Removes alternatives (only for tesseract outputs)') parser.add_argument('fileout', nargs='?', - help="Outputpath, default: print to terminal") + help="Output path, default: print to terminal") parser.add_argument('-v', '--verbose', action='store_true', help='Verbose, default: %(default)s') args = parser.parse_args() doc = html.parse(args.file) + # change level of typesetting if args.typesetting: # set maximum level of typesetting @@ -42,11 +47,22 @@ if args.typesetting: else: args.typesetting = "ocr_" + args.typesetting + # update meta content + for node in doc.xpath("//*[@name='ocr-capabilities']"): + content = node.get("content") + if args.typesetting in content: + node.set("content", content.split(args.typesetting)[0] + args.typesetting) + if args.verbose: + print(node.get("content")) + # apply new level of typesetting for node in 
doc.xpath("//*[@class='{}']".format(args.typesetting)): if args.verbose: print(re.sub(r'\s+', '\x20', node.text_content()).strip()) - node.text = node.text_content().strip() + if args.remove_choices or "glyph" in args.typesetting: + node.text = node.text_content().split(" ")[0].strip() + else: + node.text = node.text_content().strip() for child in list(node): node.remove(child) @@ -54,21 +70,28 @@ if args.typesetting: if args.remove_properties: for node in doc.xpath("//*[@title]"): title = node.get("title") - for prop in title.split(";"): - (key, args) = prop.strip().split(None, 1) - if key in args.remove_properties: - if args.verbose: - print("Replaced :{}".format(title)) - title = title.replace(prop + ";", "").strip() + node.set('title', ';'.join([prop.replace("\"","'") for prop in title.split(";") if prop.strip().split(None, 1)[0] not in args.remove_properties])) + if args.verbose: + print("Replaced :{}".format(title)) +else: + # Replace double quotation marks with single + for node in doc.xpath("//*[@title]"): + node.set("title",node.get("title").replace("\"","'")) + +# remove attributes +if args.remove_attributes: + for attr in args.remove_attributes: + for node in doc.xpath(f"//*[@{attr}]"): + node.attrib.pop(f"{attr}") -# if no outputpath is given, print to terminal +# if no output path is given, print to terminal if args.fileout is None: - print(etree.tostring(doc, pretty_print=True).decode('UTF-8')) + print(etree.tostring(doc, pretty_print=True,encoding=str)) else: # create output path if needed if not os.path.isdir(os.path.dirname(args.fileout)): os.makedirs(os.path.dirname(args.fileout)) - # write new hocr-files + # write new hocr file with open(args.fileout, "w") as f: - f.writelines(etree.tostring(doc, pretty_print=True).decode('UTF-8')) + f.writelines(etree.tostring(doc, pretty_print=True,encoding=str)) From be4bb771f2bdf0852bbf26f1fc9ddf482375fe59 Mon Sep 17 00:00:00 2001 From: JKamlah Date: Tue, 6 Aug 2019 14:52:43 +0200 Subject: [PATCH 07/15] FIX 
char encoding, ADD remove-empty-contents which removes empty contents or only containing whitespaces,. --- hocr-simplify | 23 +++++++++++++---------- test/testdata/kraken.hocr | 0 2 files changed, 13 insertions(+), 10 deletions(-) create mode 100644 test/testdata/kraken.hocr diff --git a/hocr-simplify b/hocr-simplify index 0807ca2..29b53cb 100755 --- a/hocr-simplify +++ b/hocr-simplify @@ -15,10 +15,10 @@ parser = argparse.ArgumentParser( description=('change level of typesetting and/or' 'remove properties to create' 'a simplified hocr-version')) -properties = ['baseline', 'bbox', 'cflow', 'cuts', 'hardbreak', 'image', +properties = {'baseline', 'bbox', 'cflow', 'cuts', 'hardbreak', 'image', 'imagemd5', 'lpageno', 'ppageno', 'nlp', 'order', 'poly', 'scan_res', 'textangle', 'x_booxes', 'x_font', 'x_fsize', - 'x_confs', 'x_scanner', 'x_source', 'x_wconf'] + 'x_confs', 'x_scanner', 'x_source', 'x_wconf'} parser.add_argument('file', nargs='?', default=sys.stdin) parser.add_argument('-t', '--typesetting', type=str, @@ -26,10 +26,10 @@ parser.add_argument('-t', '--typesetting', type=str, help='Maximum level of typesetting') parser.add_argument('-a', '--remove-attributes', nargs='+', help='Removes attributes, e.g. 
id') +parser.add_argument('-e', '--remove-empty-contents', action='store_true', + help='Removes contents which are empty or contains whitespaces only') parser.add_argument('-p', '--remove-properties', nargs='+', help='List of properties: {}'.format(','.join(properties))) -parser.add_argument('-c', '--remove-choices', action='store_true', - help='Removes alternatives (only for tesseract outputs)') parser.add_argument('fileout', nargs='?', help="Output path, default: print to terminal") parser.add_argument('-v', '--verbose', @@ -37,7 +37,8 @@ parser.add_argument('-v', '--verbose', args = parser.parse_args() -doc = html.parse(args.file) +with open(args.file,"r",encoding="utf-8") as f: + doc = html.parse(f) # change level of typesetting if args.typesetting: @@ -50,6 +51,7 @@ if args.typesetting: # update meta content for node in doc.xpath("//*[@name='ocr-capabilities']"): content = node.get("content") + if content is None: continue if args.typesetting in content: node.set("content", content.split(args.typesetting)[0] + args.typesetting) if args.verbose: @@ -59,10 +61,11 @@ if args.typesetting: for node in doc.xpath("//*[@class='{}']".format(args.typesetting)): if args.verbose: print(re.sub(r'\s+', '\x20', node.text_content()).strip()) - if args.remove_choices or "glyph" in args.typesetting: - node.text = node.text_content().split(" ")[0].strip() - else: - node.text = node.text_content().strip() + text_content = node.text_content() + if args.remove_empty and text_content.strip() == "": + node.getparent().remove(node) + continue + node.text = "\n".join([text.strip() for text in text_content.splitlines() if text.strip() != ""]) for child in list(node): node.remove(child) @@ -93,5 +96,5 @@ else: os.makedirs(os.path.dirname(args.fileout)) # write new hocr file - with open(args.fileout, "w") as f: + with open(args.fileout, "w", encoding="utf-8") as f: f.writelines(etree.tostring(doc, pretty_print=True,encoding=str)) diff --git a/test/testdata/kraken.hocr 
b/test/testdata/kraken.hocr new file mode 100644 index 0000000..e69de29 From 4fb6a4c38f23a0fb6195720dbf88d52c1795bdb3 Mon Sep 17 00:00:00 2001 From: JKamlah Date: Wed, 7 Aug 2019 10:53:05 +0200 Subject: [PATCH 08/15] ADD remove choices, which removes all lstm_choices (tesseract only),FIX typesetting format problems, REWORK string format 97-98. --- hocr-simplify | 60 ++++++++++++++++++++++---------------- test/testdata/ocropus.hocr | 0 2 files changed, 35 insertions(+), 25 deletions(-) create mode 100644 test/testdata/ocropus.hocr diff --git a/hocr-simplify b/hocr-simplify index 29b53cb..53dd4bf 100755 --- a/hocr-simplify +++ b/hocr-simplify @@ -15,17 +15,22 @@ parser = argparse.ArgumentParser( description=('change level of typesetting and/or' 'remove properties to create' 'a simplified hocr-version')) -properties = {'baseline', 'bbox', 'cflow', 'cuts', 'hardbreak', 'image', + +properties = ['baseline', 'bbox', 'cflow', 'cuts', 'hardbreak', 'image', 'imagemd5', 'lpageno', 'ppageno', 'nlp', 'order', 'poly', 'scan_res', 'textangle', 'x_booxes', 'x_font', 'x_fsize', - 'x_confs', 'x_scanner', 'x_source', 'x_wconf'} + 'x_confs', 'x_scanner', 'x_source', 'x_wconf'] + +typesettings = ['ocrx_word', 'ocr_line', 'ocr_par', 'ocr_carea', 'ocr_page'] parser.add_argument('file', nargs='?', default=sys.stdin) parser.add_argument('-t', '--typesetting', type=str, - choices=['glyph', 'word', 'line', 'par', 'carea', 'page'], - help='Maximum level of typesetting') + choices=typesettings, + help='List of typesetting: {}'.format(','.join(typesettings))) parser.add_argument('-a', '--remove-attributes', nargs='+', help='Removes attributes, e.g. 
id') +parser.add_argument('-c', '--remove-choices', action='store_true', + help='Removes alternatives (tesseract outputs only)') parser.add_argument('-e', '--remove-empty-contents', action='store_true', help='Removes contents which are empty or contains whitespaces only') parser.add_argument('-p', '--remove-properties', nargs='+', @@ -40,34 +45,39 @@ args = parser.parse_args() with open(args.file,"r",encoding="utf-8") as f: doc = html.parse(f) +# delete all nodes where the id-attribute contain lstm_choices +if args.remove_choices: + for node in doc.xpath('.//*[contains(@id,"lstm_choices")]'): + node.getparent().remove(node) + # change level of typesetting if args.typesetting: - # set maximum level of typesetting - if args.typesetting in ["word"]: - args.typesetting = "ocrx_" + args.typesetting - else: - args.typesetting = "ocr_" + args.typesetting - # update meta content - for node in doc.xpath("//*[@name='ocr-capabilities']"): + node = doc.find("//*[@name='ocr-capabilities']") + if node is not None: content = node.get("content") - if content is None: continue - if args.typesetting in content: + if content is not None and args.typesetting in content: node.set("content", content.split(args.typesetting)[0] + args.typesetting) if args.verbose: print(node.get("content")) # apply new level of typesetting - for node in doc.xpath("//*[@class='{}']".format(args.typesetting)): - if args.verbose: - print(re.sub(r'\s+', '\x20', node.text_content()).strip()) - text_content = node.text_content() - if args.remove_empty and text_content.strip() == "": - node.getparent().remove(node) - continue - node.text = "\n".join([text.strip() for text in text_content.splitlines() if text.strip() != ""]) - for child in list(node): - node.remove(child) + for typesetting in typesettings: + for node in doc.xpath("//*[@class='{}']".format(typesetting)): + if args.verbose and typesetting == args.typesetting: + print(re.sub(r'\s+', '\x20', node.text_content()).strip()) + text_content = 
node.text_content() + seperator = "\n" + if "word" in typesetting: + seperator = "" + elif "line" in typesetting: + seperator = " " + node.text = seperator.join([text.strip().replace("\n","") for text in text_content.splitlines() if + not text.strip() != "\n" and args.remove_empty_contents or text.strip() != ""]) + for child in list(node): + node.remove(child) + if typesetting == args.typesetting: + break # remove properties if args.remove_properties: @@ -84,8 +94,8 @@ else: # remove attributes if args.remove_attributes: for attr in args.remove_attributes: - for node in doc.xpath(f"//*[@{attr}]"): - node.attrib.pop(f"{attr}") + for node in doc.xpath("//*[@{}]".format(attr)): + node.attrib.pop("{}".format(attr)) # if no output path is given, print to terminal if args.fileout is None: diff --git a/test/testdata/ocropus.hocr b/test/testdata/ocropus.hocr new file mode 100644 index 0000000..e69de29 From 95fa53ffd8a61c24a5cb7a9cb60cffe4fbdc97d8 Mon Sep 17 00:00:00 2001 From: JKamlah Date: Wed, 7 Aug 2019 10:55:34 +0200 Subject: [PATCH 09/15] FIX max char in line. 
--- hocr-simplify | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/hocr-simplify b/hocr-simplify index 53dd4bf..2386550 100755 --- a/hocr-simplify +++ b/hocr-simplify @@ -72,8 +72,10 @@ if args.typesetting: seperator = "" elif "line" in typesetting: seperator = " " - node.text = seperator.join([text.strip().replace("\n","") for text in text_content.splitlines() if - not text.strip() != "\n" and args.remove_empty_contents or text.strip() != ""]) + node.text = seperator.join([text.strip().replace("\n","") for text in + text_content.splitlines() if + not text.strip() != "\n" and + args.remove_empty_contents or text.strip() != ""]) for child in list(node): node.remove(child) if typesetting == args.typesetting: @@ -83,7 +85,10 @@ if args.typesetting: if args.remove_properties: for node in doc.xpath("//*[@title]"): title = node.get("title") - node.set('title', ';'.join([prop.replace("\"","'") for prop in title.split(";") if prop.strip().split(None, 1)[0] not in args.remove_properties])) + node.set('title', ';'.join([prop.replace("\"","'") for prop in + title.split(";") if + prop.strip().split(None, 1)[0] not in + args.remove_properties])) if args.verbose: print("Replaced :{}".format(title)) else: From 009d74636fff41d8282118b687adf43b3cbfb102 Mon Sep 17 00:00:00 2001 From: JKamlah Date: Wed, 7 Aug 2019 11:09:23 +0200 Subject: [PATCH 10/15] REWORK help messages and comments. 
--- hocr-simplify | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/hocr-simplify b/hocr-simplify index 2386550..3598c82 100755 --- a/hocr-simplify +++ b/hocr-simplify @@ -1,7 +1,12 @@ #!/usr/bin/env python -# change level of typesetting and/or remove properties -# to create a simplified hocr-version +# Create a simplipfied hocr-version by: +# change level of typesetting +# remove properties +# remove attributes +# remove empty contents +# remove character alternatives (choices) + from __future__ import print_function import argparse @@ -26,11 +31,12 @@ typesettings = ['ocrx_word', 'ocr_line', 'ocr_par', 'ocr_carea', 'ocr_page'] parser.add_argument('file', nargs='?', default=sys.stdin) parser.add_argument('-t', '--typesetting', type=str, choices=typesettings, - help='List of typesetting: {}'.format(','.join(typesettings))) + help='Sets a new minimum typesetting level.\n' + 'List of typesetting: {}'.format(','.join(typesettings))) parser.add_argument('-a', '--remove-attributes', nargs='+', help='Removes attributes, e.g. id') parser.add_argument('-c', '--remove-choices', action='store_true', - help='Removes alternatives (tesseract outputs only)') + help='Removes character alternatives (tesseract outputs only)') parser.add_argument('-e', '--remove-empty-contents', action='store_true', help='Removes contents which are empty or contains whitespaces only') parser.add_argument('-p', '--remove-properties', nargs='+', From a7620232a48a399d351d03289a59dfa9e6df9c3b Mon Sep 17 00:00:00 2001 From: JKamlah Date: Wed, 7 Aug 2019 13:26:06 +0200 Subject: [PATCH 11/15] FIX encoding read and write. 
--- hocr-simplify | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/hocr-simplify b/hocr-simplify index 3598c82..582b72f 100755 --- a/hocr-simplify +++ b/hocr-simplify @@ -11,8 +11,9 @@ from __future__ import print_function import argparse import re -import sys import os +from io import open +import sys from lxml import etree, html @@ -110,12 +111,16 @@ if args.remove_attributes: # if no output path is given, print to terminal if args.fileout is None: - print(etree.tostring(doc, pretty_print=True,encoding=str)) + encoding = "utf-8" + if sys.version_info[0] > 2: + encoding = str + print(etree.tostring(doc, pretty_print=True,encoding=encoding)) + else: # create output path if needed if not os.path.isdir(os.path.dirname(args.fileout)): os.makedirs(os.path.dirname(args.fileout)) # write new hocr file - with open(args.fileout, "w", encoding="utf-8") as f: - f.writelines(etree.tostring(doc, pretty_print=True,encoding=str)) + with open(args.fileout, "wb") as f: + f.write(etree.tostring(doc, pretty_print=True,encoding="utf-8")) From 4c44dea5679af96b4282a048928161f37bc8ed82 Mon Sep 17 00:00:00 2001 From: JKamlah Date: Wed, 7 Aug 2019 13:28:48 +0200 Subject: [PATCH 12/15] ADD new tests and testfiles. 
--- test/hocr-simplify/hocr-simplify.tsht | 12 +- test/testdata/tess_choices.hocr | 10221 +++++++++++++++ test/testdata/tess_choices_charboxes.hocr | 13146 ++++++++++++++++++++ 3 files changed, 23376 insertions(+), 3 deletions(-) create mode 100644 test/testdata/tess_choices.hocr create mode 100644 test/testdata/tess_choices_charboxes.hocr diff --git a/test/hocr-simplify/hocr-simplify.tsht b/test/hocr-simplify/hocr-simplify.tsht index 7987a09..b7ff902 100644 --- a/test/hocr-simplify/hocr-simplify.tsht +++ b/test/hocr-simplify/hocr-simplify.tsht @@ -2,10 +2,16 @@ TESTDATA="../testdata" SIMPLEFILE="./tess.simple.hocr" -plan 5 +plan 3 after () { rm -f "$SIMPLEFILE" } -hocr-simplify "$TESTDATA/tess.hocr" -t page > "$SIMPLEFILE" || fail 'hocr-simplify' -equals 3870 $(ls -l "$SIMPLEFILE" | cut -d " " -f5 ) 'filesize == 3870' +hocr-simplify "$TESTDATA/tess.hocr" -t ocr_page > "$SIMPLEFILE" || fail 'hocr-simplify' +equals 3268 $(ls -l "$SIMPLEFILE" | cut -d " " -f5 ) 'filesize == 3268' + +hocr-simplify "$TESTDATA/tess_choices.hocr" -c -t ocr_line > "$SIMPLEFILE" || fail 'hocr-simplify' +equals 9691 $(ls -l "$SIMPLEFILE" | cut -d " " -f5 ) 'filesize == 9691' + +hocr-simplify "$TESTDATA/tess_choices_charboxes.hocr" -c -t ocrx_word > "$SIMPLEFILE" || fail 'hocr-simplify' +equals 58622 $(ls -l "$SIMPLEFILE" | cut -d " " -f5 ) 'filesize == 58622' diff --git a/test/testdata/tess_choices.hocr b/test/testdata/tess_choices.hocr new file mode 100644 index 0000000..f8abc60 --- /dev/null +++ b/test/testdata/tess_choices.hocr @@ -0,0 +1,10221 @@ + + + + + + + + + + +
+
+

+ + 1 + + 1 + l + I + ] + i + | + + Down + + + _ + . + - + + + 1 + + D + P + b + M + O + W + + o + + w + v + + n + m + h + » + + the + + + + t + + h + b + + e + + Rabbit-Hole + + + + R + + a + + b + h + + b + h + + i + + t + + - + + + H + + o + + l + + e + + +

+
+
+

+ + Alice + + A + M + h + d + l + ( + + l + + _ + ] + i + I + + i + l + t + I + j + a + + c + e + o + t + C + s + + e + c + g + s + é + a + + was + + + _ + ' + - + . + + + w + v + m + + a + e + + s + e + g + a + : + c + + beginning + + + _ + . + + b + h + o + D + d + l + + e + + g + s + y + + i + I + t + l + a + + n + m + h + o + u + + n + m + h + u + o + p + + i + I + l + + n + + g + + to + + + + t + c + f + e + + o + 0 + a + O + + get + + + _ + + g + + e + é + c + + t + f + r + + very + + + _ + ' + + v + w + y + - + c + + e + . + + r + + y + v + w + g + + tired + + + + t + + i + + r + v + + e + + d + + of + + + + o + + f + + + sitting + + + ' + + s + e + g + + i + + t + + t + i + s + b + ¢ + r + + i + + n + + g + e + s + , + z + ¢ + + by + + + _ + + b + + y + + her + + + + h + b + + e + + r + y + + sister + + + + s + + i + l + + s + e + g + + t + + e + + r + , + t + y + . + + on + + + + o + + n + m + + the + + + + t + c + + h + + e + c + + bank, + + + _ + + b + + a + + n + m + u + + k + + , + + . + + + + and + + a + g + 4 + 2 + . + e + + n + m + u + p + + i + + d + q + 4 + a + c + g + + of + + + _ + / + { + o + . + + o + 0 + e + a + O + + f + + t + i + + having + + + _ + + h + n + b + m + l + + a + e + + v + w + - + y + c + r + + i + l + + n + o + u + m + i + + g + + d + + nothing + + + g + + n + m + p + h + u + y + + o + a + e + + c + + t + c + r + f + + h + b + l + n + + i + I + + n + m + + g + + to + + + + t + f + c + + o + 0 + a + e + c + + do: + + + _ + / + + d + l + c + + o + a + 0 + n + u + O + + : + ; + . + - + + s + + once + + + _ + + o + a + + n + m + u + v + + c + e + ¢ + t + + e + . + c + s + + or + + + _ + + o + a + + r + ' + + twice + + + + t + + w + v + + i + I + l + + c + e + C + + e + c + + she + + + _ + + s + g + + h + + e + c + + had + + + + h + b + n + m + + a + + d + c + + peeped + + + _ + + p + y + e + n + + e + . 
+ + e + a + c + + p + n + y + j + g + v + + e + + d + + into + + + + i + + n + m + u + h + + t + + o + + the + + + + t + + h + + e + + book + + + + b + d + h + + o + + o + + k + + her + + + + h + + e + + r + + + + sister + + s + S + e + g + c + a + + i + I + l + + t + 1 + + s + g + c + e + t + z + + t + c + v + i + w + + e + c + a + + r + . + t + + was + + + _ + + w + v + m + + a + e + + s + e + g + + reading, + + + + r + v + t + f + , + m + + e + + a + e + g + z + + d + c + q + l + + i + I + + n + m + u + h + + g + + , + . + + but + + + + b + o + h + D + + u + + t + r + + it + + + / + + i + l + I + t + + t + c + ¢ + s + e + - + + had + + + + h + n + b + H + + a + e + + d + c + + no + + + + n + m + p + v + u + + o + + pictures + + + + p + j + y + n + + i + I + + c + e + + t + + u + + r + + e + + s + + or + + + + o + + r + + conversations + + + + c + e + C + t + + o + + n + m + + v + + e + + r + + s + + a + + t + + i + + o + + n + m + + s + e + g + + in + + + + i + + n + m + + it, + + + + i + I + + t + + , + . + + ‘and + + + + + ' + + ( + ! + : + + a + e + + n + + d + g + + what + + + + w + v + + h + + a + + t + + is + + + + i + j + I + + s + + + + the + + t + c + - + r + s + e + + h + n + b + m + l + o + + e + c + é + s + a + o + + use + + + _ + . + - + / + ' + + u + i + a + w + y + + s + e + g + S + + e + c + + of + + + _ + + o + 0 + c + a + O + e + + f + t + , + + + a + + + _ + . + + a + 2 + 4 + e + d + c + + book,’ + + + + b + h + d + o + l + v + + o + + o + a + n + + k + + , + . + + + + + ? 
+ ' + " + ® + + thought + + + + t + + h + + o + + u + n + + g + s + + h + b + + t + + Alice + + + _ + + A + + l + + i + l + I + + c + e + + e + c + + ‘without + + + + + + w + v + m + + i + I + l + + t + + h + + o + + u + + t + + pictures + + + + p + y + n + b + + i + + c + e + + t + + u + + r + + e + + s + g + e + + or + + + + o + c + a + e + O + + r + , + + conversation?’ + + + + c + e + s + o + C + g + + o + + n + + v + w + y + V + + e + + r + + s + g + + a + + t + + i + I + + o + + n + m + u + + ? + + + 7 + + + + ? + ' + + +

+ +

+ + So + + S + 5 + G + s + $ + 8 + + o + a + O + e + 0 + + + she + + + _ + ' + - + . + / + + s + e + g + c + S + r + + h + b + n + m + o + l + + e + c + é + a + o + s + + was + + + ' + + w + v + m + n + + a + . + + s + + s + e + c + : + + considering + + + _ + + c + e + C + o + g + t + + o + a + O + + n + m + u + p + h + o + + s + e + c + a + n + g + + i + I + j + l + + d + + e + + r + t + , + v + + i + l + + n + m + u + h + + g + + in + + + + i + l + + n + m + o + + her + + + + h + b + + e + + r + , + t + + own + + + + o + O + a + 0 + n + u + + w + v + m + + n + m + u + p + + mind + + + + m + n + + i + + n + m + h + + d + q + 4 + g + + (as + + + + ( + { + [ + + + a + e + + s + e + g + z + x + + well + + + + w + v + m + + e + + l + + l + ] + 1 + ! + + as + + + _ + + a + e + + s + g + e + + she + + + _ + + s + g + + h + + e + + could, + + + + c + e + + o + O + + u + + l + ] + + d + + , + . + + + for + + + + f + t + + o + + r + + the + + + + t + c + r + + h + + e + + hot + + + + h + b + n + l + + o + a + e + + t + + + + day + + d + c + g + q + l + i + + a + e + g + 4 + s + 2 + + y + v + w + g + p + r + + made + + + _ + - + . + ' + / + + m + n + w + M + o + + a + e + . + + o + g + + d + c + q + a + g + 4 + + e + é + . + c + g + E + + her + + + e + + h + b + l + + e + c + + r + t + y + , + s + e + + feel + + + + f + + t + + { + , + + e + c + é + . + + e + c + é + a + g + s + + l + ] + 1 + | + . + i + + very + + + _ + + v + w + y + + e + + r + + y + + sleepy + + + + s + S + e + + l + + e + + e + a + c + o + g + p + + p + y + n + g + + y + v + w + r + + and + + + + a + e + + n + u + + d + 4 + + stupid), + + + + s + g + e + S + + t + c + r + s + e + + u + y + + p + y + n + b + + i + l + I + j + t + + d + c + + ) + ] + + , + . 
+ ; + + whether + + + + w + v + m + + h + + e + + t + + h + b + + e + + r + + the + + + + t + c + r + + h + + e + c + + pleasure + + + _ + + p + y + + l + + e + c + + a + e + + s + g + e + a + + u + + r + + e + + of + + + + o + c + + f + + making + + + + m + + a + + k + + i + I + + n + + g + + a + + + + a + + + + daisy-chain + + d + c + a + g + l + q + + a + g + s + e + 4 + i + + i + I + t + l + j + a + + s + g + z + e + a + r + + y + v + w + a + g + p + + - + ~ + + _ + . + + c + e + C + t + o + + h + b + n + m + + a + + i + l + t + a + + n + m + i + u + + would + + + + w + v + m + + o + + u + w + n + m + + l + + d + + be + + + + b + h + o + + e + é + + worth + + + + w + v + m + + o + a + + r + s + + t + r + + h + b + + the + + + + t + + h + + e + + trouble + + + + t + c + + r + + o + + u + w + v + + b + h + + l + + e + . + + of + + + + o + + f + t + + + getting + + + ' + _ + + g + z + e + s + v + ¢ + + e + + t + + t + i + r + s + + i + l + t + + n + m + + g + + up + + + + u + w + y + v + n + r + + p + y + + and + + + + a + + n + + d + + picking + + + + p + y + + i + I + l + + c + e + + k + + i + + n + + g + + the + + + + t + r + c + + h + + e + + daisies, + + + + d + q + + a + u + i + + i + I + a + + s + e + + i + I + + e + . + + s + g + e + z + + , + . + + + + when + + w + v + W + m + o + y + + h + b + l + k + n + H + + e + . 
+ c + o + a + s + + n + m + h + o + u + v + + suddenly + + + ' + / + _ + + + s + g + e + S + c + a + + u + y + w + i + a + + d + c + l + q + e + + d + c + q + g + e + + e + é + s + c + a + g + + n + m + h + u + o + a + + l + i + + y + v + w + a + + a + + + _ + + a + 2 + 4 + e + z + d + + White + + + + W + V + N + Y + M + + h + b + + i + I + + t + r + c + + e + c + s + g + é + + Rabbit + + + + R + K + F + N + B + + a + + b + h + l + p + + b + h + l + D + + i + I + j + + t + + with + + + _ + ' + + w + v + m + + i + I + + t + c + r + + h + n + + pink + + + _ + + p + + i + l + + n + m + + k + + eyes + + + + e + c + + y + v + w + + e + + s + g + e + x + r + + ran + + + + r + ' + v + + a + + n + m + y + + close + + + _ + + c + e + C + + l + + o + + s + g + + e + c + é + g + + by + + + + b + h + o + l + p + + y + v + w + + her. + + + _ + + h + b + + e + + r + t + + . + , + + +

+ +

+ + There + + T + [ + Y + l + I + t + + h + l + b + k + n + o + + e + . + c + o + s + a + + r + t + ' + v + i + - + + e + c + é + s + g + a + + was + + + _ + ' + . + + w + v + m + n + W + y + + a + e + + s + e + g + c + a + z + + nothing + + + _ + + n + m + h + i + + o + e + a + c + + t + c + r + - + + h + b + + i + l + + t + + n + o + + g + e + s + y + c + + so + + + . + _ + + s + g + e + S + c + + o + c + a + O + s + e + + VERY + + + + V + W + M + Y + + E + L + F + B + + R + + Y + + remarkable + + + + r + + e + c + + m + + a + e + + r + + k + K + + a + s + + b + p + h + o + + D + + l + i + I + + e + . + c + g + + in + + + + i + I + l + t + + n + m + + that; + + + + t + c + r + + h + b + + a + e + + t + + ; + , + : + + } + s + + nor + + + + n + m + + o + + r + + did + + + + d + + i + l + + d + + Alice + + + + A + + l + + i + l + t + I + é + . + + c + e + o + + e + + think + + + + t + + h + + i + + n + + k + + it + + + + i + + t + + so + + + + s + g + S + + o + + + + VERY + + V + Y + W + U + M + v + + E + L + I + B + F + K + + R + E + B + F + P + K + + Y + V + T + + y + ' + + much + + + _ + i + / + + m + n + w + i + a + M + + u + w + a + U + i + q + + c + e + t + + + h + b + k + l + n + + out + + + + o + O + a + 0 + + u + w + v + o + + t + r + k + + of + + + _ + / + + o + 0 + + f + + t + i + + the + + + ' + + t + c + i + + h + n + b + m + + e + c + a + + way + + + _ + + w + v + + a + + y + v + + to + + + _ + + t + c + r + + o + a + 0 + e + + hear + + + + h + b + + e + c + + a + e + + r + , + + the + + + + t + c + + h + + e + c + + Rabbit + + + _ + + R + N + K + F + + a + e + + b + + b + h + D + p + l + o + + i + I + l + a + + t + e + r + + say + + + _ + ' + / + - + + + s + e + S + g + + a + + y + v + w + + to + + + + t + f + c + + o + + itself, + + + / + _ + + i + I + + t + + s + g + e + + e + c + é + a + s + + l + ] + | + I + 1 + . + + f + t + i + £ + + , + ; + . + y + j + r + + ‘Oh + + + + + + ( + ' + + \ + + O + 0 + + h + b + + dear! 
+ + + + d + + e + + a + + r + + ! + ' + | + + Oh + + + _ + + O + Q + 0 + + h + b + l + n + + + + dear! + + d + c + q + g + a + l + + e + c + g + s + é + . + + a + e + . + , + + s + + r + t + i + n + m + v + + ! + : + ' + | + / + + + I + + + _ + / + + I + T + [ + l + | + J + + shall + + + + s + g + e + S + c + + h + b + n + + a + e + + l + i + I + + l + ] + 1 + I + i + ! + + be + + + + b + o + h + v + + e + + late!’ + + + + l + I + i + L + + a + + t + + e + + ! + l + | + ' + ) + + + + + ' + ? + + (when + + + + ( + + [ + + + w + v + + h + + e + + n + m + + she + + + + s + + h + + e + c + + thought + + + + t + + h + + o + + u + + g + + h + + t + + it + + + + i + I + j + l + + t + + over + + + + o + + v + + e + + r + + afterwards, + + + + a + e + + f + t + + t + + e + + r + + w + + a + + r + + d + + s + + , + + it + + + + i + + t + + occurred + + + + o + + c + e + + c + e + o + + u + + r + + r + + e + + d + + to + + + + t + + o + a + + + + her + + h + n + m + b + H + l + + e + . + c + o + a + s + + r + t + ' + i + y + s + + that + + + _ + ' + + t + c + r + - + f + C + + h + b + n + m + H + + a + e + o + c + + t + r + + c + i + e + + she + + + _ + . + - + + s + e + g + c + S + + h + b + n + m + + e + c + + ought + + + _ + + o + c + a + 0 + n + O + + u + v + w + n + r + y + + g + s + + h + b + n + + t + r + + to + + + _ + + t + + o + + have + + + _ + + h + b + + a + e + + v + w + y + + e + c + + wondered + + + + w + v + m + + o + + n + m + + d + q + c + + e + c + . + é + + r + v + n + + e + c + . + + d + c + + at + + + + a + + t + i + k + + this, + + + + t + f + c + r + + h + b + + i + I + + s + e + g + a + z + + , + ; + + but + + + _ + + b + d + h + o + t + l + + u + + t + r + + at + + + _ + + a + e + + t + r + + the + + + + t + + h + + e + c + + time + + + + t + + i + a + + m + n + w + + e + . 
+ + it + + + + i + I + j + + t + + all + + + + a + + l + 1 + + l + i + 1 + ] + + seemed + + + _ + + s + e + g + c + + e + c + + e + o + a + c + s + + m + n + w + + e + c + + d + + + + quite + + q + g + a + d + Q + m + + u + o + n + w + v + a + + i + I + l + a + t + é + + t + c + r + f + i + + e + c + s + é + g + + natural); + + + _ + + n + m + h + u + p + i + + a + e + + t + c + + u + o + w + + r + v + y + t + , + m + + a + e + . + , + + l + 1 + ] + + ) + } + ] + j + J + / + + ; + , + : + } + s + > + + but + + + _ + + b + d + p + + u + y + w + + t + + when + + + _ + + w + v + + h + + e + + n + + the + + + + t + c + r + + h + + e + c + + Rabbit + + + + R + + a + + b + h + + b + + i + I + + t + + actually + + + _ + + a + + c + e + + t + + u + w + v + + a + e + d + + l + + l + i + t + + y + Y + v + + TOOK + + + + T + Y + C + [ + t + + O + 0 + o + Q + © + + O + 0 + o + + K + X + + A + + + + A + L + 4 + + WATCH + + + + W + N + V + M + + A + + T + Y + I + + C + O + + H + + OUT + + + + O + Q + 0 + + U + + T + + OF + + + + O + 0 + + F + + + + ITS + + I + 1 + T + a + ! + [ + + T + t + Y + [ + f + I + + S + E + G + s + C + $ + + WAISTCOAT- + + + ' + _ + + W + V + N + M + Y + T + + A + + I + J + T + L + + S + G + E + + T + t + ' + Y + f + . + + C + G + c + O + S + T + + O + Q + 0 + + A + + T + Y + + - + _ + . + ~ + + , + + POCKET, + + + + P + F + B + + O + + C + + K + + E + + T + + , + + and + + + , + + a + + n + m + u + + d + + looked + + + + l + L + + o + + o + + k + + e + + d + + at + + + + a + + t + + it, + + + _ + + i + j + I + + t + + , + ; + + and + + + + a + + n + + d + + then + + + + t + c + r + + h + + e + + n + m + + hurried + + + + h + b + + u + + r + t + + r + t + n + + i + l + + e + c + + d + + on, + + + + o + 0 + + n + m + + , + . + + Alice + + + + A + + l + + i + + c + + e + + + + started + + s + e + c + S + g + x + + t + c + e + r + - + s + + a + e + g + o + c + . 
+ + r + t + i + c + ' + + t + - + c + v + r + = + + e + c + a + + d + l + c + + to + + + _ + ' + + t + r + c + e + + o + a + + her + + + + h + b + l + + e + + r + + feet, + + + + f + t + + e + c + + e + c + a + s + é + o + + t + r + c + + , + . + ; + + + for + + + / + _ + + f + t + T + j + + o + a + e + + r + + it + + + + i + + t + c + - + s + r + e + + flashed + + + _ + + f + t + ( + + l + i + + a + e + s + g + + s + g + S + c + e + + h + n + b + m + + e + c + + d + + across + + + + a + e + s + + c + e + + r + t + + o + a + c + + s + g + e + c + + s + e + g + c + + her + + + _ + + h + b + + e + a + + r + , + t + y + . + + mind + + + + m + + i + + n + + d + + that + + + + t + c + r + + h + + a + e + + t + r + + she + + + _ + + s + e + g + + h + b + + e + c + a + + had + + + + h + + a + e + + d + l + + never + + + _ + + n + m + u + + e + + v + w + + e + . + + r + + before + + + + b + h + + e + + f + + o + + r + + e + + + + seen + + s + e + g + S + c + x + + e + c + o + s + g + - + + e + c + o + a + s + . + + n + m + u + i + y + + a + + + _ + + a + 2 + 4 + e + z + + rabbit + + + r + + r + , + + a + e + + b + + b + h + l + + i + I + + + t + e + i + + with + + + _ + ' + - + + w + + i + I + + t + r + c + + h + n + + either + + + + e + c + é + g + s + a + + i + l + j + I + t + f + + t + c + r + s + + h + + e + + r + + a + + + + a + 2 + 4 + e + « + o + + waistcoat-pocket, + + + + w + + a + + i + I + j + l + + s + e + + t + + c + e + o + ¢ + + o + + a + + t + + - + + ~ + . + _ + + p + y + n + b + v + + o + + c + e + + k + + e + + t + + , + . + + or + + + + o + c + + r + , + + a + + + + a + 2 + 4 + e + @ + « + + watch + + + + w + v + m + + a + + t + + c + e + o + ¢ + + h + + to + + + + t + c + f + e + + o + a + + take + + + + t + + a + + k + + e + + out + + + + o + + u + + t + + of + + + + o + + f + + it, + + + + i + + t + ¢ + + , + ; + . + y + + and + + + + a + + n + + d + g + + + + burning + + b + o + h + d + v + D + + u + o + w + a + . 
+ i + + r + ' + , + i + m + y + + n + m + h + p + u + a + + i + I + l + t + + n + i + h + o + m + + g + y + . + + with + + + _ + / + ' + + w + v + W + m + + i + I + l + + t + c + + h + b + n + + curiosity, + + + _ + + c + e + C + o + + u + o + w + + r + t + y + , + s + . + + i + j + I + t + l + + o + c + a + n + + s + c + + i + + t + + y + + , + . + ; + + she + + + + s + g + + h + b + + e + c + + ran + + + + r + t + , + v + i + + a + e + o + + n + m + v + u + p + + across + + + _ + / + + a + e + s + + c + e + + r + y + + o + + s + g + e + + s + g + + the + + + + t + c + f + r + + h + + e + + field + + + + f + t + + i + a + + e + c + a + + l + + d + + after + + + + a + s + + f + + t + + e + + r + + it, + + + + i + I + t + l + : + + t + + , + ; + . + + and + + + + a + + n + + d + + fortunately + + + + f + + o + + r + t + + t + + u + y + w + + n + m + i + u + + a + e + + t + r + + e + + l + + y + v + w + r + + was + + + _ + + w + + a + + s + g + : + e + + + + just + + j + J + g + y + f + l + + u + w + y + n + a + t + + s + e + S + c + g + a + + t + c + r + e + . + « + + in + + + _ + ' + / + . + - + + i + a + u + I + + n + m + u + i + o + + time + + + n + _ + + t + c + C + r + f + + i + l + m + t + + m + n + w + a + + e + . + s + a + g + c + + to + + + + t + f + c + C + e + b + + o + O + a + 0 + e + + see + + + _ + ' + + s + S + + e + c + é + + e + c + é + g + s + ¢ + + it + + + _ + + i + l + I + t + + . + + t + f + i + + pop + + + + p + + o + a + e + n + + p + y + n + m + o + g + + down + + + + d + c + a + g + e + G + + o + e + + w + m + v + + n + m + + a + + + _ + . + + a + + large + + + + l + + a + + r + v + + g + + e + + rabbit-hole + + + + r + + a + e + + b + h + + b + h + + i + I + + t + + - + ~ + + h + b + n + + o + + l + + e + c + + under + + + + u + y + w + n + + n + m + u + + d + q + c + + e + + r + + the + + + + t + c + r + + h + + e + + hedge. + + + _ + + h + + e + + d + + g + z + + e + g + c + é + s + . + + . + , + - + + +

+ +

+ + In + + I + i + l + T + + 1 + + n + m + h + p + N + i + + another + + + _ + . + + a + e + z + g + s + d + + n + m + i + y + + o + c + e + a + s + 0 + + t + c + r + - + + h + n + + e + c + + r + , + t + . + c + y + + moment + + + + m + n + M + w + + o + a + e + O + + m + + e + + n + m + + t + r + k + s + + down + + + _ + + d + c + l + + o + + w + m + v + + n + m + h + + went + + + + w + v + m + + W + q + + e + . + + n + m + h + u + + t + + Alice + + + _ + + A + S + + l + + i + + c + + e + . + + after + + + + a + + f + t + { + + t + + e + + r + y + + it, + + + + i + l + I + + t + + , + ; + + never + + + + n + m + + e + + v + y + + e + + r + , + y + t + + once + + + + o + + n + + c + e + + e + c + + considering + + + + c + e + C + o + s + + o + + n + m + u + p + + s + e + + i + I + + d + + e + c + + r + + i + + n + + g + + how + + + _ + + h + + o + + w + v + + + + in + + i + l + I + + t + 1 + + n + m + u + h + o + i + + the + + + _ + ' + . + + t + c + r + - + i + s + + h + n + b + a + + e + c + . + é + o + + world + + + _ + + w + v + m + W + n + + o + a + + r + m + i + t + n + + + l + ] + + d + + she + + + + s + e + g + + h + b + + e + + was + + + + w + v + m + + a + + g + + s + c + g + + to + + + + t + + o + O + 0 + + get + + + + g + s + + e + + t + f + r + + out + + + + o + O + 0 + a + c + e + + u + + t + r + + again. + + + + a + e + + g + + a + g + + i + I + + n + m + p + h + + . + + +

+ +

+ + The + + T + I + l + [ + Y + t + + h + b + n + l + o + a + + e + c + é + a + o + s + + rabbit-hole + + + _ + / + - + ' + . + + r + t + v + i + , + m + + a + e + . + o + + b + h + o + d + l + v + + b + h + l + D + p + o + + i + I + + t + r + c + e + f + + - + ~ + = + + . + + + h + b + + o + e + c + a + 0 + + l + i + + e + c + s + + went + + + _ + + w + v + m + + e + + n + m + h + + t + r + k + c + + straight + + + _ + . + - + + s + e + + t + c + r + + r + v + i + + a + e + + i + l + t + I + . + + + g + z + s + + h + n + b + i + + t + r + + e + + on + + + _ + . + + o + 0 + + n + m + + like + + + + l + L + I + + i + + k + + e + c + + a + + + + a + e + o + 2 + « + 4 + + tunnel + + + + t + r + c + + u + + n + m + u + p + o + i + + n + m + + e + a + + l + ] + | + ) + + for + + + + f + + o + a + e + + r + , + + some + + + + s + g + e + c + S + + o + a + + m + w + + e + c + + way, + + + + w + + a + + y + + , + + and + + + + a + e + + n + m + + d + + then + + + + t + + h + + e + + n + m + + + + dipped + + d + c + q + a + e + g + + i + l + + t + I + ' + + p + y + n + g + b + o + + p + y + n + o + b + g + + e + c + a + o + . + + d + c + U + g + l + + suddenly + + + _ + - + + s + g + e + c + S + r + + u + y + w + n + + d + l + g + + d + c + e + + e + a + c + + n + m + h + u + i + + l + + y + v + w + p + + down, + + + + d + c + e + + o + + w + v + m + u + n + + n + m + h + y + u + » + + , + + y + . + + so + + + _ + + s + g + S + e + + o + 0 + a + + suddenly + + + + s + g + e + S + + u + i + y + + d + c + l + + d + c + q + + e + é + + n + m + h + + l + + y + v + w + + that + + + + t + c + + h + b + + a + e + + t + r + + Alice + + + _ + + A + + l + + i + l + t + + c + e + t + + e + c + g + s + + had + + + _ + + h + b + n + m + H + + a + e + + d + g + c + q + 0 + + not + + + + n + + o + c + e + a + + t + - + s + + a + + + _ + + a + e + 2 + s + d + + moment + + + + m + + o + a + + m + w + + e + . 
+ + n + m + + t + + to + + + _ + + t + f + c + + o + + think + + + + t + c + + h + + i + l + + n + + k + + + + about + + a + e + g + d + s + 2 + + b + o + h + l + d + v + + o + e + a + c + u + O + + u + w + n + v + m + . + + t + r + c + s + + stopping + + + _ + ' + - + . + / + + s + g + e + S + + t + r + c + + o + a + e + c + + p + n + y + m + o + + p + n + y + g + + i + I + l + j + : + + n + + g + + herself + + + + h + b + + e + . + + r + n + v + y + + s + e + g + x + : + + e + c + é + g + + l + | + I + ] + i + ) + + f + t + + + before + + + _ + + b + h + d + l + t + v + + e + c + s + + f + t + + o + a + e + c + + r + ' + + e + c + + she + + + + s + g + e + c + + h + b + + e + c + + found + + + _ + + f + t + + o + a + + u + w + + n + m + + d + q + c + + herself + + + + h + + e + + r + + s + e + + e + + l + I + ) + i + + f + t + i + + falling + + + ' + + f + t + + a + + l + + l + i + + i + + n + u + + g + e + ¢ + s + c + z + + down + + + + d + c + G + + o + a + e + + w + v + m + + n + m + u + + a + + + + a + + very + + + + v + w + y + V + + e + + r + + y + v + w + + deep + + + + d + c + + e + c + + e + o + c + + p + n + y + j + + well. + + + + w + v + + e + + l + i + + l + + . + , + : + + + +

+ +

+ + Either + + E + K + H + F + R + B + + i + l + u + j + I + a + + t + c + r + + h + + e + + r + ' + t + + the + + + _ + + t + c + s + r + e + - + + h + b + + e + c + + well + + + _ + + w + v + m + W + + + e + . + + l + i + + l + ! + ] + 1 + I + | + + was + + + _ + + w + v + m + + a + + s + + very + + + + v + w + y + + e + + r + t + + y + v + w + + deep, + + + + d + c + + e + c + + e + g + . + + p + n + g + + , + . + + or + + + + o + + r + + she + + + + s + g + e + + h + + e + + fell + + + + f + + e + é + + l + 1 + + l + i + ! + 1 + + very + + + _ + + v + y + w + + e + + r + + y + v + + slowly, + + + + s + + l + + o + a + + w + + l + | + + y + v + w + + , + . + + for + + + + f + t + + o + + r + , + t + + she + + + + s + g + + h + + e + c + + had + + + + h + b + n + + a + + d + + plenty + + + + p + y + n + j + + l + i + + e + a + + n + m + + t + + y + + + + of + + o + c + e + O + a + 0 + + f + t + , + o + + + + time + + + _ + . + t + ' + , + + t + r + l + i + f + T + + i + l + t + m + + r + + m + w + n + a + i + + e + c + a + s + é + g + + as + + + _ + . + - + / + + a + e + 4 + g + s + . + + s + e + g + c + z + r + + she + + + s + _ + x + + s + g + e + c + S + a + + h + b + n + + e + c + + went + + + _ + . + + w + v + m + W + + e + c + + n + m + h + i + + t + r + i + s + e + c + + down + + + _ + . + : + + d + c + a + e + l + g + + o + a + + w + m + v + + n + m + h + y + » + o + + to + + + + t + f + r + + o + 0 + a + + look + + + + l + + o + c + a + e + + o + c + + k + g + + about + + + + a + e + g + s + . + + b + h + l + + o + + u + + t + r + + her + + + + h + b + l + n + m + + e + + r + , + + and + + + + a + e + 2 + + n + + d + + to + + + + t + c + - + + o + a + + wonder + + + + w + v + + o + + n + m + + d + + e + + r + + what + + + + w + v + + h + + a + e + + t + r + + was + + + _ + ' + + w + v + m + + a + + s + e + : + + going + + + _ + + g + s + z + e + ¢ + . 
+ + o + c + e + + i + l + + n + m + + g + + + + to + + t + c + + + ¢ + i + - + + o + a + e + c + 0 + s + + happen + + + _ + . + , + + h + n + b + m + H + l + + a + e + o + c + g + + p + y + n + a + g + m + + p + y + n + o + w + + e + + n + m + h + u + o + p + + next. + + + _ + + n + m + h + p + y + v + + e + + x + z + r + n + + t + r + f + + . + , + - + + : + _ + + First, + + + _ + / + + F + P + K + E + T + + i + I + u + + r + + s + e + + t + c + r + e + s + + , + ; + . + + she + + + + s + g + e + c + S + + h + n + b + + e + + tried + + + + t + c + + r + + i + l + + e + c + + d + l + + to + + + + t + - + r + + o + a + e + c + 0 + g + + look + + + + l + + o + e + c + O + a + 0 + + o + c + a + O + + k + t + + down + + + _ + + d + c + a + g + G + J + + o + c + e + a + O + 0 + + w + + n + m + h + + and + + + + a + e + + n + + d + + make + + + + m + + a + + k + + e + + out + + + + o + + u + v + w + + t + r + + what + + + _ + + w + v + m + + h + + a + e + + t + + she + + + + s + g + + h + + e + + was + + + + w + v + + a + + s + g + + + + coming + + c + C + e + t + ¢ + . + + o + a + e + u + O + n + + m + n + M + w + g + a + + i + I + l + u + a + + + n + m + h + u + p + o + + g + y + d + + to, + + + _ + g + + t + f + c + i + + o + 0 + a + O + e + + , + . + ; + y + + but + + + _ + . 
+ + b + d + h + o + t + l + + u + + t + r + i + f + + it + + + / + _ + + i + l + I + + t + + was + + + _ + + w + v + + a + + s + g + e + + too + + + + t + c + + o + e + 0 + + o + a + e + + dark + + + _ + + d + c + g + e + + a + + r + ' + + k + + to + + + + t + + o + 0 + a + + see + + + + s + g + e + S + c + + e + c + + e + c + + anything; + + + + a + + n + + y + v + w + + t + c + r + + h + + i + I + + n + m + + g + + ; + : + , + + s + + then + + + _ + + t + c + r + + h + + e + + n + + she + + + + s + e + + h + b + n + + e + + looked + + + + l + L + + o + + o + a + + k + + e + + d + + at + + + + a + + t + r + k + + the + + + _ + + t + r + c + + h + b + + e + + sides + + + + s + + i + l + + d + + e + é + + s + g + + + + of + + o + c + e + a + 0 + O + + f + t + + i + { + £ + + the + + + _ + . + ' + - + / + + t + c + r + s + e + - + + h + n + b + m + + e + c + a + + well, + + + _ + - + + w + v + W + + m + n + + e + . + s + + l + I + + i + 1 + + l + 1 + I + ] + ) + . + + , + ; + . + + and + + + _ + + a + e + + n + u + m + o + i + v + + d + g + 0 + + noticed + + + _ + + n + m + h + u + p + v + + o + a + + t + r + k + c + f + + i + + c + e + o + s + t + + e + c + a + + d + c + g + l + + that + + + + t + r + c + + h + b + n + + a + e + + t + r + e + + they + + + _ + ' + + t + c + s + r + - + e + + h + + e + c + + y + v + + were + + + _ + + w + v + + e + + r + y + + e + c + + filled + + + + f + + i + u + l + a + + l + + l + i + + e + a + + d + l + @ + g + + with + + + + w + v + m + + i + + t + s + + h + n + + cupboards + + + + c + e + + u + + p + y + n + m + g + + b + h + + o + + a + + r + + d + + s + g + e + + and + + + + a + . + + n + u + m + v + + d + . + + book- + + + _ + / + + b + h + d + + o + + o + + k + + - + . + ~ + + + + + shelves; + + s + S + e + g + c + a + + h + b + n + s + m + o + + e + a + . + é + c + o + + l + | + ] + + I + _ + + v + w + y + u + c + r + + e + . 
+ g + c + + s + g + e + x + r + z + + ; + : + , + + s + } + + here + + + _ + + h + n + m + b + l + o + + e + c + + r + + e + c + s + + and + + + + a + e + + n + m + + d + q + + there + + + + t + c + + h + + e + + r + + e + c + + she + + + _ + + s + g + + h + b + n + + e + c + + saw + + + _ + + s + g + e + + a + e + o + + w + v + m + n + u + + maps + + + + m + n + w + M + + a + g + + p + n + g + o + h + y + + s + g + e + z + c + + and + + + + a + e + + n + m + + d + q + + pictures + + + + p + y + n + + i + I + + c + e + + t + + u + w + + r + + e + + s + g + e + + hung + + + _ + + h + b + n + + u + + n + m + + g + + upon + + + + u + w + y + m + i + a + + p + y + + o + 0 + + n + m + + pegs. + + + + p + y + + e + . + + g + + s + g + e + z + : + + . + + She + + + + S + + h + + e + + took + + + + t + + o + + o + + k + + + + down + + d + c + q + g + l + a + + o + + e + a + c + O + + w + v + m + + n + W + + n + m + h + y + » + o + + a + + + _ + . + + a + e + o + « + 4 + @ + + jar + + + + j + J + f + i + y + l + + a + + r + t + v + y + i + x + + from + + + + f + t + i + + r + f + t + l + i + + o + a + e + u + n + + m + n + + one + + + _ + + o + O + + n + m + u + h + p + + e + . + g + é + + of + + + + o + O + + f + + + the + + + ' + + t + c + r + + h + b + n + + e + + shelves + + + + s + g + e + S + + h + b + + e + + l + + + v + w + y + r + + e + . + g + + s + g + e + r + . + + as + + + _ + / + + a + s + + s + g + e + z + + she + + + + s + g + S + e + c + + h + n + b + + e + + passed; + + + + p + a + y + + a + . 
+ + s + g + a + e + + s + g + e + + e + + d + q + + ; + + it + + + + i + t + + t + f + s + + was + + + _ + + w + v + + a + + s + g + e + z + : + + labelled + + + s + + l + i + + a + + b + d + + e + a + + l + + l + i + + e + a + o + + d + l + g + + ‘ORANGE + + + + + ( + ' + \ + ; + { + + O + Q + + R + + A + + N + + G + C + + E + + + + MARMALADE’, + + M + O + D + N + I + V + + A + S + E + a + O + I + + R + E + B + K + A + k + + M + N + O + I + W + + A + S + C + O + G + E + + L + I + E + l + U + . + + A + S + O + I + + G + + D + P + N + V + H + U + + E + F + P + B + R + K + + + ' + ; + , + + , + + but + + + + b + o + h + v + d + + u + + t + + to + + + + t + + o + + her + + + + h + b + + e + + r + y + + great + + + + g + s + z + + r + + e + + a + + t + r + + disappointment + + + _ + + d + c + + i + + s + g + e + + a + e + + p + y + n + a + + p + + o + a + + i + l + t + + n + + t + i + r + : + - + + m + + e + + n + m + + t + + it + + + _ + + i + + t + + was + + + _ + + w + v + + a + + s + g + e + + empty: + + + + e + + m + + p + + t + + y + v + + : + ; + + she + + + + s + + h + + e + + did + + + + d + + i + + d + + not + + + + n + m + r + p + u + + o + a + + t + k + + + + like + + l + i + I + f + t + L + + i + l + t + ' + I + k + + k + x + c + e + t + s + + e + c + a + é + s + g + + to + + + _ + ' + . + + t + w + c + - + r + i + + o + a + e + 0 + c + g + + drop + + + _ + . + - + ' + + d + c + e + g + l + + r + t + v + + o + a + u + e + n + c + + p + n + g + y + + the + + + ' + + t + c + r + e + s + f + + h + b + n + + e + . + a + + jar + + + + j + J + f + s + i + y + + a + + r + y + , + + for + + + + f + t + + o + + r + + fear + + + + f + t + + e + a + c + + a + e + . + + r + y + , + x + i + + of + + + _ + - + + o + a + 0 + c + + f + t + + + + killing + + + _ + ' + + k + K + x + i + + i + I + + l + I + . + f + + l + i + f + + i + l + + n + u + r + + g + s + e + ¢ + , + : + + somebody, + + + . 
+ + s + e + c + g + + o + a + e + + m + w + n + + e + c + a + o + + b + h + o + + o + a + + d + + y + v + + , + . + + so + + + + s + + o + c + + managed + + + _ + + m + + a + + n + m + h + p + + a + g + + g + + e + + d + + to + + + + t + f + + o + 0 + O + + put + + + + p + + u + + t + + it + + + + i + + t + ¢ + f + + into + + + _ + / + ' + + i + + n + m + + t + + o + + + + one + + o + a + e + 0 + c + O + + n + m + v + p + r + i + + e + é + . + g + - + c + + of + + + _ + . + / + o + - + + o + 0 + a + O + e + + f + t + + , + i + + the + + + _ + + t + c + r + + h + n + + e + c + + a + + cupboards + + + _ + + c + C + e + t + o + + u + y + w + v + n + + p + y + n + b + + b + h + o + + o + a + + a + + + r + t + v + i + e + + + d + g + q + u + c + + s + e + + as + + + + a + + s + g + z + e + c + x + + she + + + . + + s + g + e + + h + b + + e + g + + fell + + + + f + + + e + c + é + g + . + + l + I + + l + ] + 1 + ! + I + + + past + + + _ + . + + p + y + + a + . + + s + e + a + c + + t + e + + it. + + + + i + I + + t + k + + . + , + - + + +

+ +

+ + ‘Well!’ + + + + " + ' + * + + + W + V + w + N + + M + + e + é + . + E + a + r + + l + i + I + 1 + M + ( + + l + i + d + ! + t + f + + ! + l + t + i + ' + + + + + ? + ' + " + * + + thought + + + . + _ + : + , + + t + c + r + - + e + s + + h + b + n + + o + a + + u + w + v + n + r + y + + g + + h + + t + r + i + e + + Alice + + + _ + + A + + l + _ + + i + l + é + t + I + s + + c + e + + e + c + + to + + + + t + c + r + f + e + + o + a + e + + herself, + + + _ + + h + + e + + r + + s + e + c + + e + . + + l + i + + f + t + + , + . + ; + + ‘after + + + ' + + + ' + S + + + \ + + a + + f + t + + t + i + + e + c + + r + , + y + + such + + + + s + e + + u + + c + e + + h + n + b + + a + + + + a + + fall + + + + f + t + i + + + a + e + d + + l + i + + l + ] + 1 + I + i + | + + as + + + + a + + s + g + e + + this, + + + + t + c + r + + h + + i + + s + g + + , + + I + + + + I + + shall + + + + s + g + + h + b + n + + a + + l + + l + ] + 1 + I + | + ! + + think + + + + t + + h + + i + + n + + k + + + + nothing + + n + m + h + N + p + i + + o + e + a + c + 0 + O + + t + c + - + r + e + v + + h + b + n + l + + i + l + + t + I + + n + o + i + + g + e + s + + of + + + _ + - + + o + 0 + + f + t + + i + , + + tumbling + + + _ + + t + r + - + f + + u + y + + m + n + w + o + a + + b + h + D + l + o + + l + i + + i + l + + n + m + h + g + + g + e + ¢ + s + c + , + + down + + + + d + c + g + e + a + l + + o + a + + w + + n + m + h + » + o + + stairs! + + + . + + s + e + + t + k + c + s + + a + g + s + d + z + . + + i + I + a + + r + ' + + s + g + e + c + + ! + | + ' + l + : + } + + How + + + _ + + H + + o + + w + + brave + + + + b + h + p + D + o + d + + r + + a + + v + w + y + + e + + they’ll + + + + t + c + r + e + + h + + e + + y + v + + + ' + + " + 7 + + l + 1 + | + ! + I + ] + + l + ] + + all + + + + a + e + + l + ] + + l + + think + + + + t + + h + + i + + n + + k + + me + + + + m + n + + e + é + + at + + + _ + + a + + t + + home! + + + _ + + h + b + + o + + m + n + + e + + ! 
+ ' + l + + + + Why, + + W + V + N + M + w + U + + h + b + l + k + n + o + + y + r + i + v + j + ) + + , + . + + + I + + + / + + I + | + [ + T + + l + + wouldn’t + + + ' + _ + + w + v + m + + o + + u + w + n + a + m + r + + l + + d + + n + h + m + y + o + u + + + ' + + " + ? + + t + e + + say + + + + s + g + e + + a + + y + v + + anything + + + + a + e + + n + m + + y + v + w + + t + + h + b + + i + + n + + g + + about + + + + a + e + + b + h + o + + o + + u + w + + t + + it, + + + + i + I + j + l + + t + + , + . + + even + + + + e + + v + w + + e + a + + n + m + + if + + + + i + I + + f + t + + I + + + + I + l + [ + 1 + | + T + + fell + + + + f + t + + e + + l + + l + + off + + + + o + 0 + a + + f + t + + f + t + + + the + + + + t + + h + + e + + top + + + + t + + o + + p + y + + of + + + + o + + f + t + + + the + + + + t + + h + + e + + house!’ + + + + h + b + n + + o + + u + + s + e + + e + + ! + l + | + + } + ' + + + + + +

+
+
+ + diff --git a/test/testdata/tess_choices_charboxes.hocr b/test/testdata/tess_choices_charboxes.hocr new file mode 100644 index 0000000..e34864e --- /dev/null +++ b/test/testdata/tess_choices_charboxes.hocr @@ -0,0 +1,13146 @@ + + + + + + + + + + +
+
+

+ + + 1 + + 1 + l + I + ] + i + | + + + + D + + D + P + b + M + O + W + + o + + o + + w + + w + v + + n + + n + m + h + » + + + + t + + t + + h + + h + b + + e + + e + + + + R + + R + + a + + a + + b + + b + h + + b + + b + h + + i + + i + + t + + t + + - + + - + + + H + + H + + o + + o + + l + + l + + e + + e + + + +

+
+
+

+ + + A + + A + M + h + d + l + ( + + l + + l + _ + ] + i + I + + i + + i + l + t + I + j + a + + c + + c + e + o + t + C + s + + e + + e + c + g + s + é + a + + + + w + + w + v + m + + a + + a + e + + s + + s + e + g + a + : + c + + + + b + + b + h + o + D + d + l + + e + + e + + g + + g + s + y + + i + + i + I + t + l + a + + n + + n + m + h + o + u + + n + + n + m + h + u + o + p + + i + + i + I + l + + n + + n + + g + + g + + + + t + + t + c + f + e + + o + + o + 0 + a + O + + + + g + + g + + e + + e + é + c + + t + + t + f + r + + + + v + + v + w + y + - + c + + e + + e + . + + r + + r + + y + + y + v + w + g + + + + t + + t + + i + + i + + r + + r + v + + e + + e + + d + + d + + + + o + + o + + f + + f + + + + s + + s + e + g + + i + + i + + t + + t + + t + + t + i + s + b + ¢ + r + + i + + i + + n + + n + + g + + g + e + s + , + z + ¢ + + + + b + + b + + y + + y + + + + h + + h + b + + e + + e + + r + + r + y + + + + s + + s + + i + + i + l + + s + + s + e + g + + t + + t + + e + + e + + r + + r + , + t + y + . + + + + o + + o + + n + + n + m + + + + t + + t + c + + h + + h + + e + + e + c + + + + b + + b + + a + + a + + n + + n + m + u + + k + + k + + , + + , + . + + + + + + a + + a + g + 4 + 2 + . + e + + n + + n + m + u + p + i + + d + + d + q + 4 + a + c + g + + + + o + + o + 0 + e + a + O + + f + + f + t + i + + + + h + + h + n + b + m + l + + a + + a + e + + v + + v + w + - + y + c + r + + i + + i + l + + n + + n + o + u + m + i + + g + + g + d + + + + n + + n + m + p + h + u + y + + o + + o + a + e + c + + t + + t + c + r + f + + h + + h + b + l + n + + i + + i + I + + n + + n + m + + g + + g + + + + t + + t + f + c + + o + + o + 0 + a + e + c + + + + d + + d + l + c + + o + + o + a + 0 + n + u + O + + : + + : + ; + . + - + s + + + + o + + o + a + + n + + n + m + u + v + + c + + c + e + ¢ + t + + e + + e + . 
+ c + s + + + + o + + o + a + + r + + r + ' + + + + t + + t + + w + + w + v + + i + + i + I + l + + c + + c + e + C + + e + + e + c + + + + s + + s + g + + h + + h + + e + + e + c + + + + h + + h + b + n + m + + a + + a + + d + + d + c + + + + p + + p + y + e + n + + e + + e + . + + e + + e + a + c + + p + + p + n + y + j + g + v + + e + + e + + d + + d + + + + i + + i + + n + + n + m + u + h + + t + + t + + o + + o + + + + t + + t + + h + + h + + e + + e + + + + b + + b + d + h + + o + + o + + o + + o + + k + + k + + + + h + + h + + e + + e + + r + + r + + + + + + s + + s + S + e + g + c + a + + i + + i + I + l + t + 1 + + s + + s + g + c + e + t + z + + t + + t + c + v + i + w + + e + + e + c + a + + r + + r + . + t + + + + w + + w + v + m + + a + + a + e + + s + + s + e + g + + + + r + + r + v + t + f + , + m + + e + + e + + a + + a + e + g + z + + d + + d + c + q + l + + i + + i + I + + n + + n + m + u + h + + g + + g + + , + + , + . + + + + b + + b + o + h + D + + u + + u + + t + + t + r + + + + i + + i + l + I + t + + t + + t + c + ¢ + s + e + - + + + + h + + h + n + b + H + + a + + a + e + + d + + d + c + + + + n + + n + m + p + v + u + + o + + o + + + + p + + p + j + y + n + + i + + i + I + + c + + c + e + + t + + t + + u + + u + + r + + r + + e + + e + + s + + s + + + + o + + o + + r + + r + + + + c + + c + e + C + t + + o + + o + + n + + n + m + + v + + v + + e + + e + + r + + r + + s + + s + + a + + a + + t + + t + + i + + i + + o + + o + + n + + n + m + + s + + s + e + g + + + + i + + i + + n + + n + m + + + + i + + i + I + + t + + t + + , + + , + . + + + + + + + ' + + ( + ! 
+ : + + a + + a + e + + n + + n + + d + + d + g + + + + w + + w + v + + h + + h + + a + + a + + t + + t + + + + i + + i + j + I + + s + + s + + + + + + t + + t + c + - + r + s + e + + h + + h + n + b + m + l + o + + e + + e + c + é + s + a + o + + + + u + + u + i + a + w + y + + s + + s + e + g + S + + e + + e + c + + + + o + + o + 0 + c + a + O + e + + f + + f + t + , + + + + a + + a + 2 + 4 + e + d + c + + + + b + + b + h + d + o + l + v + + o + + o + + o + + o + a + n + + k + + k + + , + + , + . + + + + + + ? + ' + " + ® + + + + t + + t + + h + + h + + o + + o + + u + + u + n + + g + + g + s + + h + + h + b + + t + + t + + + + A + + A + + l + + l + + i + + i + l + I + + c + + c + e + + e + + e + c + + + + + + + + w + + w + v + m + + i + + i + I + l + + t + + t + + h + + h + + o + + o + + u + + u + + t + + t + + + + p + + p + y + n + b + + i + + i + + c + + c + e + + t + + t + + u + + u + + r + + r + + e + + e + + s + + s + g + e + + + + o + + o + c + a + e + O + + r + + r + , + + + + c + + c + e + s + o + C + g + + o + + o + + n + + n + + v + + v + w + y + V + + e + + e + + r + + r + + s + + s + g + + a + + a + + t + + t + + i + + i + I + + o + + o + + n + + n + m + u + + ? + + ? + + + 7 + + + + + + ? + ' + + + +

+ +

+ + + S + + S + 5 + G + s + $ + 8 + + o + + o + a + O + e + 0 + + + + s + + s + e + g + c + S + r + + h + + h + b + n + m + o + l + + e + + e + c + é + a + o + s + + + + w + + w + v + m + n + + a + + a + . + s + + s + + s + e + c + : + + + + c + + c + e + C + o + g + t + + o + + o + a + O + + n + + n + m + u + p + h + o + + s + + s + e + c + a + n + g + + i + + i + I + j + l + + d + + d + + e + + e + + r + + r + t + , + v + + i + + i + l + + n + + n + m + u + h + + g + + g + + + + i + + i + l + + n + + n + m + o + + + + h + + h + b + + e + + e + + r + + r + , + t + + + + o + + o + O + a + 0 + n + u + + w + + w + v + m + + n + + n + m + u + p + + + + m + + m + n + + i + + i + + n + + n + m + h + + d + + d + q + 4 + g + + + + ( + + ( + { + [ + + a + + a + e + + s + + s + e + g + z + x + + + + w + + w + v + m + + e + + e + + l + + l + + l + + l + ] + 1 + ! + + + + a + + a + e + + s + + s + g + e + + + + s + + s + g + + h + + h + + e + + e + + + + c + + c + e + + o + + o + O + + u + + u + + l + + l + ] + + d + + d + + , + + , + . + + + + f + + f + t + + o + + o + + r + + r + + + + t + + t + c + r + + h + + h + + e + + e + + + + h + + h + b + n + l + + o + + o + a + e + + t + + t + + + + + + d + + d + c + g + q + l + i + + a + + a + e + g + 4 + s + 2 + + y + + y + v + w + g + p + r + + + + m + + m + n + w + M + o + + a + + a + e + . + o + g + + d + + d + c + q + a + g + 4 + + e + + e + é + . + c + g + E + + + + h + + h + b + l + + e + + e + c + + r + + r + t + y + , + s + e + + + + f + + f + t + + { + , + + e + + e + c + é + . + + e + + e + c + é + a + g + s + + l + + l + ] + 1 + | + . 
+ i + + + + v + + v + w + y + + e + + e + + r + + r + + y + + y + + + + s + + s + S + e + + l + + l + + e + + e + + e + + e + a + c + o + g + p + + p + + p + y + n + g + + y + + y + v + w + r + + + + a + + a + e + + n + + n + u + + d + + d + 4 + + + + s + + s + g + e + S + + t + + t + c + r + s + e + + u + + u + y + + p + + p + y + n + b + + i + + i + l + I + j + t + + d + + d + c + + ) + + ) + ] + + , + + , + . + ; + + + + w + + w + v + m + + h + + h + + e + + e + + t + + t + + h + + h + b + + e + + e + + r + + r + + + + t + + t + c + r + + h + + h + + e + + e + c + + + + p + + p + y + + l + + l + + e + + e + c + + a + + a + e + + s + + s + g + e + a + + u + + u + + r + + r + + e + + e + + + + o + + o + c + + f + + f + + + + m + + m + + a + + a + + k + + k + + i + + i + I + + n + + n + + g + + g + + + + a + + a + + + + + + d + + d + c + a + g + l + q + + a + + a + g + s + e + 4 + i + + i + + i + I + t + l + j + a + + s + + s + g + z + e + a + r + + y + + y + v + w + a + g + p + + - + + - + ~ + + _ + . + + c + + c + e + C + t + o + + h + + h + b + n + m + + a + + a + + i + + i + l + t + a + + n + + n + m + i + u + + + + w + + w + v + m + + o + + o + + u + + u + w + n + m + + l + + l + + d + + d + + + + b + + b + h + o + + e + + e + é + + + + w + + w + v + m + + o + + o + a + + r + + r + s + + t + + t + r + + h + + h + b + + + + t + + t + + h + + h + + e + + e + + + + t + + t + c + + r + + r + + o + + o + + u + + u + w + v + + b + + b + h + + l + + l + + e + + e + . 
+ + + + o + + o + + f + + f + t + + + + g + + g + z + e + s + v + ¢ + + e + + e + + t + + t + + t + + t + i + r + s + + i + + i + l + t + + n + + n + m + + g + + g + + + + u + + u + w + y + v + n + r + + p + + p + y + + + + a + + a + + n + + n + + d + + d + + + + p + + p + y + + i + + i + I + l + + c + + c + e + + k + + k + + i + + i + + n + + n + + g + + g + + + + t + + t + r + c + + h + + h + + e + + e + + + + d + + d + q + + a + + a + u + i + + i + + i + I + a + + s + + s + e + + i + + i + I + + e + + e + . + + s + + s + g + e + z + + , + + , + . + + + + + + w + + w + v + W + m + o + y + + h + + h + b + l + k + n + H + + e + + e + . + c + o + a + s + + n + + n + m + h + o + u + v + + + + s + + s + g + e + S + c + a + + u + + u + y + w + i + a + + d + + d + c + l + q + e + + d + + d + c + q + g + e + + e + + e + é + s + c + a + g + + n + + n + m + h + u + o + a + + l + + l + i + + y + + y + v + w + a + + + + a + + a + 2 + 4 + e + z + d + + + + W + + W + V + N + Y + M + + h + + h + b + + i + + i + I + + t + + t + r + c + + e + + e + c + s + g + é + + + + R + + R + K + F + N + B + + a + + a + + b + + b + h + l + p + + b + + b + h + l + D + + i + + i + I + j + + t + + t + + + + w + + w + v + m + + i + + i + I + + t + + t + c + r + + h + + h + n + + + + p + + p + + i + + i + l + + n + + n + m + + k + + k + + + + e + + e + c + + y + + y + v + w + + e + + e + + s + + s + g + e + x + r + + + + r + + r + ' + v + + a + + a + + n + + n + m + y + + + + c + + c + e + C + + l + + l + + o + + o + + s + + s + g + + e + + e + c + é + g + + + + b + + b + h + o + l + p + + y + + y + v + w + + + + h + + h + b + + e + + e + + r + + r + t + + . + + . + , + + + +

+ +

+ + + T + + T + [ + Y + l + I + t + + h + + h + l + b + k + n + o + + e + + e + . + c + o + s + a + + r + + r + t + ' + v + i + - + + e + + e + c + é + s + g + a + + + + w + + w + v + m + n + W + y + + a + + a + e + + s + + s + e + g + c + a + z + + + + n + + n + m + h + i + + o + + o + e + a + c + + t + + t + c + r + - + + h + + h + b + + i + + i + l + t + + n + + n + o + + g + + g + e + s + y + c + + + + s + + s + g + e + S + c + + o + + o + c + a + O + s + e + + + + V + + V + W + M + Y + + E + + E + L + F + B + + R + + R + + Y + + Y + + + + r + + r + + e + + e + c + + m + + m + + a + + a + e + + r + + r + + k + + k + K + + a + + a + s + + b + + b + p + h + o + D + + l + + l + i + I + + e + + e + . + c + g + + + + i + + i + I + l + t + + n + + n + m + + + + t + + t + c + r + + h + + h + b + + a + + a + e + + t + + t + + ; + + ; + , + : + } + s + + + + n + + n + m + + o + + o + + r + + r + + + + d + + d + + i + + i + l + + d + + d + + + + A + + A + + l + + l + + i + + i + l + t + I + é + . 
+ + c + + c + e + o + + e + + e + + + + t + + t + + h + + h + + i + + i + + n + + n + + k + + k + + + + i + + i + + t + + t + + + + s + + s + g + S + + o + + o + + + + + + V + + V + Y + W + U + M + v + + E + + E + L + I + B + F + K + + R + + R + E + B + F + P + K + + Y + + Y + V + T + y + ' + + + + m + + m + n + w + i + a + M + + u + + u + w + a + U + i + q + + c + + c + e + t + + h + + h + b + k + l + n + + + + o + + o + O + a + 0 + + u + + u + w + v + o + + t + + t + r + k + + + + o + + o + 0 + + f + + f + t + i + + + + t + + t + c + i + + h + + h + n + b + m + + e + + e + c + a + + + + w + + w + v + + a + + a + + y + + y + v + + + + t + + t + c + r + + o + + o + a + 0 + e + + + + h + + h + b + + e + + e + c + + a + + a + e + + r + + r + , + + + + t + + t + c + + h + + h + + e + + e + c + + + + R + + R + N + K + F + + a + + a + e + + b + + b + + b + + b + h + D + p + l + o + + i + + i + I + l + a + + t + + t + e + r + + + + s + + s + e + S + g + + a + + a + + y + + y + v + w + + + + t + + t + f + c + + o + + o + + + + i + + i + I + + t + + t + + s + + s + g + e + + e + + e + c + é + a + s + + l + + l + ] + | + I + 1 + . + + f + + f + t + i + £ + + , + + , + ; + . + y + j + r + + + + + + + + ( + ' + + \ + + O + + O + 0 + + h + + h + b + + + + d + + d + + e + + e + + a + + a + + r + + r + + ! + + ! + ' + | + + + + O + + O + Q + 0 + + h + + h + b + l + n + + + + + + d + + d + c + q + g + a + l + + e + + e + c + g + s + é + . + + a + + a + e + . + , + s + + r + + r + t + i + n + m + v + + ! + + ! + : + ' + | + / + + + + + I + + I + T + [ + l + | + J + + + + s + + s + g + e + S + c + + h + + h + b + n + + a + + a + e + + l + + l + i + I + + l + + l + ] + 1 + I + i + ! + + + + b + + b + o + h + v + + e + + e + + + + l + + l + I + i + L + + a + + a + + t + + t + + e + + e + + ! + + ! + l + | + ' + ) + + + + + + + ' + ? 
+ + + + ( + + ( + + [ + + + w + + w + v + + h + + h + + e + + e + + n + + n + m + + + + s + + s + + h + + h + + e + + e + c + + + + t + + t + + h + + h + + o + + o + + u + + u + + g + + g + + h + + h + + t + + t + + + + i + + i + I + j + l + + t + + t + + + + o + + o + + v + + v + + e + + e + + r + + r + + + + a + + a + e + + f + + f + t + + t + + t + + e + + e + + r + + r + + w + + w + + a + + a + + r + + r + + d + + d + + s + + s + + , + + , + + + + i + + i + + t + + t + + + + o + + o + + c + + c + e + + c + + c + e + o + + u + + u + + r + + r + + r + + r + + e + + e + + d + + d + + + + t + + t + + o + + o + a + + + + + + h + + h + n + m + b + H + l + + e + + e + . + c + o + a + s + + r + + r + t + ' + i + y + s + + + + t + + t + c + r + - + f + C + + h + + h + b + n + m + H + + a + + a + e + o + c + + t + + t + r + c + i + e + + + + s + + s + e + g + c + S + + h + + h + b + n + m + + e + + e + c + + + + o + + o + c + a + 0 + n + O + + u + + u + v + w + n + r + y + + g + + g + s + + h + + h + b + n + + t + + t + r + + + + t + + t + + o + + o + + + + h + + h + b + + a + + a + e + + v + + v + w + y + + e + + e + c + + + + w + + w + v + m + + o + + o + + n + + n + m + + d + + d + q + c + + e + + e + c + . + é + + r + + r + v + n + + e + + e + c + . + + d + + d + c + + + + a + + a + + t + + t + i + k + + + + t + + t + f + c + r + + h + + h + b + + i + + i + I + + s + + s + e + g + a + z + + , + + , + ; + + + + b + + b + d + h + o + t + l + + u + + u + + t + + t + r + + + + a + + a + e + + t + + t + r + + + + t + + t + + h + + h + + e + + e + c + + + + t + + t + + i + + i + a + + m + + m + n + w + + e + + e + . 
+ + + + i + + i + I + j + + t + + t + + + + a + + a + + l + + l + 1 + + l + + l + i + 1 + ] + + + + s + + s + e + g + c + + e + + e + c + + e + + e + o + a + c + s + + m + + m + n + w + + e + + e + c + + d + + d + + + + + + q + + q + g + a + d + Q + m + + u + + u + o + n + w + v + a + + i + + i + I + l + a + t + é + + t + + t + c + r + f + i + + e + + e + c + s + é + g + + + + n + + n + m + h + u + p + i + + a + + a + e + + t + + t + c + + u + + u + o + w + + r + + r + v + y + t + , + m + + a + + a + e + . + , + + l + + l + 1 + ] + + ) + + ) + } + ] + j + J + / + + ; + + ; + , + : + } + s + > + + + + b + + b + d + p + + u + + u + y + w + + t + + t + + + + w + + w + v + + h + + h + + e + + e + + n + + n + + + + t + + t + c + r + + h + + h + + e + + e + c + + + + R + + R + + a + + a + + b + + b + h + + b + + b + + i + + i + I + + t + + t + + + + a + + a + + c + + c + e + + t + + t + + u + + u + w + v + + a + + a + e + d + + l + + l + + l + + l + i + t + + y + + y + Y + v + + + + T + + T + Y + C + [ + t + + O + + O + 0 + o + Q + © + + O + + O + 0 + o + + K + + K + X + + + + A + + A + L + 4 + + + + W + + W + N + V + M + + A + + A + + T + + T + Y + I + + C + + C + O + + H + + H + + + + O + + O + Q + 0 + + U + + U + + T + + T + + + + O + + O + 0 + + F + + F + + + + + + I + + I + 1 + T + a + ! + [ + + T + + T + t + Y + [ + f + I + + S + + S + E + G + s + C + $ + + + + W + + W + V + N + M + Y + T + + A + + A + + I + + I + J + T + L + + S + + S + G + E + + T + + T + t + ' + Y + f + . + + C + + C + G + c + O + S + T + + O + + O + Q + 0 + + A + + A + + T + + T + Y + + - + + - + _ + . 
+ ~ + + , + + + + P + + P + F + B + + O + + O + + C + + C + + K + + K + + E + + E + + T + + T + + , + + , + + + + a + + a + + n + + n + m + u + + d + + d + + + + l + + l + L + + o + + o + + o + + o + + k + + k + + e + + e + + d + + d + + + + a + + a + + t + + t + + + + i + + i + j + I + + t + + t + + , + + , + ; + + + + a + + a + + n + + n + + d + + d + + + + t + + t + c + r + + h + + h + + e + + e + + n + + n + m + + + + h + + h + b + + u + + u + + r + + r + t + + r + + r + t + n + + i + + i + l + + e + + e + c + + d + + d + + + + o + + o + 0 + + n + + n + m + + , + + , + . + + + + A + + A + + l + + l + + i + + i + + c + + c + + e + + e + + + + + + s + + s + e + c + S + g + x + + t + + t + c + e + r + - + s + + a + + a + e + g + o + c + . + + r + + r + t + i + c + ' + + t + + t + - + c + v + r + = + + e + + e + c + a + + d + + d + l + c + + + + t + + t + r + c + e + + o + + o + a + + + + h + + h + b + l + + e + + e + + r + + r + + + + f + + f + t + + e + + e + c + + e + + e + c + a + s + é + o + + t + + t + r + c + + , + + , + . + ; + + + + f + + f + t + T + j + + o + + o + a + e + + r + + r + + + + i + + i + + t + + t + c + - + s + r + e + + + + f + + f + t + ( + + l + + l + i + + a + + a + e + s + g + + s + + s + g + S + c + e + + h + + h + n + b + m + + e + + e + c + + d + + d + + + + a + + a + e + s + + c + + c + e + + r + + r + t + + o + + o + a + c + + s + + s + g + e + c + + s + + s + e + g + c + + + + h + + h + b + + e + + e + a + + r + + r + , + t + y + . + + + + m + + m + + i + + i + + n + + n + + d + + d + + + + t + + t + c + r + + h + + h + + a + + a + e + + t + + t + r + + + + s + + s + e + g + + h + + h + b + + e + + e + c + a + + + + h + + h + + a + + a + e + + d + + d + l + + + + n + + n + m + u + + e + + e + + v + + v + w + + e + + e + . + + r + + r + + + + b + + b + h + + e + + e + + f + + f + + o + + o + + r + + r + + e + + e + + + + + + s + + s + e + g + S + c + x + + e + + e + c + o + s + g + - + + e + + e + c + o + a + s + . 
+ + n + + n + m + u + i + y + + + + a + + a + 2 + 4 + e + z + + + + r + + r + , + + a + + a + e + + b + + b + + b + + b + h + l + + i + + i + I + + t + + t + e + i + + + + w + + w + + i + + i + I + + t + + t + r + c + + h + + h + n + + + + e + + e + c + é + g + s + a + + i + + i + l + j + I + t + f + + t + + t + c + r + s + + h + + h + + e + + e + + r + + r + + + + a + + a + 2 + 4 + e + « + o + + + + w + + w + + a + + a + + i + + i + I + j + l + + s + + s + e + + t + + t + + c + + c + e + o + ¢ + + o + + o + + a + + a + + t + + t + + - + + - + + ~ + . + _ + + p + + p + y + n + b + v + + o + + o + + c + + c + e + + k + + k + + e + + e + + t + + t + + , + + , + . + + + + o + + o + c + + r + + r + , + + + + a + + a + 2 + 4 + e + @ + « + + + + w + + w + v + m + + a + + a + + t + + t + + c + + c + e + o + ¢ + + h + + h + + + + t + + t + c + f + e + + o + + o + a + + + + t + + t + + a + + a + + k + + k + + e + + e + + + + o + + o + + u + + u + + t + + t + + + + o + + o + + f + + f + + + + i + + i + + t + + t + ¢ + + , + + , + ; + . + y + + + + a + + a + + n + + n + + d + + d + g + + + + + + b + + b + o + h + d + v + D + + u + + u + o + w + a + . + i + + r + + r + ' + , + i + m + y + + n + + n + m + h + p + u + a + + i + + i + I + l + t + + n + + n + i + h + o + m + + g + + g + y + . + + + + w + + w + v + W + m + + i + + i + I + l + + t + + t + c + + h + + h + b + n + + + + c + + c + e + C + o + + u + + u + o + w + + r + + r + t + y + , + s + . + + i + + i + j + I + t + l + + o + + o + c + a + n + + s + + s + c + + i + + i + + t + + t + + y + + y + + , + + , + . 
+ ; + + + + s + + s + g + + h + + h + b + + e + + e + c + + + + r + + r + t + , + v + i + + a + + a + e + o + + n + + n + m + v + u + p + + + + a + + a + e + s + + c + + c + e + + r + + r + y + + o + + o + + s + + s + g + e + + s + + s + g + + + + t + + t + c + f + r + + h + + h + + e + + e + + + + f + + f + t + + i + + i + a + + e + + e + c + a + + l + + l + + d + + d + + + + a + + a + s + + f + + f + + t + + t + + e + + e + + r + + r + + + + i + + i + I + t + l + : + + t + + t + + , + + , + ; + . + + + + a + + a + + n + + n + + d + + d + + + + f + + f + + o + + o + + r + + r + t + + t + + t + + u + + u + y + w + + n + + n + m + i + u + + a + + a + e + + t + + t + r + + e + + e + + l + + l + + y + + y + v + w + r + + + + w + + w + + a + + a + + s + + s + g + : + e + + + + + + j + + j + J + g + y + f + l + + u + + u + w + y + n + a + t + + s + + s + e + S + c + g + a + + t + + t + c + r + e + . + « + + + + i + + i + a + u + I + + n + + n + m + u + i + o + + + + t + + t + c + C + r + f + + i + + i + l + m + t + + m + + m + n + w + a + + e + + e + . + s + a + g + c + + + + t + + t + f + c + C + e + b + + o + + o + O + a + 0 + e + + + + s + + s + S + + e + + e + c + é + + e + + e + c + é + g + s + ¢ + + + + i + + i + l + I + t + . + + t + + t + f + i + + + + p + + p + + o + + o + a + e + n + + p + + p + y + n + m + o + g + + + + d + + d + c + a + g + e + G + + o + + o + e + + w + + w + m + v + + n + + n + m + + + + a + + a + + + + l + + l + + a + + a + + r + + r + v + + g + + g + + e + + e + + + + r + + r + + a + + a + e + + b + + b + h + + b + + b + h + + i + + i + I + + t + + t + + - + + - + ~ + + h + + h + b + n + + o + + o + + l + + l + + e + + e + c + + + + u + + u + y + w + n + + n + + n + m + u + + d + + d + q + c + + e + + e + + r + + r + + + + t + + t + c + r + + h + + h + + e + + e + + + + h + + h + + e + + e + + d + + d + + g + + g + z + + e + + e + g + c + é + s + . + + . + + . + , + - + + + +

+ +

+ + + I + + I + i + l + T + 1 + + n + + n + m + h + p + N + i + + + + a + + a + e + z + g + s + d + + n + + n + m + i + y + + o + + o + c + e + a + s + 0 + + t + + t + c + r + - + + h + + h + n + + e + + e + c + + r + + r + , + t + . + c + y + + + + m + + m + n + M + w + + o + + o + a + e + O + + m + + m + + e + + e + + n + + n + m + + t + + t + r + k + s + + + + d + + d + c + l + + o + + o + + w + + w + m + v + + n + + n + m + h + + + + w + + w + v + m + W + q + + e + + e + . + + n + + n + m + h + u + + t + + t + + + + A + + A + S + + l + + l + + i + + i + + c + + c + + e + + e + . + + + + a + + a + + f + + f + t + { + + t + + t + + e + + e + + r + + r + y + + + + i + + i + l + I + + t + + t + + , + + , + ; + + + + n + + n + m + + e + + e + + v + + v + y + + e + + e + + r + + r + , + y + t + + + + o + + o + + n + + n + + c + + c + e + + e + + e + c + + + + c + + c + e + C + o + s + + o + + o + + n + + n + m + u + p + + s + + s + e + + i + + i + I + + d + + d + + e + + e + c + + r + + r + + i + + i + + n + + n + + g + + g + + + + h + + h + + o + + o + + w + + w + v + + + + + + i + + i + l + I + t + 1 + + n + + n + m + u + h + o + i + + + + t + + t + c + r + - + i + s + + h + + h + n + b + a + + e + + e + c + . + é + o + + + + w + + w + v + m + W + n + + o + + o + a + + r + + r + m + i + t + n + + l + + l + ] + + d + + d + + + + s + + s + e + g + + h + + h + b + + e + + e + + + + w + + w + v + m + + a + + a + g + + s + + s + c + g + + + + t + + t + + o + + o + O + 0 + + + + g + + g + s + + e + + e + + t + + t + f + r + + + + o + + o + O + 0 + a + c + e + + u + + u + + t + + t + r + + + + a + + a + e + + g + + g + + a + + a + g + + i + + i + I + + n + + n + m + p + h + + . + + . + + + +

+ +

+ + + T + + T + I + l + [ + Y + t + + h + + h + b + n + l + o + a + + e + + e + c + é + a + o + s + + + + r + + r + t + v + i + , + m + + a + + a + e + . + o + + b + + b + h + o + d + l + v + + b + + b + h + l + D + p + o + + i + + i + I + + t + + t + r + c + e + f + + - + + - + ~ + = + + . + + h + + h + b + + o + + o + e + c + a + 0 + + l + + l + i + + e + + e + c + s + + + + w + + w + v + m + + e + + e + + n + + n + m + h + + t + + t + r + k + c + + + + s + + s + e + + t + + t + c + r + + r + + r + v + i + + a + + a + e + + i + + i + l + t + I + . + + g + + g + z + s + + h + + h + n + b + i + + t + + t + r + e + + + + o + + o + 0 + + n + + n + m + + + + l + + l + L + I + + i + + i + + k + + k + + e + + e + c + + + + a + + a + e + o + 2 + « + 4 + + + + t + + t + r + c + + u + + u + + n + + n + m + u + p + o + i + + n + + n + m + + e + + e + a + + l + + l + ] + | + ) + + + + f + + f + + o + + o + a + e + + r + + r + , + + + + s + + s + g + e + c + S + + o + + o + a + + m + + m + w + + e + + e + c + + + + w + + w + + a + + a + + y + + y + + , + + , + + + + a + + a + e + + n + + n + m + + d + + d + + + + t + + t + + h + + h + + e + + e + + n + + n + m + + + + + + d + + d + c + q + a + e + g + + i + + i + l + t + I + ' + + p + + p + y + n + g + b + o + + p + + p + y + n + o + b + g + + e + + e + c + a + o + . + + d + + d + c + U + g + l + + + + s + + s + g + e + c + S + r + + u + + u + y + w + n + + d + + d + l + g + + d + + d + c + e + + e + + e + a + c + + n + + n + m + h + u + i + + l + + l + + y + + y + v + w + p + + + + d + + d + c + e + + o + + o + + w + + w + v + m + u + n + + n + + n + m + h + y + u + » + + , + + , + y + . 
+ + + + s + + s + g + S + e + + o + + o + 0 + a + + + + s + + s + g + e + S + + u + + u + i + y + + d + + d + c + l + + d + + d + c + q + + e + + e + é + + n + + n + m + h + + l + + l + + y + + y + v + w + + + + t + + t + c + + h + + h + b + + a + + a + e + + t + + t + r + + + + A + + A + + l + + l + + i + + i + l + t + + c + + c + e + t + + e + + e + c + g + s + + + + h + + h + b + n + m + H + + a + + a + e + + d + + d + g + c + q + 0 + + + + n + + n + + o + + o + c + e + a + + t + + t + - + s + + + + a + + a + e + 2 + s + d + + + + m + + m + + o + + o + a + + m + + m + w + + e + + e + . + + n + + n + m + + t + + t + + + + t + + t + f + c + + o + + o + + + + t + + t + c + + h + + h + + i + + i + l + + n + + n + + k + + k + + + + + + a + + a + e + g + d + s + 2 + + b + + b + o + h + l + d + v + + o + + o + e + a + c + u + O + + u + + u + w + n + v + m + . + + t + + t + r + c + s + + + + s + + s + g + e + S + + t + + t + r + c + + o + + o + a + e + c + + p + + p + n + y + m + o + + p + + p + n + y + g + + i + + i + I + l + j + : + + n + + n + + g + + g + + + + h + + h + b + + e + + e + . 
+ + r + + r + n + v + y + + s + + s + e + g + x + : + + e + + e + c + é + g + + l + + l + | + I + ] + i + ) + + f + + f + t + + + + b + + b + h + d + l + t + v + + e + + e + c + s + + f + + f + t + + o + + o + a + e + c + + r + + r + ' + + e + + e + c + + + + s + + s + g + e + c + + h + + h + b + + e + + e + c + + + + f + + f + t + + o + + o + a + + u + + u + w + + n + + n + m + + d + + d + q + c + + + + h + + h + + e + + e + + r + + r + + s + + s + e + + e + + e + + l + + l + I + ) + i + + f + + f + t + i + + + + f + + f + t + + a + + a + + l + + l + + l + + l + i + + i + + i + + n + + n + u + + g + + g + e + ¢ + s + c + z + + + + d + + d + c + G + + o + + o + a + e + + w + + w + v + m + + n + + n + m + u + + + + a + + a + + + + v + + v + w + y + V + + e + + e + + r + + r + + y + + y + v + w + + + + d + + d + c + + e + + e + c + + e + + e + o + c + + p + + p + n + y + j + + + + w + + w + v + + e + + e + + l + + l + i + + l + + l + + . + + . + , + : + + + +

+ +

+ + + E + + E + K + H + F + R + B + + i + + i + l + u + j + I + a + + t + + t + c + r + + h + + h + + e + + e + + r + + r + ' + t + + + + t + + t + c + s + r + e + - + + h + + h + b + + e + + e + c + + + + w + + w + v + m + W + + e + + e + . + + l + + l + i + + l + + l + ! + ] + 1 + I + | + + + + w + + w + v + m + + a + + a + + s + + s + + + + v + + v + w + y + + e + + e + + r + + r + t + + y + + y + v + w + + + + d + + d + c + + e + + e + c + + e + + e + g + . + + p + + p + n + g + + , + + , + . + + + + o + + o + + r + + r + + + + s + + s + g + e + + h + + h + + e + + e + + + + f + + f + + e + + e + é + + l + + l + 1 + + l + + l + i + ! + 1 + + + + v + + v + y + w + + e + + e + + r + + r + + y + + y + v + + + + s + + s + + l + + l + + o + + o + a + + w + + w + + l + + l + | + + y + + y + v + w + + , + + , + . + + + + f + + f + t + + o + + o + + r + + r + , + t + + + + s + + s + g + + h + + h + + e + + e + c + + + + h + + h + b + n + + a + + a + + d + + d + + + + p + + p + y + n + j + + l + + l + i + + e + + e + a + + n + + n + m + + t + + t + + y + + y + + + + + + o + + o + c + e + O + a + 0 + + f + + f + t + , + o + + + + + t + + t + r + l + i + f + T + + i + + i + l + t + m + r + + m + + m + w + n + a + i + + e + + e + c + a + s + é + g + + + + a + + a + e + 4 + g + s + . + + s + + s + e + g + c + z + r + + + + s + + s + g + e + c + S + a + + h + + h + b + n + + e + + e + c + + + + w + + w + v + m + W + + e + + e + c + + n + + n + m + h + i + + t + + t + r + i + s + e + c + + + + d + + d + c + a + e + l + g + + o + + o + a + + w + + w + m + v + + n + + n + m + h + y + » + o + + + + t + + t + f + r + + o + + o + 0 + a + + + + l + + l + + o + + o + c + a + e + + o + + o + c + + k + + k + g + + + + a + + a + e + g + s + . 
+ + b + + b + h + l + + o + + o + + u + + u + + t + + t + r + + + + h + + h + b + l + n + m + + e + + e + + r + + r + , + + + + a + + a + e + 2 + + n + + n + + d + + d + + + + t + + t + c + - + + o + + o + a + + + + w + + w + v + + o + + o + + n + + n + m + + d + + d + + e + + e + + r + + r + + + + w + + w + v + + h + + h + + a + + a + e + + t + + t + r + + + + w + + w + v + m + + a + + a + + s + + s + e + : + + + + g + + g + s + z + e + ¢ + . + + o + + o + c + e + + i + + i + l + + n + + n + m + + g + + g + + + + + + t + + t + c + + + ¢ + i + - + + o + + o + a + e + c + 0 + s + + + + h + + h + n + b + m + H + l + + a + + a + e + o + c + g + + p + + p + y + n + a + g + m + + p + + p + y + n + o + w + + e + + e + + n + + n + m + h + u + o + p + + + + n + + n + m + h + p + y + v + + e + + e + + x + + x + z + r + n + + t + + t + r + f + + . + + . + , + - + : + _ + + + + F + + F + P + K + E + T + + i + + i + I + u + + r + + r + + s + + s + e + + t + + t + c + r + e + s + + , + + , + ; + . + + + + s + + s + g + e + c + S + + h + + h + n + b + + e + + e + + + + t + + t + c + + r + + r + + i + + i + l + + e + + e + c + + d + + d + l + + + + t + + t + - + r + + o + + o + a + e + c + 0 + g + + + + l + + l + + o + + o + e + c + O + a + 0 + + o + + o + c + a + O + + k + + k + t + + + + d + + d + c + a + g + G + J + + o + + o + c + e + a + O + 0 + + w + + w + + n + + n + m + h + + + + a + + a + e + + n + + n + + d + + d + + + + m + + m + + a + + a + + k + + k + + e + + e + + + + o + + o + + u + + u + v + w + + t + + t + r + + + + w + + w + v + m + + h + + h + + a + + a + e + + t + + t + + + + s + + s + g + + h + + h + + e + + e + + + + w + + w + v + + a + + a + + s + + s + g + + + + + + c + + c + C + e + t + ¢ + . + + o + + o + a + e + u + O + n + + m + + m + n + M + w + g + a + + i + + i + I + l + u + a + + n + + n + m + h + u + p + o + + g + + g + y + d + + + + t + + t + f + c + i + + o + + o + 0 + a + O + e + + , + + , + . 
+ ; + y + + + + b + + b + d + h + o + t + l + + u + + u + + t + + t + r + i + f + + + + i + + i + l + I + + t + + t + + + + w + + w + v + + a + + a + + s + + s + g + e + + + + t + + t + c + + o + + o + e + 0 + + o + + o + a + e + + + + d + + d + c + g + e + + a + + a + + r + + r + ' + + k + + k + + + + t + + t + + o + + o + 0 + a + + + + s + + s + g + e + S + c + + e + + e + c + + e + + e + c + + + + a + + a + + n + + n + + y + + y + v + w + + t + + t + c + r + + h + + h + + i + + i + I + + n + + n + m + + g + + g + + ; + + ; + : + , + s + + + + t + + t + c + r + + h + + h + + e + + e + + n + + n + + + + s + + s + e + + h + + h + b + n + + e + + e + + + + l + + l + L + + o + + o + + o + + o + a + + k + + k + + e + + e + + d + + d + + + + a + + a + + t + + t + r + k + + + + t + + t + r + c + + h + + h + b + + e + + e + + + + s + + s + + i + + i + l + + d + + d + + e + + e + é + + s + + s + g + + + + + + o + + o + c + e + a + 0 + O + + f + + f + t + i + { + £ + + + + t + + t + c + r + s + e + - + + h + + h + n + b + m + + e + + e + c + a + + + + w + + w + v + W + m + n + + e + + e + . + s + + l + + l + I + i + 1 + + l + + l + 1 + I + ] + ) + . + + , + + , + ; + . + + + + a + + a + e + + n + + n + u + m + o + i + v + + d + + d + g + 0 + + + + n + + n + m + h + u + p + v + + o + + o + a + + t + + t + r + k + c + f + + i + + i + + c + + c + e + o + s + t + + e + + e + c + a + + d + + d + c + g + l + + + + t + + t + r + c + + h + + h + b + n + + a + + a + e + + t + + t + r + e + + + + t + + t + c + s + r + - + e + + h + + h + + e + + e + c + + y + + y + v + + + + w + + w + v + + e + + e + + r + + r + y + + e + + e + c + + + + f + + f + + i + + i + u + l + a + + l + + l + + l + + l + i + + e + + e + a + + d + + d + l + @ + g + + + + w + + w + v + m + + i + + i + + t + + t + s + + h + + h + n + + + + c + + c + e + + u + + u + + p + + p + y + n + m + g + + b + + b + h + + o + + o + + a + + a + + r + + r + + d + + d + + s + + s + g + e + + + + a + + a + . 
+ + n + + n + u + m + v + + d + + d + . + + + + b + + b + h + d + + o + + o + + o + + o + + k + + k + + - + + - + . + ~ + + + + + + + s + + s + S + e + g + c + a + + h + + h + b + n + s + m + o + + e + + e + a + . + é + c + o + + l + + l + | + ] + I + _ + + v + + v + w + y + u + c + r + + e + + e + . + g + c + + s + + s + g + e + x + r + z + + ; + + ; + : + , + s + } + + + + h + + h + n + m + b + l + o + + e + + e + c + + r + + r + + e + + e + c + s + + + + a + + a + e + + n + + n + m + + d + + d + q + + + + t + + t + c + + h + + h + + e + + e + + r + + r + + e + + e + c + + + + s + + s + g + + h + + h + b + n + + e + + e + c + + + + s + + s + g + e + + a + + a + e + o + + w + + w + v + m + n + u + + + + m + + m + n + w + M + + a + + a + g + + p + + p + n + g + o + h + y + + s + + s + g + e + z + c + + + + a + + a + e + + n + + n + m + + d + + d + q + + + + p + + p + y + n + + i + + i + I + + c + + c + e + + t + + t + + u + + u + w + + r + + r + + e + + e + + s + + s + g + e + + + + h + + h + b + n + + u + + u + + n + + n + m + + g + + g + + + + u + + u + w + y + m + i + a + + p + + p + y + + o + + o + 0 + + n + + n + m + + + + p + + p + y + + e + + e + . + + g + + g + + s + + s + g + e + z + : + + . + + . + + + + S + + S + + h + + h + + e + + e + + + + t + + t + + o + + o + + o + + o + + k + + k + + + + + + d + + d + c + q + g + l + a + + o + + o + e + a + c + O + + w + + w + v + m + n + W + + n + + n + m + h + y + » + o + + + + a + + a + e + o + « + 4 + @ + + + + j + + j + J + f + i + y + l + + a + + a + + r + + r + t + v + y + i + x + + + + f + + f + t + i + + r + + r + f + t + l + i + + o + + o + a + e + u + n + + m + + m + n + + + + o + + o + O + + n + + n + m + u + h + p + + e + + e + . + g + é + + + + o + + o + O + + f + + f + + + + t + + t + c + r + + h + + h + b + n + + e + + e + + + + s + + s + g + e + S + + h + + h + b + + e + + e + + l + + l + + v + + v + w + y + r + + e + + e + . + g + + s + + s + g + e + r + . 
+ + + + a + + a + s + + s + + s + g + e + z + + + + s + + s + g + S + e + c + + h + + h + n + b + + e + + e + + + + p + + p + a + y + + a + + a + . + + s + + s + g + a + e + + s + + s + g + e + + e + + e + + d + + d + q + + ; + + ; + + + + i + + i + t + + t + + t + f + s + + + + w + + w + v + + a + + a + + s + + s + g + e + z + : + + + + l + + l + i + + a + + a + + b + + b + d + + e + + e + a + + l + + l + + l + + l + i + + e + + e + a + o + + d + + d + l + g + + + + + + + ( + ' + \ + ; + { + + O + + O + Q + + R + + R + + A + + A + + N + + N + + G + + G + C + + E + + E + + + + + + M + + M + O + D + N + I + V + + A + + A + S + E + a + O + I + + R + + R + E + B + K + A + k + + M + + M + N + O + I + W + + A + + A + S + C + O + G + E + + L + + L + I + E + l + U + . + + A + + A + S + O + I + G + + D + + D + P + N + V + H + U + + E + + E + F + P + B + R + K + + + + + ' + ; + , + + , + + , + + + + b + + b + o + h + v + d + + u + + u + + t + + t + + + + t + + t + + o + + o + + + + h + + h + b + + e + + e + + r + + r + y + + + + g + + g + s + z + + r + + r + + e + + e + + a + + a + + t + + t + r + + + + d + + d + c + + i + + i + + s + + s + g + e + + a + + a + e + + p + + p + y + n + a + + p + + p + + o + + o + a + + i + + i + l + t + + n + + n + + t + + t + i + r + : + - + + m + + m + + e + + e + + n + + n + m + + t + + t + + + + i + + i + + t + + t + + + + w + + w + v + + a + + a + + s + + s + g + e + + + + e + + e + + m + + m + + p + + p + + t + + t + + y + + y + v + + : + + : + ; + + + + s + + s + + h + + h + + e + + e + + + + d + + d + + i + + i + + d + + d + + + + n + + n + m + r + p + u + + o + + o + a + + t + + t + k + + + + + + l + + l + i + I + f + t + L + + i + + i + l + t + ' + I + k + + k + + k + x + c + e + t + s + + e + + e + c + a + é + s + g + + + + t + + t + w + c + - + r + i + + o + + o + a + e + 0 + c + g + + + + d + + d + c + e + g + l + + r + + r + t + v + + o + + o + a + u + e + n + c + + p + + p + n + g + y + + + + t + + t + c + r + e + s + f + + h + 
+ h + b + n + + e + + e + . + a + + + + j + + j + J + f + s + i + y + + a + + a + + r + + r + y + , + + + + f + + f + t + + o + + o + + r + + r + + + + f + + f + t + + e + + e + a + c + + a + + a + e + . + + r + + r + y + , + x + i + + + + o + + o + a + 0 + c + + f + + f + t + + + + + k + + k + K + x + i + + i + + i + I + + l + + l + I + . + f + + l + + l + i + f + + i + + i + l + + n + + n + u + r + + g + + g + s + e + ¢ + , + : + + + + s + + s + e + c + g + + o + + o + a + e + + m + + m + w + n + + e + + e + c + a + o + + b + + b + h + o + + o + + o + a + + d + + d + + y + + y + v + + , + + , + . + + + + s + + s + + o + + o + c + + + + m + + m + + a + + a + + n + + n + m + h + p + + a + + a + g + + g + + g + + e + + e + + d + + d + + + + t + + t + f + + o + + o + 0 + O + + + + p + + p + + u + + u + + t + + t + + + + i + + i + + t + + t + ¢ + f + + + + i + + i + + n + + n + m + + t + + t + + o + + o + + + + + + o + + o + a + e + 0 + c + O + + n + + n + m + v + p + r + i + + e + + e + é + . + g + - + c + + + + o + + o + 0 + a + O + e + + f + + f + t + , + i + + + + t + + t + c + r + + h + + h + n + + e + + e + c + a + + + + c + + c + C + e + t + o + + u + + u + y + w + v + n + + p + + p + y + n + b + + b + + b + h + o + + o + + o + a + + a + + a + + r + + r + t + v + i + e + + d + + d + g + q + u + c + + s + + s + e + + + + a + + a + + s + + s + g + z + e + c + x + + + + s + + s + g + e + + h + + h + b + + e + + e + g + + + + f + + f + + e + + e + c + é + g + . + + l + + l + I + + l + + l + ] + 1 + ! + I + + + + p + + p + y + + a + + a + . + + s + + s + e + a + c + + t + + t + e + + + + i + + i + I + + t + + t + k + + . + + . + , + - + + + +

+ +

+ + + + + + + " + ' + * + + + W + + W + V + w + N + M + + e + + e + é + . + E + a + r + + l + + l + i + I + 1 + M + ( + + l + + l + i + d + ! + t + f + + ! + + ! + l + t + i + ' + + + + + + + ? + ' + " + * + + + + t + + t + c + r + - + e + s + + h + + h + b + n + + o + + o + a + + u + + u + w + v + n + r + y + + g + + g + + h + + h + + t + + t + r + i + e + + + + A + + A + + l + + l + _ + + i + + i + l + é + t + I + s + + c + + c + e + + e + + e + c + + + + t + + t + c + r + f + e + + o + + o + a + e + + + + h + + h + + e + + e + + r + + r + + s + + s + e + c + + e + + e + . + + l + + l + i + + f + + f + t + + , + + , + . + ; + + + + + + + ' + S + + + \ + + a + + a + + f + + f + t + + t + + t + i + + e + + e + c + + r + + r + , + y + + + + s + + s + e + + u + + u + + c + + c + e + + h + + h + n + b + + + + a + + a + + + + f + + f + t + i + + a + + a + e + d + + l + + l + i + + l + + l + ] + 1 + I + i + | + + + + a + + a + + s + + s + g + e + + + + t + + t + c + r + + h + + h + + i + + i + + s + + s + g + + , + + , + + + + I + + I + + + + s + + s + g + + h + + h + b + n + + a + + a + + l + + l + + l + + l + ] + 1 + I + | + ! + + + + t + + t + + h + + h + + i + + i + + n + + n + + k + + k + + + + + + n + + n + m + h + N + p + i + + o + + o + e + a + c + 0 + O + + t + + t + c + - + r + e + v + + h + + h + b + n + l + + i + + i + l + t + I + + n + + n + o + i + + g + + g + e + s + + + + o + + o + 0 + + f + + f + t + i + , + + + + t + + t + r + - + f + + u + + u + y + + m + + m + n + w + o + a + + b + + b + h + D + l + o + + l + + l + i + + i + + i + l + + n + + n + m + h + g + + g + + g + e + ¢ + s + c + , + + + + d + + d + c + g + e + a + l + + o + + o + a + + w + + w + + n + + n + m + h + » + o + + + + s + + s + e + + t + + t + k + c + s + + a + + a + g + s + d + z + . + + i + + i + I + a + + r + + r + ' + + s + + s + g + e + c + + ! + + ! 
+ | + ' + l + : + } + + + + H + + H + + o + + o + + w + + w + + + + b + + b + h + p + D + o + d + + r + + r + + a + + a + + v + + v + w + y + + e + + e + + + + t + + t + c + r + e + + h + + h + + e + + e + + y + + y + v + + + + + ' + + " + 7 + + l + + l + 1 + | + ! + I + ] + + l + + l + ] + + + + a + + a + e + + l + + l + ] + + l + + l + + + + t + + t + + h + + h + + i + + i + + n + + n + + k + + k + + + + m + + m + n + + e + + e + é + + + + a + + a + + t + + t + + + + h + + h + b + + o + + o + + m + + m + n + + e + + e + + ! + + ! + ' + l + + + + + + W + + W + V + N + M + w + U + + h + + h + b + l + k + n + o + + y + + y + r + i + v + j + ) + + , + + , + . + + + + I + + I + | + [ + T + l + + + + w + + w + v + m + + o + + o + + u + + u + w + n + a + m + r + + l + + l + + d + + d + + n + + n + h + m + y + o + u + + + + + ' + + " + ? + + t + + t + e + + + + s + + s + g + e + + a + + a + + y + + y + v + + + + a + + a + e + + n + + n + m + + y + + y + v + w + + t + + t + + h + + h + b + + i + + i + + n + + n + + g + + g + + + + a + + a + e + + b + + b + h + o + + o + + o + + u + + u + w + + t + + t + + + + i + + i + I + j + l + + t + + t + + , + + , + . + + + + e + + e + + v + + v + w + + e + + e + a + + n + + n + m + + + + i + + i + I + + f + + f + t + + + + I + + I + l + [ + 1 + | + T + + + + f + + f + t + + e + + e + + l + + l + + l + + l + + + + o + + o + 0 + a + + f + + f + t + + f + + f + t + + + + + t + + t + + h + + h + + e + + e + + + + t + + t + + o + + o + + p + + p + y + + + + o + + o + + f + + f + t + + + + t + + t + + h + + h + + e + + e + + + + h + + h + b + n + + o + + o + + u + + u + + s + + s + e + + e + + e + + ! + + ! + l + | + + } + ' + + + + + + + + +

+
+
+ + From cbc78faad217d01db002a95159b4f7fad4254586 Mon Sep 17 00:00:00 2001 From: JKamlah Date: Fri, 9 Aug 2019 13:33:17 +0200 Subject: [PATCH 13/15] README update Add information for hocr-simplify --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index c81d9bb..b7cbed2 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ * [hocr-lines](#hocr-lines) -- extract the text within all the ocr_line elements * [hocr-merge-dc](#hocr-merge-dc) -- merge Dublin Core meta data into the hOCR HTML header * [hocr-pdf](#hocr-pdf) -- create a searchable PDF from a pile of hOCR and JPEG + * [hocr-simplify](#hocr-simplify) -- compute a simplified hOCR file * [hocr-split](#hocr-split) -- split an hOCR file into individual pages * [hocr-wordfreq](#hocr-wordfreq) -- calculate word frequency in an hOCR file * [Unit tests](#unit-tests) @@ -207,6 +208,21 @@ hocr-pdf --savefile out.pdf Create a searchable PDF from a pile of hOCR and JPEG. It is important that the corresponding JPEG and hOCR files have the same name with their respective file ending. All of these files should lie in one directory, which one has to specify as an argument when calling the command, e.g. use `hocr-pdf . > out.pdf` to run the command in the current directory and save the output as `out.pdf` alternatively `hocr-pdf . --savefile out.pdf` which avoids routing the output through the terminal. +### hocr-simplify + +``` +hocr-simplify [-t TYPESETTING] [-a REMOVE-ATTRIBUTES] [-c REMOVE-CHOICES] [-e REMOVE-EMPTY-CONTENTS] [-p REMOVE-PROPERTIES] input.html [output.html] +``` + +Compute a simplified hOCR file. If called without any output path, the result is printed to the terminal. +Use: +`-t` to set a new typesetting level, lower ones will be removed, e.g. `-t page` +`-a` to remove attributes, it will be applied to all typesetting levels, e.g.
`-a id` +`-c` to remove character choices +`-e` to remove any text content containing only whitespaces or nothing +`-p` to remove properties, e.g. `-p baseline` + + ### hocr-split ``` From a050fbca53488ba2a6c1337d290d6ddfe3fa568d Mon Sep 17 00:00:00 2001 From: JKamlah Date: Fri, 9 Aug 2019 13:36:55 +0200 Subject: [PATCH 14/15] README add EOL --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index b7cbed2..60605e8 100644 --- a/README.md +++ b/README.md @@ -214,12 +214,12 @@ Create a searchable PDF from a pile of hOCR and JPEG. It is important that the c hocr-simplify [-t TYPESETTING] [-a REMOVE-ATTRIBUTES] [-c REMOVE-CHOICES] [-e REMOVE-EMPTY-CONTENTS] [-p REMOVE-PROPERTIES] input.html [output.html] ``` -Compute a simplified hOCR file. If called without any output path, the result is printed to the terminal. -Use: -`-t` to set a new typesetting level, lower ones will be removed, e.g. `-t page` -`-a` to remove attributes, it will be applied to all typesetting levels, e.g. `-a id` -`-c` to remove character choices -`-e` to remove any text content containing only whitespaces or nothing +Compute a simplified hOCR file. If called without any output path, the result is printed to the terminal. +Use: +`-t` to set a new typesetting level, lower ones will be removed, e.g. `-t page` +`-a` to remove attributes, it will be applied to all typesetting levels, e.g. `-a id` +`-c` to remove character choices +`-e` to remove any text content containing only whitespaces or nothing `-p` to remove properties, e.g. `-p baseline` From 6a4e4ff8f0be00fac7cd4a3fe8ee368218419898 Mon Sep 17 00:00:00 2001 From: JKamlah Date: Thu, 15 Aug 2019 11:05:54 +0200 Subject: [PATCH 15/15] README DEL two ws.
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 60605e8..f7a945b 100644 --- a/README.md +++ b/README.md @@ -216,7 +216,7 @@ hocr-simplify [-t TYPESETTING] [-a REMOVE-ATTRIBUTES] [-c REMOVE-CHOICES] [-e RE Compute a simplified hOCR file. If called without any output path, the result is printed to the terminal. Use: -`-t` to set a new typesetting level, lower ones will be removed, e.g. `-t page` +`-t` to set a new typesetting level, lower ones will be removed, e.g. `-t page` `-a` to remove attributes, it will be applied to all typesetting levels, e.g. `-a id` `-c` to remove character choices `-e` to remove any text content containing only whitespaces or nothing