Skip to content
This repository has been archived by the owner on Oct 3, 2022. It is now read-only.

Port to python3 #41

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.

PYTHON = python
PYTHON = python3

PREFIX = /usr/local
DESTDIR =
Expand Down
3 changes: 2 additions & 1 deletion djvu2hocr
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# encoding=UTF-8

# Copyright © 2009-2018 Jakub Wilk <jwilk@jwilk.net>
Expand All @@ -14,6 +14,7 @@
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.

from __future__ import unicode_literals
import sys

basedir = None
Expand Down
2 changes: 2 additions & 0 deletions doc/dependencies
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ The following software is needed to run ocrodjvu:

* python-djvulibre_

* python-regex

* subprocess32_

* lxml_ ≥ 2.0
Expand Down
3 changes: 2 additions & 1 deletion hocr2djvused
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# encoding=UTF-8

# Copyright © 2008-2018 Jakub Wilk <jwilk@jwilk.net>
Expand All @@ -14,6 +14,7 @@
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.

from __future__ import unicode_literals
import sys

basedir = None
Expand Down
5 changes: 4 additions & 1 deletion lib/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from __future__ import unicode_literals
import sys

if sys.version_info < (2, 7): # no coverage
raise RuntimeError('Python 2.7 is required')
if sys.version_info >= (3, 0): # no coverage
elif sys.version_info >= (3, 3): # no coverage
pass
else:
raise RuntimeError('Python 2.X is required')

# vim:ts=4 sts=4 sw=4 et
1 change: 1 addition & 0 deletions lib/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.

from __future__ import unicode_literals
from .. import errors
from .. import utils

Expand Down
43 changes: 25 additions & 18 deletions lib/cli/djvu2hocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,16 @@
# for more details.

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

from builtins import str
from builtins import map
from builtins import range
from past.utils import old_div
from builtins import object
import argparse
import cgi
import html
import locale
import os
import re
Expand Down Expand Up @@ -99,7 +106,7 @@ def text(self):
raise TypeError('list of {0} (!= 6) elements'.format(len(self._sexpr))) # no coverage
if not isinstance(self._sexpr[5], sexpr.StringExpression):
raise TypeError('last element is not a string') # no coverage
return unicode(self._sexpr[5].value, 'UTF-8', 'replace')
return self._sexpr[5].value

@property
def children(self):
Expand Down Expand Up @@ -153,9 +160,9 @@ def break_chars(char_zone_list, options):
continue
for i, char in enumerate(char_text):
subbox = text_zones.BBox(
int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * i / len(char_text) + 0.5),
int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * i, len(char_text)) + 0.5),
bbox.y0,
int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * (i + 1) / len(char_text) + 0.5),
int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * (i + 1), len(char_text)) + 0.5),
bbox.y1,
)
bbox_list += [subbox]
Expand All @@ -172,7 +179,7 @@ def break_chars(char_zone_list, options):
i = j
continue
bbox = text_zones.BBox()
for k in xrange(i, j):
for k in range(i, j):
bbox.update(bbox_list[k])
element = etree.Element('span')
element.set('class', 'ocrx_word')
Expand All @@ -196,9 +203,9 @@ def break_plain_text(text, bbox, options):
i = j
continue
subbox = text_zones.BBox(
int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * i / len(text) + 0.5),
int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * i, len(text)) + 0.5),
bbox.y0,
int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * j / len(text) + 0.5),
int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * j, len(text)) + 0.5),
bbox.y1,
)
element = etree.Element('span')
Expand Down Expand Up @@ -244,7 +251,7 @@ def process_zone(parent, zone, last, options):
if child is not None and zone_type == const.TEXT_ZONE_WORD and not last:
child.tail = ' '
self = None
elif isinstance(child_zone, unicode):
elif isinstance(child_zone, str):
text = child_zone
if zone_type >= const.TEXT_ZONE_WORD and options.icu is not None and parent is not None:
# Do word segmentation by hand.
Expand All @@ -267,7 +274,7 @@ def process_zone(parent, zone, last, options):
def process_page(page_text, options):
result = process_zone(None, page_text, last=True, options=options)
tree = etree.ElementTree(result)
tree.write(sys.stdout, encoding='UTF-8')
tree.write(sys.stdout.buffer)

hocr_header_template = '''\
<?xml version="1.0" encoding="UTF-8"?>
Expand All @@ -290,9 +297,9 @@ def process_page(page_text, options):
</html>
'''

def main(argv=sys.argv):
def main(argv=[os.fsencode(arg) for arg in sys.argv]):
options = ArgumentParser().parse_args(argv[1:])
logger.info('Converting {path}:'.format(path=utils.smart_repr(options.path, system_encoding)))
logger.info('Converting {path}:'.format(path=options.path))
if options.pages is None:
djvused = ipc.Subprocess(
['djvused', '-e', 'n', os.path.abspath(options.path)],
Expand All @@ -302,9 +309,9 @@ def main(argv=sys.argv):
n_pages = int(djvused.stdout.readline())
finally:
djvused.wait()
options.pages = xrange(1, n_pages + 1)
options.pages = range(1, n_pages + 1)
page_iterator = iter(options.pages)
sed_script = temporary.file(suffix='.djvused')
sed_script = temporary.file(suffix='.djvused', mode='w+',encoding='UTF-8')
for n in options.pages:
print('select {0}; size; print-txt'.format(n), file=sed_script)
sed_script.flush()
Expand All @@ -316,17 +323,17 @@ def main(argv=sys.argv):
hocr_header = hocr_header_template.format(
ocr_system=ocr_system,
ocr_capabilities=' '.join(hocr.djvu2hocr_capabilities),
title=cgi.escape(options.title),
css=cgi.escape(options.css),
title=html.escape(options.title),
css=html.escape(options.css),
)
if not options.css:
hocr_header = re.sub(hocr_header_style_re, '', hocr_header, count=1)
sys.stdout.write(hocr_header)
sys.stdout.buffer.write(hocr_header.encode('UTF-8'))
for n in page_iterator:
try:
page_size = [
int(str(sexpr.Expression.from_stream(djvused.stdout).value).split('=')[1])
for i in xrange(2)
for i in range(2)
]
options.page_bbox = text_zones.BBox(0, 0, page_size[0], page_size[1])
page_text = sexpr.Expression.from_stream(djvused.stdout)
Expand All @@ -335,7 +342,7 @@ def main(argv=sys.argv):
logger.info('- Page #{n}'.format(n=n))
page_zone = Zone(page_text, page_size[1])
process_page(page_zone, options)
sys.stdout.write(hocr_footer)
sys.stdout.buffer.write(hocr_footer.encode('UTF-8'))
djvused.wait()

# vim:ts=4 sts=4 sw=4 et
4 changes: 3 additions & 1 deletion lib/cli/hocr2djvused.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.

from __future__ import unicode_literals
from builtins import map
import argparse
import sys

Expand All @@ -36,7 +38,7 @@ def __init__(self):
self.add_argument('--version', action=version.VersionAction)
self.add_argument('--rotation', dest='rotation', action='store', type=int, default=0, help='page rotation (in degrees)')
def size(s):
return map(int, s.split('x', 1))
return list(map(int, s.split('x', 1)))
self.add_argument('--page-size', metavar='WxH', dest='page_size', action='store', type=size, default=None, help='page size (in pixels)')
group = self.add_argument_group(title='word segmentation options')
group.add_argument('-t', '--details', dest='details', choices=('lines', 'words', 'chars'), action='store', default='words', help='amount of text details to extract')
Expand Down
20 changes: 13 additions & 7 deletions lib/cli/ocrodjvu.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,13 @@
# for more details.

from __future__ import print_function
from __future__ import unicode_literals

from future import standard_library
standard_library.install_aliases()
from builtins import str
from builtins import range
from builtins import object
import argparse
import contextlib
import inspect
Expand Down Expand Up @@ -258,7 +264,7 @@ def __init__(self):
self.add_argument('--list-engines', action=self.list_engines, nargs=0, help='print list of available OCR engines')
self.add_argument('-l', '--language', dest='language', help='set recognition language')
self.add_argument('--list-languages', action=self.list_languages, nargs=0, help='print list of available languages')
self.add_argument('--render', dest='render_layers', choices=self._render_map.keys(), action='store', default='mask', help='image layers to render')
self.add_argument('--render', dest='render_layers', choices=list(self._render_map.keys()), action='store', default='mask', help='image layers to render')
def pages(x):
return utils.parse_page_numbers(x)
self.add_argument('-p', '--pages', dest='pages', action='store', default=None, type=pages, help='pages to process')
Expand Down Expand Up @@ -400,9 +406,9 @@ def init(self, options):
bpp = 24 if self._options.render_layers != djvu.decode.RENDER_MASK_ONLY else 1
self._image_format = self._options.engine.image_format(bpp)

def _temp_file(self, name, auto_remove=True):
def _temp_file(self, name, mode='w+', encoding=locale.getpreferredencoding(),auto_remove=True):
path = os.path.join(self._temp_dir, name)
file = open(path, 'w+b')
file = open(path,mode=mode,encoding=encoding)
if not self._debug and auto_remove:
file = temporary.wrapper(file, file.name)
return file
Expand All @@ -417,7 +423,7 @@ def get_output_image(self, nth, page_job):
file = self._temp_file('{n:06}.{ext}'.format(
n=nth,
ext=output_format.extension
))
),mode='wb',encoding=None)
try:
output_format.write_image(page_job, self._options.render_layers, file)
file.flush()
Expand Down Expand Up @@ -510,7 +516,7 @@ def page_thread(self, pages, results, condition):

def _process(self, path, pages=None):
self._engine = self._options.engine
logger.info('Processing {path}:'.format(path=utils.smart_repr(path, system_encoding)))
logger.info('Processing {path}:'.format(path=path))
document = self.new_document(djvu.decode.FileURI(path))
document.decoding_job.wait()
if pages is None:
Expand All @@ -524,7 +530,7 @@ def _process(self, path, pages=None):
condition = threading.Condition()
threads = [
threading.Thread(target=self.page_thread, args=(pages, results, condition))
for i in xrange(njobs)
for i in range(njobs)
]
def stop_threads():
with condition:
Expand All @@ -540,7 +546,7 @@ def stop_threads():
sed_file.write('remove-txt\n')
for page in pages:
try:
file_id = page.file.id.encode(system_encoding)
file_id = page.file.id
except UnicodeError:
pageno = page.n + 1
logger.warning('warning: cannot convert page {n} identifier to locale encoding'.format(n=pageno))
Expand Down
1 change: 1 addition & 0 deletions lib/engines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.

from __future__ import unicode_literals
import pkgutil

def get_engines():
Expand Down
7 changes: 5 additions & 2 deletions lib/engines/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.

from __future__ import unicode_literals
from builtins import str
from builtins import object
from .. import utils
from .. import image_io

Expand All @@ -33,7 +36,7 @@ def __init__(self, *args, **kwargs):
raise TypeError('{tp}.name must be a string'.format(tp=tpname)) # no coverage
if not issubclass(self.image_format, image_io.ImageFormat):
raise TypeError('{tp}.image_format must be an ImageFormat subclass'.format(tp=tpname)) # no coverage
for key, value in kwargs.iteritems():
for key, value in kwargs.items():
try:
prop = getattr(type(self), key)
if not isinstance(prop, utils.property):
Expand Down Expand Up @@ -63,6 +66,6 @@ def save(self, prefix):
file.write(str(self))

def as_stringio(self):
return io.BytesIO(str(self))
return io.StringIO(str(self))

# vim:ts=4 sts=4 sw=4 et
5 changes: 5 additions & 0 deletions lib/engines/cuneiform.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.

from __future__ import unicode_literals
import os
import re
import shlex
import warnings
import locale
import sys
import codecs

from . import common
from .. import errors
Expand Down Expand Up @@ -62,6 +66,7 @@ def _get_languages(self):
)
except OSError:
raise errors.UnknownLanguageList
cuneiform.stdout=codecs.getreader(sys.stdout.encoding or locale.getpreferredencoding())(cuneiform.stdout)
self._cuneiform_to_iso = {}
self._user_to_cuneiform = {}
try:
Expand Down
1 change: 1 addition & 0 deletions lib/engines/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.

from __future__ import unicode_literals
from . import common
from .. import image_io
from .. import text_zones
Expand Down
3 changes: 3 additions & 0 deletions lib/engines/gocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@
# for more details.

from __future__ import division
from __future__ import unicode_literals

from builtins import map
from builtins import object
import functools
import re
import shlex
Expand Down
Loading