jwilk-archive · bastien-roucaries · Aug 6, 2021 · Sep 9, 2022 · Sep 9, 2022 · Oct 1, 2022
diff --git a/Makefile b/Makefile
@@ -11,7 +11,7 @@
 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 # for more details.
 
-PYTHON = python
+PYTHON = python3
 
 PREFIX = /usr/local
 DESTDIR =

diff --git a/djvu2hocr b/djvu2hocr
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # encoding=UTF-8
 
 # Copyright © 2009-2018 Jakub Wilk <[email protected]>
@@ -14,6 +14,7 @@
 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 # for more details.
 
+from __future__ import unicode_literals
 import sys
 
 basedir = None

diff --git a/doc/dependencies b/doc/dependencies
@@ -14,6 +14,8 @@ The following software is needed to run ocrodjvu:
 
 * python-djvulibre_
 
+* python-regex
+
 * subprocess32_
 
 * lxml_ ≥ 2.0

diff --git a/hocr2djvused b/hocr2djvused
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # encoding=UTF-8
 
 # Copyright © 2008-2018 Jakub Wilk <[email protected]>
@@ -14,6 +14,7 @@
 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 # for more details.
 
+from __future__ import unicode_literals
 import sys
 
 basedir = None

diff --git a/lib/__init__.py b/lib/__init__.py
@@ -1,8 +1,7 @@
+from __future__ import unicode_literals
 import sys
 
-if sys.version_info < (2, 7):  # no coverage
-    raise RuntimeError('Python 2.7 is required')
-if sys.version_info >= (3, 0):  # no coverage
-    raise RuntimeError('Python 2.X is required')
+if sys.version_info < (3, 3):  # no coverage
+    raise RuntimeError('Python 3.3 or later is required')
 
 # vim:ts=4 sts=4 sw=4 et
diff --git a/lib/cli/__init__.py b/lib/cli/__init__.py
@@ -13,6 +13,7 @@
 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 # for more details.
 
+from __future__ import unicode_literals
 from .. import errors
 from .. import utils
 

diff --git a/lib/cli/djvu2hocr.py b/lib/cli/djvu2hocr.py
@@ -14,9 +14,16 @@
 # for more details.
 
 from __future__ import print_function
+from __future__ import division
+from __future__ import unicode_literals
 
+from builtins import str
+from builtins import map
+from builtins import range
+from past.utils import old_div
+from builtins import object
 import argparse
-import cgi
+import html
 import locale
 import os
 import re
@@ -99,7 +106,7 @@ def text(self):
             raise TypeError('list of {0} (!= 6) elements'.format(len(self._sexpr)))  # no coverage
         if not isinstance(self._sexpr[5], sexpr.StringExpression):
             raise TypeError('last element is not a string')  # no coverage
-        return unicode(self._sexpr[5].value, 'UTF-8', 'replace')
+        return self._sexpr[5].value
 
     @property
     def children(self):
@@ -153,9 +160,9 @@ def break_chars(char_zone_list, options):
             continue
         for i, char in enumerate(char_text):
             subbox = text_zones.BBox(
-                int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * i / len(char_text) + 0.5),
+                int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * i, len(char_text)) + 0.5),
                 bbox.y0,
-                int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * (i + 1) / len(char_text) + 0.5),
+                int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * (i + 1), len(char_text)) + 0.5),
                 bbox.y1,
             )
             bbox_list += [subbox]
@@ -172,7 +179,7 @@ def break_chars(char_zone_list, options):
             i = j
             continue
         bbox = text_zones.BBox()
-        for k in xrange(i, j):
+        for k in range(i, j):
             bbox.update(bbox_list[k])
         element = etree.Element('span')
         element.set('class', 'ocrx_word')
@@ -196,9 +203,9 @@ def break_plain_text(text, bbox, options):
             i = j
             continue
         subbox = text_zones.BBox(
-            int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * i / len(text) + 0.5),
+            int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * i, len(text)) + 0.5),
             bbox.y0,
-            int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * j / len(text) + 0.5),
+            int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * j, len(text)) + 0.5),
             bbox.y1,
         )
         element = etree.Element('span')
@@ -244,7 +251,7 @@ def process_zone(parent, zone, last, options):
         if child is not None and zone_type == const.TEXT_ZONE_WORD and not last:
             child.tail = ' '
         self = None
-    elif isinstance(child_zone, unicode):
+    elif isinstance(child_zone, str):
         text = child_zone
         if zone_type >= const.TEXT_ZONE_WORD and options.icu is not None and parent is not None:
             # Do word segmentation by hand.
@@ -267,7 +274,7 @@ def process_zone(parent, zone, last, options):
 def process_page(page_text, options):
     result = process_zone(None, page_text, last=True, options=options)
     tree = etree.ElementTree(result)
-    tree.write(sys.stdout, encoding='UTF-8')
+    tree.write(sys.stdout.buffer, encoding='UTF-8')
 
 hocr_header_template = '''\
 <?xml version="1.0" encoding="UTF-8"?>
@@ -290,9 +297,9 @@ def process_page(page_text, options):
 </html>
 '''
 
-def main(argv=sys.argv):
+def main(argv=[os.fsencode(arg) for arg in sys.argv]):
     options = ArgumentParser().parse_args(argv[1:])
-    logger.info('Converting {path}:'.format(path=utils.smart_repr(options.path, system_encoding)))
+    logger.info('Converting {path}:'.format(path=options.path))
     if options.pages is None:
         djvused = ipc.Subprocess(
             ['djvused', '-e', 'n', os.path.abspath(options.path)],
@@ -302,9 +309,9 @@ def main(argv=sys.argv):
             n_pages = int(djvused.stdout.readline())
         finally:
             djvused.wait()
-        options.pages = xrange(1, n_pages + 1)
+        options.pages = range(1, n_pages + 1)
     page_iterator = iter(options.pages)
-    sed_script = temporary.file(suffix='.djvused')
+    sed_script = temporary.file(suffix='.djvused', mode='w+',encoding='UTF-8')
     for n in options.pages:
         print('select {0}; size; print-txt'.format(n), file=sed_script)
     sed_script.flush()
@@ -316,17 +323,17 @@ def main(argv=sys.argv):
     hocr_header = hocr_header_template.format(
         ocr_system=ocr_system,
         ocr_capabilities=' '.join(hocr.djvu2hocr_capabilities),
-        title=cgi.escape(options.title),
-        css=cgi.escape(options.css),
+        title=html.escape(options.title),
+        css=html.escape(options.css),
     )
     if not options.css:
         hocr_header = re.sub(hocr_header_style_re, '', hocr_header, count=1)
-    sys.stdout.write(hocr_header)
+    sys.stdout.buffer.write(hocr_header.encode('UTF-8'))
     for n in page_iterator:
         try:
             page_size = [
                 int(str(sexpr.Expression.from_stream(djvused.stdout).value).split('=')[1])
-                for i in xrange(2)
+                for i in range(2)
             ]
             options.page_bbox = text_zones.BBox(0, 0, page_size[0], page_size[1])
             page_text = sexpr.Expression.from_stream(djvused.stdout)
@@ -335,7 +342,7 @@ def main(argv=sys.argv):
         logger.info('- Page #{n}'.format(n=n))
         page_zone = Zone(page_text, page_size[1])
         process_page(page_zone, options)
-    sys.stdout.write(hocr_footer)
+    sys.stdout.buffer.write(hocr_footer.encode('UTF-8'))
     djvused.wait()
 
 # vim:ts=4 sts=4 sw=4 et
diff --git a/lib/cli/hocr2djvused.py b/lib/cli/hocr2djvused.py
@@ -13,6 +13,8 @@
 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 # for more details.
 
+from __future__ import unicode_literals
+from builtins import map
 import argparse
 import sys
 
@@ -36,7 +38,7 @@ def __init__(self):
         self.add_argument('--version', action=version.VersionAction)
         self.add_argument('--rotation', dest='rotation', action='store', type=int, default=0, help='page rotation (in degrees)')
         def size(s):
-            return map(int, s.split('x', 1))
+            return list(map(int, s.split('x', 1)))
         self.add_argument('--page-size', metavar='WxH', dest='page_size', action='store', type=size, default=None, help='page size (in pixels)')
         group = self.add_argument_group(title='word segmentation options')
         group.add_argument('-t', '--details', dest='details', choices=('lines', 'words', 'chars'), action='store', default='words', help='amount of text details to extract')

diff --git a/lib/cli/ocrodjvu.py b/lib/cli/ocrodjvu.py
@@ -14,7 +14,13 @@
 # for more details.
 
 from __future__ import print_function
+from __future__ import unicode_literals
 
+from future import standard_library
+standard_library.install_aliases()
+from builtins import str
+from builtins import range
+from builtins import object
 import argparse
 import contextlib
 import inspect
@@ -258,7 +264,7 @@ def __init__(self):
         self.add_argument('--list-engines', action=self.list_engines, nargs=0, help='print list of available OCR engines')
         self.add_argument('-l', '--language', dest='language', help='set recognition language')
         self.add_argument('--list-languages', action=self.list_languages, nargs=0, help='print list of available languages')
-        self.add_argument('--render', dest='render_layers', choices=self._render_map.keys(), action='store', default='mask', help='image layers to render')
+        self.add_argument('--render', dest='render_layers', choices=list(self._render_map.keys()), action='store', default='mask', help='image layers to render')
         def pages(x):
             return utils.parse_page_numbers(x)
         self.add_argument('-p', '--pages', dest='pages', action='store', default=None, type=pages, help='pages to process')
@@ -400,9 +406,9 @@ def init(self, options):
         bpp = 24 if self._options.render_layers != djvu.decode.RENDER_MASK_ONLY else 1
         self._image_format = self._options.engine.image_format(bpp)
 
-    def _temp_file(self, name, auto_remove=True):
+    def _temp_file(self, name, auto_remove=True, mode='w+', encoding=None):
         path = os.path.join(self._temp_dir, name)
-        file = open(path, 'w+b')
+        file = open(path, mode=mode, encoding=encoding)
         if not self._debug and auto_remove:
             file = temporary.wrapper(file, file.name)
         return file
@@ -417,7 +423,7 @@ def get_output_image(self, nth, page_job):
         file = self._temp_file('{n:06}.{ext}'.format(
             n=nth,
             ext=output_format.extension
-        ))
+        ),mode='wb',encoding=None)
         try:
             output_format.write_image(page_job, self._options.render_layers, file)
             file.flush()
@@ -510,7 +516,7 @@ def page_thread(self, pages, results, condition):
 
     def _process(self, path, pages=None):
         self._engine = self._options.engine
-        logger.info('Processing {path}:'.format(path=utils.smart_repr(path, system_encoding)))
+        logger.info('Processing {path}:'.format(path=path))
         document = self.new_document(djvu.decode.FileURI(path))
         document.decoding_job.wait()
         if pages is None:
@@ -524,7 +530,7 @@ def _process(self, path, pages=None):
         condition = threading.Condition()
         threads = [
             threading.Thread(target=self.page_thread, args=(pages, results, condition))
-            for i in xrange(njobs)
+            for i in range(njobs)
         ]
         def stop_threads():
             with condition:
@@ -540,7 +546,7 @@ def stop_threads():
                 sed_file.write('remove-txt\n')
             for page in pages:
                 try:
-                    file_id = page.file.id.encode(system_encoding)
+                    file_id = page.file.id
                 except UnicodeError:
                     pageno = page.n + 1
                     logger.warning('warning: cannot convert page {n} identifier to locale encoding'.format(n=pageno))

diff --git a/lib/engines/__init__.py b/lib/engines/__init__.py
@@ -13,6 +13,7 @@
 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 # for more details.
 
+from __future__ import unicode_literals
 import pkgutil
 
 def get_engines():

diff --git a/lib/engines/common.py b/lib/engines/common.py
@@ -13,6 +13,9 @@
 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 # for more details.
 
+from __future__ import unicode_literals
+from builtins import str
+from builtins import object
 from .. import utils
 from .. import image_io
 
@@ -33,7 +36,7 @@ def __init__(self, *args, **kwargs):
             raise TypeError('{tp}.name must be a string'.format(tp=tpname))  # no coverage
         if not issubclass(self.image_format, image_io.ImageFormat):
             raise TypeError('{tp}.image_format must be an ImageFormat subclass'.format(tp=tpname))  # no coverage
-        for key, value in kwargs.iteritems():
+        for key, value in kwargs.items():
             try:
                 prop = getattr(type(self), key)
                 if not isinstance(prop, utils.property):
@@ -63,6 +66,6 @@ def save(self, prefix):
             file.write(str(self))
 
     def as_stringio(self):
-        return io.BytesIO(str(self))
+        return io.StringIO(str(self))
 
 # vim:ts=4 sts=4 sw=4 et
diff --git a/lib/engines/cuneiform.py b/lib/engines/cuneiform.py
@@ -13,10 +13,14 @@
 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 # for more details.
 
+from __future__ import unicode_literals
 import os
 import re
 import shlex
 import warnings
+import locale
+import sys
+import codecs
 
 from . import common
 from .. import errors
@@ -62,6 +66,7 @@ def _get_languages(self):
             )
         except OSError:
             raise errors.UnknownLanguageList
+        cuneiform.stdout=codecs.getreader(sys.stdout.encoding or locale.getpreferredencoding())(cuneiform.stdout)
         self._cuneiform_to_iso = {}
         self._user_to_cuneiform = {}
         try:

diff --git a/lib/engines/dummy.py b/lib/engines/dummy.py
@@ -13,6 +13,7 @@
 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 # for more details.
 
+from __future__ import unicode_literals
 from . import common
 from .. import image_io
 from .. import text_zones

diff --git a/lib/engines/gocr.py b/lib/engines/gocr.py
@@ -14,7 +14,10 @@
 # for more details.
 
 from __future__ import division
+from __future__ import unicode_literals
 
+from builtins import map
+from builtins import object
 import functools
 import re
 import shlex