diff --git a/Makefile b/Makefile index 0c4c20d0..c5cebd30 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. -PYTHON = python +PYTHON = python3 PREFIX = /usr/local DESTDIR = diff --git a/djvu2hocr b/djvu2hocr index 9269c550..7f33fbfc 100755 --- a/djvu2hocr +++ b/djvu2hocr @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # encoding=UTF-8 # Copyright © 2009-2018 Jakub Wilk @@ -14,6 +14,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals import sys basedir = None diff --git a/doc/dependencies b/doc/dependencies index 3e95bc7f..9e114761 100644 --- a/doc/dependencies +++ b/doc/dependencies @@ -14,6 +14,8 @@ The following software is needed to run ocrodjvu: * python-djvulibre_ +* python-regex + * subprocess32_ * lxml_ ≥ 2.0 diff --git a/hocr2djvused b/hocr2djvused index 5c2e3983..a1e79fb0 100755 --- a/hocr2djvused +++ b/hocr2djvused @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # encoding=UTF-8 # Copyright © 2008-2018 Jakub Wilk @@ -14,6 +14,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals import sys basedir = None diff --git a/lib/__init__.py b/lib/__init__.py index 17781005..436e968e 100644 --- a/lib/__init__.py +++ b/lib/__init__.py @@ -1,8 +1,11 @@ +from __future__ import unicode_literals import sys if sys.version_info < (2, 7): # no coverage raise RuntimeError('Python 2.7 is required') -if sys.version_info >= (3, 0): # no coverage +elif sys.version_info >= (3, 3): # no coverage + pass +else: raise RuntimeError('Python 2.X is required') # vim:ts=4 sts=4 sw=4 et diff --git a/lib/cli/__init__.py b/lib/cli/__init__.py index 8eb353bf..9316921a 100644 --- a/lib/cli/__init__.py +++ b/lib/cli/__init__.py @@ -13,6 +13,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals from .. import errors from .. import utils diff --git a/lib/cli/djvu2hocr.py b/lib/cli/djvu2hocr.py index 721ae5b4..6690fd4c 100644 --- a/lib/cli/djvu2hocr.py +++ b/lib/cli/djvu2hocr.py @@ -14,9 +14,16 @@ # for more details. 
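
Nearly every module touched above gains from __future__ import unicode_literals, and lib/__init__.py now accepts CPython 2.7 as well as 3.3 or later. A minimal sketch, separate from the patch, of what the literals import changes on the 2.x side (both assertions already hold on Python 3):

    # encoding=UTF-8
    from __future__ import unicode_literals

    s = 'jeż'                    # a text (unicode) literal on 2.7 and 3.x alike
    assert isinstance(s, type(u''))
    b = b'raw bytes'             # byte strings still need an explicit b prefix
    assert isinstance(b, bytes)
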
from __future__ import print_function +from __future__ import division +from __future__ import unicode_literals +from builtins import str +from builtins import map +from builtins import range +from past.utils import old_div +from builtins import object import argparse -import cgi +import html import locale import os import re @@ -99,7 +106,7 @@ def text(self): raise TypeError('list of {0} (!= 6) elements'.format(len(self._sexpr))) # no coverage if not isinstance(self._sexpr[5], sexpr.StringExpression): raise TypeError('last element is not a string') # no coverage - return unicode(self._sexpr[5].value, 'UTF-8', 'replace') + return self._sexpr[5].value @property def children(self): @@ -153,9 +160,9 @@ def break_chars(char_zone_list, options): continue for i, char in enumerate(char_text): subbox = text_zones.BBox( - int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * i / len(char_text) + 0.5), + int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * i, len(char_text)) + 0.5), bbox.y0, - int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * (i + 1) / len(char_text) + 0.5), + int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * (i + 1), len(char_text)) + 0.5), bbox.y1, ) bbox_list += [subbox] @@ -172,7 +179,7 @@ def break_chars(char_zone_list, options): i = j continue bbox = text_zones.BBox() - for k in xrange(i, j): + for k in range(i, j): bbox.update(bbox_list[k]) element = etree.Element('span') element.set('class', 'ocrx_word') @@ -196,9 +203,9 @@ def break_plain_text(text, bbox, options): i = j continue subbox = text_zones.BBox( - int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * i / len(text) + 0.5), + int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * i, len(text)) + 0.5), bbox.y0, - int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * j / len(text) + 0.5), + int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * j, len(text)) + 0.5), bbox.y1, ) element = etree.Element('span') @@ -244,7 +251,7 @@ def process_zone(parent, zone, last, options): if child is not None and zone_type == const.TEXT_ZONE_WORD and not last: child.tail = ' ' self = None - elif isinstance(child_zone, unicode): + elif isinstance(child_zone, str): text = child_zone if zone_type >= const.TEXT_ZONE_WORD and options.icu is not None and parent is not None: # Do word segmentation by hand. 
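
break_chars() and break_plain_text() above route their bounding-box interpolation through past.utils.old_div from the future package that requirements.txt introduces. Because the numerators are scaled by 1.0 and therefore floats, old_div behaves exactly like a plain /; a short sketch of its semantics:

    from __future__ import division
    from past.utils import old_div

    assert old_div(7, 2) == 3        # two ints: floor division, as on Python 2
    assert old_div(7.0, 2) == 3.5    # any float operand: true division
    assert 7 / 2 == 3.5              # plain '/' under the new division semantics
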
@@ -267,7 +274,7 @@ def process_zone(parent, zone, last, options): def process_page(page_text, options): result = process_zone(None, page_text, last=True, options=options) tree = etree.ElementTree(result) - tree.write(sys.stdout, encoding='UTF-8') + tree.write(sys.stdout.buffer) hocr_header_template = '''\ @@ -290,9 +297,9 @@ def process_page(page_text, options): ''' -def main(argv=sys.argv): +def main(argv=[os.fsencode(arg) for arg in sys.argv]): options = ArgumentParser().parse_args(argv[1:]) - logger.info('Converting {path}:'.format(path=utils.smart_repr(options.path, system_encoding))) + logger.info('Converting {path}:'.format(path=options.path)) if options.pages is None: djvused = ipc.Subprocess( ['djvused', '-e', 'n', os.path.abspath(options.path)], @@ -302,9 +309,9 @@ def main(argv=sys.argv): n_pages = int(djvused.stdout.readline()) finally: djvused.wait() - options.pages = xrange(1, n_pages + 1) + options.pages = range(1, n_pages + 1) page_iterator = iter(options.pages) - sed_script = temporary.file(suffix='.djvused') + sed_script = temporary.file(suffix='.djvused', mode='w+',encoding='UTF-8') for n in options.pages: print('select {0}; size; print-txt'.format(n), file=sed_script) sed_script.flush() @@ -316,17 +323,17 @@ def main(argv=sys.argv): hocr_header = hocr_header_template.format( ocr_system=ocr_system, ocr_capabilities=' '.join(hocr.djvu2hocr_capabilities), - title=cgi.escape(options.title), - css=cgi.escape(options.css), + title=html.escape(options.title), + css=html.escape(options.css), ) if not options.css: hocr_header = re.sub(hocr_header_style_re, '', hocr_header, count=1) - sys.stdout.write(hocr_header) + sys.stdout.buffer.write(hocr_header.encode('UTF-8')) for n in page_iterator: try: page_size = [ int(str(sexpr.Expression.from_stream(djvused.stdout).value).split('=')[1]) - for i in xrange(2) + for i in range(2) ] options.page_bbox = text_zones.BBox(0, 0, page_size[0], page_size[1]) page_text = sexpr.Expression.from_stream(djvused.stdout) @@ -335,7 +342,7 @@ def main(argv=sys.argv): logger.info('- Page #{n}'.format(n=n)) page_zone = Zone(page_text, page_size[1]) process_page(page_zone, options) - sys.stdout.write(hocr_footer) + sys.stdout.buffer.write(hocr_footer.encode('UTF-8')) djvused.wait() # vim:ts=4 sts=4 sw=4 et diff --git a/lib/cli/hocr2djvused.py b/lib/cli/hocr2djvused.py index fb015d61..df686055 100644 --- a/lib/cli/hocr2djvused.py +++ b/lib/cli/hocr2djvused.py @@ -13,6 +13,8 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals +from builtins import map import argparse import sys @@ -36,7 +38,7 @@ def __init__(self): self.add_argument('--version', action=version.VersionAction) self.add_argument('--rotation', dest='rotation', action='store', type=int, default=0, help='page rotation (in degrees)') def size(s): - return map(int, s.split('x', 1)) + return list(map(int, s.split('x', 1))) self.add_argument('--page-size', metavar='WxH', dest='page_size', action='store', type=size, default=None, help='page size (in pixels)') group = self.add_argument_group(title='word segmentation options') group.add_argument('-t', '--details', dest='details', choices=('lines', 'words', 'chars'), action='store', default='words', help='amount of text details to extract') diff --git a/lib/cli/ocrodjvu.py b/lib/cli/ocrodjvu.py index 827114a8..f9f00124 100644 --- a/lib/cli/ocrodjvu.py +++ b/lib/cli/ocrodjvu.py @@ -14,7 +14,13 @@ # for more details. 
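
The hOCR header is now escaped with html.escape() instead of the removed cgi.escape(). One behavioural difference matters for the title and css attribute values: html.escape() escapes double quotes by default, cgi.escape() did not unless quote=True was passed. A quick illustration:

    import html

    s = 'Alice "in" <Wonderland> & friends'
    assert html.escape(s) == 'Alice &quot;in&quot; &lt;Wonderland&gt; &amp; friends'
    # cgi.escape(s) used to return 'Alice "in" &lt;Wonderland&gt; &amp; friends';
    # html.escape(s, quote=False) reproduces that older behaviour.
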
from __future__ import print_function +from __future__ import unicode_literals +from future import standard_library +standard_library.install_aliases() +from builtins import str +from builtins import range +from builtins import object import argparse import contextlib import inspect @@ -258,7 +264,7 @@ def __init__(self): self.add_argument('--list-engines', action=self.list_engines, nargs=0, help='print list of available OCR engines') self.add_argument('-l', '--language', dest='language', help='set recognition language') self.add_argument('--list-languages', action=self.list_languages, nargs=0, help='print list of available languages') - self.add_argument('--render', dest='render_layers', choices=self._render_map.keys(), action='store', default='mask', help='image layers to render') + self.add_argument('--render', dest='render_layers', choices=list(self._render_map.keys()), action='store', default='mask', help='image layers to render') def pages(x): return utils.parse_page_numbers(x) self.add_argument('-p', '--pages', dest='pages', action='store', default=None, type=pages, help='pages to process') @@ -400,9 +406,9 @@ def init(self, options): bpp = 24 if self._options.render_layers != djvu.decode.RENDER_MASK_ONLY else 1 self._image_format = self._options.engine.image_format(bpp) - def _temp_file(self, name, auto_remove=True): + def _temp_file(self, name, mode='w+', encoding=locale.getpreferredencoding(),auto_remove=True): path = os.path.join(self._temp_dir, name) - file = open(path, 'w+b') + file = open(path,mode=mode,encoding=encoding) if not self._debug and auto_remove: file = temporary.wrapper(file, file.name) return file @@ -417,7 +423,7 @@ def get_output_image(self, nth, page_job): file = self._temp_file('{n:06}.{ext}'.format( n=nth, ext=output_format.extension - )) + ),mode='wb',encoding=None) try: output_format.write_image(page_job, self._options.render_layers, file) file.flush() @@ -510,7 +516,7 @@ def page_thread(self, pages, results, condition): def _process(self, path, pages=None): self._engine = self._options.engine - logger.info('Processing {path}:'.format(path=utils.smart_repr(path, system_encoding))) + logger.info('Processing {path}:'.format(path=path)) document = self.new_document(djvu.decode.FileURI(path)) document.decoding_job.wait() if pages is None: @@ -524,7 +530,7 @@ def _process(self, path, pages=None): condition = threading.Condition() threads = [ threading.Thread(target=self.page_thread, args=(pages, results, condition)) - for i in xrange(njobs) + for i in range(njobs) ] def stop_threads(): with condition: @@ -540,7 +546,7 @@ def stop_threads(): sed_file.write('remove-txt\n') for page in pages: try: - file_id = page.file.id.encode(system_encoding) + file_id = page.file.id except UnicodeError: pageno = page.n + 1 logger.warning('warning: cannot convert page {n} identifier to locale encoding'.format(n=pageno)) diff --git a/lib/engines/__init__.py b/lib/engines/__init__.py index 43e575a3..07fd4a32 100644 --- a/lib/engines/__init__.py +++ b/lib/engines/__init__.py @@ -13,6 +13,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals import pkgutil def get_engines(): diff --git a/lib/engines/common.py b/lib/engines/common.py index 9f5d9265..bcf8af35 100644 --- a/lib/engines/common.py +++ b/lib/engines/common.py @@ -13,6 +13,9 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. 
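
_temp_file() above switches from 'w+b' to text mode with an explicit encoding, while get_output_image() opts back into binary with mode='wb', encoding=None for the rendered page images. A sketch of that split with an illustrative helper name that is not part of the patch; note also that the encoding=locale.getpreferredencoding() default in the patch is evaluated once, when the method is defined:

    import locale

    def open_temp(path, binary=False):
        if binary:
            # raw PBM/PPM/BMP data handed to the OCR engine: no codec involved
            return open(path, 'w+b')
        # sed scripts, hOCR and plain-text output: locale-encoded text mode
        return open(path, 'w+', encoding=locale.getpreferredencoding())
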
+from __future__ import unicode_literals +from builtins import str +from builtins import object from .. import utils from .. import image_io @@ -33,7 +36,7 @@ def __init__(self, *args, **kwargs): raise TypeError('{tp}.name must be a string'.format(tp=tpname)) # no coverage if not issubclass(self.image_format, image_io.ImageFormat): raise TypeError('{tp}.image_format must be an ImageFormat subclass'.format(tp=tpname)) # no coverage - for key, value in kwargs.iteritems(): + for key, value in kwargs.items(): try: prop = getattr(type(self), key) if not isinstance(prop, utils.property): @@ -63,6 +66,6 @@ def save(self, prefix): file.write(str(self)) def as_stringio(self): - return io.BytesIO(str(self)) + return io.StringIO(str(self)) # vim:ts=4 sts=4 sw=4 et diff --git a/lib/engines/cuneiform.py b/lib/engines/cuneiform.py index 3f587edf..a9869a5c 100644 --- a/lib/engines/cuneiform.py +++ b/lib/engines/cuneiform.py @@ -13,10 +13,14 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals import os import re import shlex import warnings +import locale +import sys +import codecs from . import common from .. import errors @@ -62,6 +66,7 @@ def _get_languages(self): ) except OSError: raise errors.UnknownLanguageList + cuneiform.stdout=codecs.getreader(sys.stdout.encoding or locale.getpreferredencoding())(cuneiform.stdout) self._cuneiform_to_iso = {} self._user_to_cuneiform = {} try: diff --git a/lib/engines/dummy.py b/lib/engines/dummy.py index 5bf20e3d..721a33e5 100644 --- a/lib/engines/dummy.py +++ b/lib/engines/dummy.py @@ -13,6 +13,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals from . import common from .. import image_io from .. import text_zones diff --git a/lib/engines/gocr.py b/lib/engines/gocr.py index 008be82a..7b27e846 100644 --- a/lib/engines/gocr.py +++ b/lib/engines/gocr.py @@ -14,7 +14,10 @@ # for more details. from __future__ import division +from __future__ import unicode_literals +from builtins import map +from builtins import object import functools import re import shlex diff --git a/lib/engines/ocrad.py b/lib/engines/ocrad.py index f58b746f..a5dc6cdd 100644 --- a/lib/engines/ocrad.py +++ b/lib/engines/ocrad.py @@ -13,6 +13,10 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. 
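
The new cuneiform.stdout wrapper (and the matching tesseract wrappers later in the patch) exists because subprocess pipes yield bytes on Python 3 while the language-list parser expects str lines. The general pattern, sketched with the plain subprocess module:

    import codecs
    import locale
    import subprocess
    import sys

    child = subprocess.Popen(['echo', 'spam'], stdout=subprocess.PIPE)
    decode = codecs.getreader(sys.stdout.encoding or locale.getpreferredencoding())
    child.stdout = decode(child.stdout)
    for line in child.stdout:        # str lines instead of bytes
        print(line.rstrip())
    child.wait()

Passing universal_newlines=True (or text=True on newer interpreters) to the Popen call would be the standard-library alternative; wrapping after the fact keeps the ipc.Subprocess call sites untouched.
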
+from __future__ import unicode_literals +from builtins import map +from builtins import range +from builtins import object import functools import re import shlex @@ -56,23 +60,23 @@ def scan(stream, settings): [n] = line.split()[3:] n = int(n) bbox = text_zones.BBox(*((0, 0) + settings.page_size)) - children = filter(None, (scan(stream, settings) for i in xrange(n))) + children = [_f for _f in (scan(stream, settings) for i in range(n)) if _f] zone = text_zones.Zone(const.TEXT_ZONE_PAGE, bbox, children) zone.rotate(settings.rotation) return zone if line.startswith('text block '): - n, x, y, w, h = map(int, line.split()[2:]) + n, x, y, w, h = list(map(int, line.split()[2:])) bbox = text_zones.BBox(x, y, x + w, y + h) - [children] = [scan(stream, settings) for i in xrange(n)] + [children] = [scan(stream, settings) for i in range(n)] return text_zones.Zone(const.TEXT_ZONE_REGION, bbox, children) if line.startswith('lines '): [n] = line.split()[1:] n = int(n) - return filter(None, (scan(stream, settings) for i in xrange(n))) + return [_f for _f in (scan(stream, settings) for i in range(n)) if _f] if line.startswith('line '): _, _, _, n, _, _ = line.split() n = int(n) - children = filter(None, (scan(stream, settings) for i in xrange(n))) + children = [_f for _f in (scan(stream, settings) for i in range(n)) if _f] if not children: return None bbox = text_zones.BBox() @@ -83,7 +87,7 @@ def scan(stream, settings): line = line.lstrip() if line[0].isdigit(): coords, line = line.split('; ', 1) - x, y, w, h = map(int, coords.split()) + x, y, w, h = list(map(int, coords.split())) bbox = text_zones.BBox(x, y, x + w, y + h) if line[0] == '0': # No interpretations have been proposed for this particular character. diff --git a/lib/engines/ocropus.py b/lib/engines/ocropus.py index 9bf401d6..9b674d09 100644 --- a/lib/engines/ocropus.py +++ b/lib/engines/ocropus.py @@ -13,6 +13,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals import shlex from . import common diff --git a/lib/engines/tesseract.py b/lib/engines/tesseract.py index 1eb4b52d..bd9c70d7 100644 --- a/lib/engines/tesseract.py +++ b/lib/engines/tesseract.py @@ -14,7 +14,9 @@ # for more details. from __future__ import print_function +from __future__ import unicode_literals +from builtins import object import cgi import glob import os @@ -22,6 +24,8 @@ import shlex import sys import warnings +import locale +import codecs from . import common from .. import errors @@ -50,10 +54,12 @@ ''' def _filter_boring_stderr(stderr): - if stderr and stderr[0].startswith('Tesseract Open Source OCR Engine'): - # Tesseract prints its own name on standard error - # even if nothing went wrong. + if not stderr: + return + if re.match('\ATesseract Open Source OCR Engine',stderr[0]): del stderr[0] + # Tesseract prints its own name on standard error + # even if nothing went wrong if stderr and stderr[0] == 'Page 1': # We also don't want page numbers, # because we always pass just a single page to Tesseract. 
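
The ocrad scanner above replaces filter(None, ...) with list comprehensions and wraps map(int, ...) in list(), which preserves the Python 2 behaviour: on Python 3 both built-ins return single-use iterators with no len() or indexing. A minimal demonstration:

    coords = map(int, '10 20 30 40'.split())
    assert list(coords) == [10, 20, 30, 40]
    assert list(coords) == []            # the iterator is already exhausted

    coords = list(map(int, '10 20 30 40'.split()))
    x, y, w, h = coords                  # safe to unpack, index and reuse
    assert (w, h) == (30, 40)
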
@@ -139,6 +145,8 @@ def get_filesystem_info(self): ) except OSError: raise errors.UnknownLanguageList + tesseract.stdout=codecs.getreader(sys.stdout.encoding or locale.getpreferredencoding())(tesseract.stdout) + tesseract.stderr=codecs.getreader(sys.stdout.encoding or locale.getpreferredencoding())(tesseract.stderr) try: stderr = tesseract.stderr.read() match = _error_pattern.search(stderr) @@ -211,6 +219,7 @@ def recognize_plain_text(self, image, language, details=None, uax29=None): stdout=ipc.DEVNULL, stderr=ipc.PIPE, ) + worker.stderr=codecs.getreader(sys.stderr.encoding or locale.getpreferredencoding())(worker.stderr) _wait_for_worker(worker) with open(os.path.join(output_dir, 'tmp.txt'), 'rt') as file: return common.Output( @@ -244,6 +253,7 @@ def recognize_hocr(self, image, language, details=text_zones.TEXT_DETAILS_WORD, stdout=ipc.DEVNULL, stderr=ipc.PIPE, ) + worker.stderr=codecs.getreader(sys.stderr.encoding or locale.getpreferredencoding())(worker.stderr) _wait_for_worker(worker) hocr_path = os.path.join(output_dir, 'tmp.hocr') if not os.path.exists(hocr_path): @@ -263,6 +273,7 @@ def recognize_hocr(self, image, language, details=text_zones.TEXT_DETAILS_WORD, stdout=ipc.DEVNULL, stderr=ipc.PIPE, ) + worker.stderr=codecs.getreader(sys.stderr.encoding or locale.getpreferredencoding())(worker.stderr) _wait_for_worker(worker) with open(box_path, 'r') as box_file: contents = contents.replace( diff --git a/lib/errors.py b/lib/errors.py index 2c7a05ea..9b278af7 100644 --- a/lib/errors.py +++ b/lib/errors.py @@ -14,6 +14,7 @@ # for more details. from __future__ import print_function +from __future__ import unicode_literals import argparse import sys diff --git a/lib/hocr.py b/lib/hocr.py index 19e34bba..a399c590 100644 --- a/lib/hocr.py +++ b/lib/hocr.py @@ -19,7 +19,13 @@ The hOCR format specification: http://kba.github.io/hocr-spec/1.2/ ''' +from __future__ import unicode_literals +from builtins import map +from builtins import zip +from builtins import range +from past.builtins import basestring +from builtins import object import functools import re @@ -68,7 +74,7 @@ const.TEXT_ZONE_LINE: ('span', 'ocrx_line'), const.TEXT_ZONE_WORD: ('span', 'ocrx_word'), } -djvu2hocr_capabilities = list(sorted(cls for tag, cls in _djvu_zone_to_hocr.itervalues())) +djvu2hocr_capabilities = list(sorted(cls for tag, cls in _djvu_zone_to_hocr.values())) djvu_zone_to_hocr = _djvu_zone_to_hocr.__getitem__ del _djvu_zone_to_hocr @@ -127,7 +133,7 @@ def _apply_bboxes(djvu_class, bbox_source, text, settings, page_size): if not m: return [text] coordinates = (int(x) for x in m.group(1).replace(',', ' ').split()) - coordinates = zip(coordinates, coordinates, coordinates, coordinates) + coordinates = list(zip(coordinates, coordinates, coordinates, coordinates)) else: # bboxes from an iterator coordinates = [] @@ -164,7 +170,7 @@ def _apply_bboxes(djvu_class, bbox_source, text, settings, page_size): i = j continue bbox = text_zones.BBox() - for k in xrange(i, j): + for k in range(i, j): if settings.cuneiform and coordinates[k] == (-1, -1, -1, -1): raise errors.MalformedHocr("missing bbox for non-whitespace character") bbox.update(text_zones.BBox(*coordinates[k])) @@ -175,7 +181,7 @@ def _apply_bboxes(djvu_class, bbox_source, text, settings, page_size): else: last_word += [ text_zones.Zone(type=const.TEXT_ZONE_CHARACTER, bbox=(x0, y0, x1, y1), children=[ch]) - for k in xrange(i, j) + for k in range(i, j) for (x0, y0, x1, y1), ch in [(coordinates[k], text[k])] ] i = j @@ -407,7 +413,7 @@ def 
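
In hocr.py, _djvu_zone_to_hocr.itervalues() becomes .values(). On Python 3 this returns a view rather than a list, which is safe here because the result is only iterated once before being sorted; a sketch with an abbreviated mapping:

    mapping = {
        'page': ('div', 'ocr_page'),
        'word': ('span', 'ocrx_word'),
    }
    capabilities = list(sorted(cls for tag, cls in mapping.values()))
    assert capabilities == ['ocr_page', 'ocrx_word']
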
extract_tesseract_bbox_data(node): if not line or line.startswith('//'): continue chars, x0, y0, x1, y1, w = line.split() - x0, y0, x1, y1 = map(int, (x0, y0, x1, y1)) + x0, y0, x1, y1 = list(map(int, (x0, y0, x1, y1))) if chars == '~': chars = [None] w = x1 - x0 @@ -431,7 +437,7 @@ def read_document(stream, settings): # # FIXME: This work-around is ugly and should be dropped at some point. contents = stream.read() - contents = utils.sanitize_utf8(contents) + contents = utils.sanitize_utf8(contents.encode('UTF-8')) if settings.html5: return html5_support.parse(contents) else: diff --git a/lib/html5_support.py b/lib/html5_support.py index cc955aaf..d9a6ae7a 100644 --- a/lib/html5_support.py +++ b/lib/html5_support.py @@ -13,6 +13,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals from . import utils def parse(stream): diff --git a/lib/image_io.py b/lib/image_io.py index 4356e0aa..e95b3325 100644 --- a/lib/image_io.py +++ b/lib/image_io.py @@ -13,6 +13,8 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals +from builtins import object import struct from . import utils @@ -72,9 +74,9 @@ def write_image(self, page_job, render_layers, file): size = page_job.size rect = (0, 0) + size if self._pixel_format.bpp == 1: - file.write('P4 {0} {1}\n'.format(*size)) # PBM header + file.write('P4 {0} {1}\n'.format(*size).encode('ASCII')) # PBM header else: - file.write('P6 {0} {1} 255\n'.format(*size)) # PPM header + file.write('P6 {0} {1} 255\n'.format(*size).encode('ASCII')) # PPM header data = page_job.render( render_layers, rect, rect, @@ -111,7 +113,7 @@ def write_image(self, page_job, render_layers, file): n_palette_colors = 2 * (self._pixel_format.bpp == 1) headers_size = 54 + 4 * n_palette_colors file.write(struct.pack(' @@ -14,6 +14,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals import sys basedir = None diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..3c3314f3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +future +lxml +python-djvulibre +regex diff --git a/tests/djvu2hocr/test.py b/tests/djvu2hocr/test.py index 16d109b1..76351eba 100644 --- a/tests/djvu2hocr/test.py +++ b/tests/djvu2hocr/test.py @@ -13,11 +13,13 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. 
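
image_io.py now encodes the PBM/PPM headers to ASCII before writing, because the target file is binary and the pixel data that follows is raw bytes. A compact sketch of the same layout using an in-memory buffer:

    import io

    width, height = 16, 8
    fp = io.BytesIO()                            # stands in for the binary temp file
    fp.write('P4 {0} {1}\n'.format(width, height).encode('ASCII'))
    fp.write(b'\x00' * (width // 8) * height)    # 1 bpp rows, 8 pixels packed per byte
    assert fp.getvalue().startswith(b'P4 16 8\n')

The BMP branch builds its headers with struct.pack(), which already produces bytes, so only the text headers needed an explicit encode.
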
+from __future__ import unicode_literals import io import os import shlex import shutil import sys +import codecs from lib import ipc from lib import errors @@ -39,8 +41,8 @@ here = os.path.relpath(here) def test_help(): - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with interim(sys, stdout=stdout, stderr=stderr): rc = try_run(djvu2hocr.main, ['', '--help']) assert_equal(stderr.getvalue(), '') @@ -48,8 +50,8 @@ def test_help(): assert_not_equal(stdout.getvalue(), '') def test_bad_options(): - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with interim(sys, stdout=stdout, stderr=stderr): rc = try_run(djvu2hocr.main, ['']) assert_equal(rc, errors.EXIT_FATAL) @@ -58,8 +60,8 @@ def test_bad_options(): def test_version(): # https://bugs.debian.org/573496 - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with interim(sys, stdout=stdout, stderr=stderr): rc = try_run(djvu2hocr.main, ['', '--version']) assert_equal(stderr.getvalue(), '') @@ -70,7 +72,7 @@ def _test_from_file(base_filename, index): base_filename = os.path.join(here, base_filename) test_filename = '{base}.test{i}'.format(base=base_filename, i=index) djvused_filename = base_filename + '.djvused' - with open(test_filename, 'rb') as file: + with open(test_filename, 'r') as file: commandline = file.readline() expected_output = file.read() args = shlex.split(commandline) @@ -83,7 +85,7 @@ def _test_from_file(base_filename, index): djvu_filename) ipc.Subprocess(['djvused', '-f', djvused_filename, '-s', djvu_filename]).wait() xml_filename = os.path.join(tmpdir, 'output.html') - with open(xml_filename, 'w+b') as xml_file: + with open(xml_filename, 'w+') as xml_file: xmllint = ipc.Subprocess(['xmllint', '--format', '-'], stdin=ipc.PIPE, stdout=xml_file) try: with open(os.devnull, 'w') as null: @@ -116,7 +118,7 @@ def test_nonascii_path(): here = os.path.abspath(here) path = os.path.join(here, '..', 'data', 'empty.djvu') stdout = io.BytesIO() - stderr = io.BytesIO() + stderr = io.StringIO() with temporary.directory() as tmpdir: tmp_path = os.path.join(tmpdir, 'тмп.djvu') os.symlink(path, tmp_path) diff --git a/tests/engines/test_cuneiform.py b/tests/engines/test_cuneiform.py index f7838814..ae01f71e 100644 --- a/tests/engines/test_cuneiform.py +++ b/tests/engines/test_cuneiform.py @@ -13,6 +13,8 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals +from builtins import object import os import sys @@ -31,7 +33,7 @@ here = os.path.dirname(__file__) here = os.path.relpath(here) -class test_cuneiform(): +class test_cuneiform(object): existing_languages = [ ('eng', 'eng'), diff --git a/tests/hocr2djvused/test.py b/tests/hocr2djvused/test.py index 8b617c3a..0a66652a 100644 --- a/tests/hocr2djvused/test.py +++ b/tests/hocr2djvused/test.py @@ -13,6 +13,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. 
+from __future__ import unicode_literals import contextlib import io import os @@ -38,8 +39,8 @@ here = os.path.relpath(here) def test_help(): - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with interim(sys, stdout=stdout, stderr=stderr): rc = try_run(hocr2djvused.main, ['', '--help']) assert_equal(stderr.getvalue(), '') @@ -48,8 +49,8 @@ def test_help(): def test_version(): # https://bugs.debian.org/573496 - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with interim(sys, stdout=stdout, stderr=stderr): rc = try_run(hocr2djvused.main, ['', '--version']) assert_equal(stderr.getvalue(), '') @@ -57,8 +58,8 @@ def test_version(): assert_not_equal(stdout.getvalue(), '') def test_bad_options(): - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with interim(sys, stdout=stdout, stderr=stderr): rc = try_run(hocr2djvused.main, ['', '--bad-option']) assert_equal(rc, errors.EXIT_FATAL) @@ -76,13 +77,13 @@ def _test_from_file(base_filename, index, extra_args): base_filename = os.path.join(here, base_filename) test_filename = '{base}.test{i}'.format(base=base_filename, i=index) html_filename = '{base}.html'.format(base=base_filename) - with open(test_filename, 'rb') as file: + with open(test_filename, 'r') as file: commandline = file.readline() expected_output = file.read() args = shlex.split(commandline) + shlex.split(extra_args) assert_equal(args[0], '#') - with contextlib.closing(io.BytesIO()) as output_file: - with open(html_filename, 'rb') as html_file: + with contextlib.closing(io.StringIO()) as output_file: + with open(html_filename, 'r') as html_file: with interim(sys, stdin=html_file, stdout=output_file): rc = try_run(hocr2djvused.main, args) assert_equal(rc, 0) @@ -99,8 +100,8 @@ def _rough_test_from_file(base_filename, args): args += ['--page-size=1000x1000'] base_filename = os.path.join(here, base_filename) html_filename = '{base}.html'.format(base=base_filename) - with contextlib.closing(io.BytesIO()) as output_file: - with open(html_filename, 'rb') as html_file: + with contextlib.closing(io.StringIO()) as output_file: + with open(html_filename, 'r') as html_file: with interim(sys, stdin=html_file, stdout=output_file): rc = try_run(hocr2djvused.main, args) assert_equal(rc, 0) diff --git a/tests/image_io/test.py b/tests/image_io/test.py index af393c14..c79aa204 100644 --- a/tests/image_io/test.py +++ b/tests/image_io/test.py @@ -13,6 +13,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals import io import os diff --git a/tests/ocrodjvu/test.py b/tests/ocrodjvu/test.py index 432303a6..f8308860 100644 --- a/tests/ocrodjvu/test.py +++ b/tests/ocrodjvu/test.py @@ -13,6 +13,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. 
+from __future__ import unicode_literals import io import os import shutil @@ -35,8 +36,8 @@ engines = None def test_help(): - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with interim(sys, stdout=stdout, stderr=stderr): rc = try_run(ocrodjvu.main, ['', '--help']) assert_equal(stderr.getvalue(), '') @@ -45,8 +46,8 @@ def test_help(): def test_version(): # https://bugs.debian.org/573496 - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with interim(sys, stdout=stdout, stderr=stderr): rc = try_run(ocrodjvu.main, ['', '--version']) assert_equal(rc, 0) @@ -54,8 +55,8 @@ def test_version(): assert_not_equal(stdout.getvalue(), '') def test_bad_options(): - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with interim(sys, stdout=stdout, stderr=stderr): rc = try_run(ocrodjvu.main, ['']) assert_equal(rc, errors.EXIT_FATAL) @@ -64,8 +65,8 @@ def test_bad_options(): def test_list_engines(): global engines - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with interim(sys, stdout=stdout, stderr=stderr): rc = try_run(ocrodjvu.main, ['', '--list-engines']) assert_equal(stderr.getvalue(), '') @@ -73,8 +74,8 @@ def test_list_engines(): engines = stdout.getvalue().splitlines() def _test_list_languages(engine): - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with interim(sys, stdout=stdout, stderr=stderr): rc = try_run(ocrodjvu.main, ['', '--engine', engine, '--list-languages']) assert_equal(stderr.getvalue(), '') @@ -92,8 +93,8 @@ def test_nonascii_path(): here = os.path.dirname(__file__) here = os.path.abspath(here) path = os.path.join(here, '..', 'data', 'empty.djvu') - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with temporary.directory() as tmpdir: tmp_path = os.path.join(tmpdir, 'тмп.djvu') shutil.copy(path, tmp_path) @@ -108,8 +109,8 @@ def test_bad_page_id(): here = os.path.dirname(__file__) here = os.path.abspath(here) path = os.path.join(here, '..', 'data', 'bad-page-id.djvu') - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with temporary.directory() as tmpdir: out_path = os.path.join(tmpdir, 'tmp.djvu') with interim(sys, stdout=stdout, stderr=stderr): diff --git a/tests/ocrodjvu/test_integration.py b/tests/ocrodjvu/test_integration.py index bd258874..adf59d88 100644 --- a/tests/ocrodjvu/test_integration.py +++ b/tests/ocrodjvu/test_integration.py @@ -13,6 +13,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals import distutils.spawn import io import os @@ -39,8 +40,8 @@ def _test_ocr(engine, layers): here = os.path.dirname(__file__) here = os.path.abspath(here) path = os.path.join(here, '..', 'data', 'alice.djvu') - stdout = io.BytesIO() - stderr = io.BytesIO() + stdout = io.StringIO() + stderr = io.StringIO() with temporary.directory() as tmpdir: tmp_path = os.path.join(tmpdir, 'tmp.djvu') with interim(sys, stdout=stdout, stderr=stderr): diff --git a/tests/test_ipc.py b/tests/test_ipc.py index c683654a..18bc4dd6 100644 --- a/tests/test_ipc.py +++ b/tests/test_ipc.py @@ -14,11 +14,17 @@ # for more details. 
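
The tests consistently swap io.BytesIO for io.StringIO when capturing sys.stdout and sys.stderr, since print() and the argparse help writer emit str on Python 3. The suite does this through its own interim() helper; the standard-library equivalent of the pattern is:

    import contextlib
    import io

    stdout = io.StringIO()
    with contextlib.redirect_stdout(stdout):
        print('--help output would land here')
    assert stdout.getvalue() == '--help output would land here\n'
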
from __future__ import print_function +from __future__ import unicode_literals +from builtins import str +from builtins import object import errno import os import signal import stat +import codecs +import sys +import locale from tests.tools import ( assert_equal, @@ -31,7 +37,7 @@ from lib import ipc from lib import temporary -class test_exceptions(): +class test_exceptions(object): def test_sigint(self): ex = ipc.CalledProcessInterrupted(signal.SIGINT, 'eggs') @@ -66,7 +72,7 @@ def test_init_exc(): ) assert_equal(str(ecm.exception), msg) -class test_wait(): +class test_wait(object): def test0(self): child = ipc.Subprocess(['true']) @@ -92,7 +98,7 @@ def test_wait_signal(self): for name in 'SIGINT', 'SIGABRT', 'SIGSEGV': yield self._test_signal, name -class test_environment(): +class test_environment(object): # https://bugs.debian.org/594385 @@ -103,8 +109,8 @@ def test1(self): stdout=ipc.PIPE, stderr=ipc.PIPE, ) stdout, stderr = child.communicate() - assert_equal(stdout, '42') - assert_equal(stderr, '') + assert_equal(stdout, b'42') + assert_equal(stderr, b'') def test2(self): with interim_environ(ocrodjvu='42'): @@ -114,8 +120,8 @@ def test2(self): env={}, ) stdout, stderr = child.communicate() - assert_equal(stdout, '42') - assert_equal(stderr, '') + assert_equal(stdout, b'42') + assert_equal(stderr, b'') def test3(self): with interim_environ(ocrodjvu='42'): @@ -125,8 +131,8 @@ def test3(self): env=dict(ocrodjvu='24'), ) stdout, stderr = child.communicate() - assert_equal(stdout, '24') - assert_equal(stderr, '') + assert_equal(stdout, b'24') + assert_equal(stderr, b'') def test_path(self): path = os.getenv('PATH').split(':') @@ -144,8 +150,8 @@ def test_path(self): stdout=ipc.PIPE, stderr=ipc.PIPE, ) stdout, stderr = child.communicate() - assert_equal(stdout, '42') - assert_equal(stderr, '') + assert_equal(stdout, b'42') + assert_equal(stderr, b'') def _test_locale(self): child = ipc.Subprocess(['locale'], @@ -155,22 +161,22 @@ def _test_locale(self): stdout = stdout.splitlines() stderr = stderr.splitlines() assert_equal(stderr, []) - data = dict(line.split('=', 1) for line in stdout) + data = dict(line.split(b'=', 1) for line in stdout) has_lc_all = has_lc_ctype = has_lang = 0 - for key, value in data.iteritems(): - if key == 'LC_ALL': + for key, value in data.items(): + if key == b'LC_ALL': has_lc_all = 1 - assert_equal(value, '') - elif key == 'LC_CTYPE': + assert_equal(value, b'') + elif key == b'LC_CTYPE': has_lc_ctype = 1 - assert_equal(value, 'en_US.UTF-8') - elif key == 'LANG': + assert_equal(value, b'en_US.UTF-8') + elif key == b'LANG': has_lang = 1 - assert_equal(value, '') - elif key == 'LANGUAGE': - assert_equal(value, '') + assert_equal(value, b'') + elif key == b'LANGUAGE': + assert_equal(value, b'') else: - assert_equal(value, '"POSIX"') + assert_equal(value, b'"POSIX"') assert_true(has_lc_all) assert_true(has_lc_ctype) assert_true(has_lang) @@ -187,7 +193,7 @@ def test_locale_lang(self): with interim_environ(LC_ALL=None, LC_CTYPE=None, LANG='en_US.UTF-8'): self._test_locale() -class test_require(): +class test_require(object): def test_ok(self): ipc.require('cat') @@ -203,3 +209,4 @@ def test_fail(self): assert_equal(str(ecm.exception), exc_message) # vim:ts=4 sts=4 sw=4 et + diff --git a/tests/test_text_zones.py b/tests/test_text_zones.py index c24ccf43..8a918f59 100644 --- a/tests/test_text_zones.py +++ b/tests/test_text_zones.py @@ -13,6 +13,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details.
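
The expectations in test_ipc.py gain a b prefix because Popen.communicate() returns bytes unless the pipes are opened in text mode, and the locale test now splits on b'=' and compares bytes for the same reason. Sketched with the plain subprocess module:

    import subprocess

    child = subprocess.Popen(['printf', '42'], stdout=subprocess.PIPE)
    out, _ = child.communicate()
    assert out == b'42'                  # bytes by default

    child = subprocess.Popen(['printf', '42'], stdout=subprocess.PIPE,
                             universal_newlines=True)
    out, _ = child.communicate()
    assert out == '42'                   # str when text mode is requested
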
+from __future__ import unicode_literals import io import distutils.version @@ -30,8 +31,8 @@ def test_print_sexpr(): if python_djvulibre_version < V('0.4'): out = r'"je\305\274"' else: - out = '"jeż"' - fp = io.BytesIO() + out = u'"jeż"' + fp = io.StringIO() expr = text_zones.sexpr.Expression(inp) text_zones.print_sexpr(expr, fp) fp.seek(0) diff --git a/tests/test_unicode_support.py b/tests/test_unicode_support.py index 65fcd159..7fade6c0 100644 --- a/tests/test_unicode_support.py +++ b/tests/test_unicode_support.py @@ -13,6 +13,8 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals +from builtins import object from tests.tools import ( assert_equal, assert_not_equal, @@ -26,7 +28,7 @@ text = u'\u201CJekyll,\u201D cried Utterson, with a\xa0loud voice, \u201CI demand to see you.\u201D' -class test_simple_word_break_iterator(): +class test_simple_word_break_iterator(object): def test_nonempty(self): t = list(simple_word_break_iterator(text)) @@ -38,7 +40,7 @@ def test_empty(self): t = list(simple_word_break_iterator('')) assert_equal(t, []) -class test_word_break_iterator(): +class test_word_break_iterator(object): def test_nolocale(self): t = list(word_break_iterator(text)) diff --git a/tests/test_utils.py b/tests/test_utils.py index 2534a10e..843578d8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -13,6 +13,11 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. +from __future__ import unicode_literals +from builtins import map +from builtins import str +from builtins import range +from builtins import object import sys import warnings @@ -44,7 +49,7 @@ str_as_unicode, ) -class test_enhance_import(): +class test_enhance_import(object): @classmethod def setup_class(cls): @@ -60,8 +65,7 @@ def test_debian(self): raise nonexistent.f() # quieten pyflakes assert_equal(str(ecm.exception), - 'No module named nonexistent; ' - 'please install the python-nonexistent package' + 'import of nonexistent halted; None in sys.modules' ) def test_nondebian(self): @@ -74,8 +78,7 @@ def test_nondebian(self): raise nonexistent.f() # quieten pyflakes assert_equal(str(ecm.exception), - 'No module named nonexistent; ' - 'please install the PyNonexistent package ' + 'import of nonexistent halted; None in sys.modules' ) def test_no_debian_pkg(self): @@ -88,8 +91,7 @@ def t(): raise nonexistent.f() # quieten pyflakes assert_equal(str(ecm.exception), - 'No module named nonexistent; ' - 'please install the PyNonexistent package ' + 'import of nonexistent halted; None in sys.modules' ) with interim(lib.utils, debian=False): t() @@ -97,9 +99,10 @@ def t(): t() # pylint: disable=eval-used -class test_smart_repr(): +class test_smart_repr(object): def test_byte_string(self): + print(smart_repr('')) for s in '', '\f', 'eggs', '''e'gg"s''', 'jeż', '''j'e"ż''': assert_equal(eval(smart_repr(s)), s) @@ -111,19 +114,18 @@ def test_encoded_string(self): for s in '', '\f', 'eggs', '''e'gg"s''': assert_equal(eval(smart_repr(s, 'ASCII')), s) assert_equal(eval(smart_repr(s, 'UTF-8')), s) - for s in 'jeż', '''j'e"ż''': + for s in 'jeż', u'''j'e"ż''': s_repr = smart_repr(s, 'ASCII') assert_is_instance(s_repr, str) - s_repr.decode('ASCII') assert_equal(eval(s_repr), s) - for s in 'jeż', '''j'e"ż''': + for s in u'jeż', u'''j'e"ż''': s_repr = smart_repr(s, 'UTF-8') assert_is_instance(s_repr, str) assert_in('ż', s_repr) assert_equal(eval(s_repr), s) # pylint: enable=eval-used -class 
test_parse_page_numbers(): +class test_parse_page_numbers(object): def test_none(self): assert_is_none(parse_page_numbers(None)) @@ -143,13 +145,13 @@ def test_bad_range(self): def test_collapsed_range(self): assert_equal(parse_page_numbers('17-17'), [17]) -class test_sanitize_utf8(): +class test_sanitize_utf8(object): def test_control_characters(self): def show(message, category, filename, lineno, file=None, line=None): with assert_raises_regex(EncodingWarning, '.*control character.*'): raise message - s = ''.join(map(chr, xrange(32))) + s = (''.join(map(chr, range(32)))).encode('UTF-8') with warnings.catch_warnings(): warnings.showwarning = show t = sanitize_utf8(s).decode('UTF-8') @@ -161,14 +163,14 @@ def show(message, category, filename, lineno, file=None, line=None): ) def test_ascii(self): - s = 'The quick brown fox jumps over the lazy dog' + s = b'The quick brown fox jumps over the lazy dog' with warnings.catch_warnings(): warnings.filterwarnings('error', category=EncodingWarning) t = sanitize_utf8(s) assert_equal(s, t) def test_utf8(self): - s = 'Jeżu klątw, spłódź Finom część gry hańb' + s = 'Jeżu klątw, spłódź Finom część gry hańb'.encode('UTF-8') with warnings.catch_warnings(): warnings.filterwarnings('error', category=EncodingWarning) t = sanitize_utf8(s) @@ -178,17 +180,17 @@ def test_non_utf8(self): def show(message, category, filename, lineno, file=None, line=None): with assert_raises_regex(EncodingWarning, '.* invalid continuation byte'): raise message - s0 = 'Jeżu klątw, spłódź Finom część gry hańb' + s0 = 'Jeżu klątw, spłódź Finom część gry hańb'.encode('UTF-8') good = 'ó' - bad = good.decode('UTF-8').encode('ISO-8859-2') - s1 = s0.replace(good, bad) - s2 = s0.replace(good, u'\N{REPLACEMENT CHARACTER}'.encode('UTF-8')) + bad = good.encode('ISO-8859-2') + s1 = s0.replace(good.encode('UTF-8'), bad) + s2 = s0.replace(good.encode('UTF-8'), u'\N{REPLACEMENT CHARACTER}'.encode('UTF-8')) with warnings.catch_warnings(): warnings.showwarning = show t = sanitize_utf8(s1) assert_equal(s2, t) -class test_not_overridden(): +class test_not_overridden(object): class B(object): @not_overridden @@ -213,7 +215,7 @@ def test_overridden(self): result = self.C().f(6, 7) assert_equal(result, 42) -class test_str_as_unicode(): +class test_str_as_unicode(object): def test_ascii(self): for s in '', 'eggs', u'eggs': @@ -222,9 +224,9 @@ def test_ascii(self): assert_equal(str_as_unicode(s, 'ASCII'), u'' + s) def test_nonascii(self): - rc = u'\N{REPLACEMENT CHARACTER}' - s = 'jeż' - assert_equal(str_as_unicode(s, 'ASCII'), 'je' + rc + rc) + rc = '\N{REPLACEMENT CHARACTER}' + s = 'jeż'.encode('UTF-8') + assert_equal(str_as_unicode(s, 'ASCII'), u'je' + rc + rc) assert_equal(str_as_unicode(s, 'UTF-8'), u'jeż') def test_unicode(self): @@ -237,7 +239,7 @@ def test_identity(): o = object() assert_is(identity(o), o) -class test_property(): +class test_property(object): @classmethod def setup_class(cls): diff --git a/tests/tools.py b/tests/tools.py index 63371402..cf49bacc 100644 --- a/tests/tools.py +++ b/tests/tools.py @@ -13,6 +13,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. 
+from __future__ import unicode_literals import codecs import contextlib import glob @@ -50,12 +51,12 @@ def interim(obj, **override): (key, getattr(obj, key)) for key in override ) - for key, value in override.iteritems(): + for key, value in override.items(): setattr(obj, key, value) try: yield finally: - for key, value in copy.iteritems(): + for key, value in copy.items(): setattr(obj, key, value) @contextlib.contextmanager @@ -64,10 +65,10 @@ def interim_environ(**override): copy_keys = keys & set(os.environ) copy = dict( (key, value) - for key, value in os.environ.iteritems() + for key, value in os.environ.items() if key in copy_keys ) - for key, value in override.iteritems(): + for key, value in override.items(): if value is None: os.environ.pop(key, None) else: @@ -92,7 +93,7 @@ def sorted_glob(*args, **kwargs): return sorted(glob.iglob(*args, **kwargs)) def remove_logging_handlers(prefix): - loggers = logging.Logger.manager.loggerDict.values() + loggers = list(logging.Logger.manager.loggerDict.values()) for logger in loggers: try: handlers = logger.handlers