diff --git a/.gitignore b/.gitignore index 86a2bd4..fd6ab28 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ build/ dist/ pyocr.egg-info/ +venv +.idea diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py index e587ec8..efe3bdb 100755 --- a/src/pyocr/tesseract.py +++ b/src/pyocr/tesseract.py @@ -33,6 +33,9 @@ # CHANGE THIS IF TESSERACT IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY TESSERACT_CMD = 'tesseract.exe' if os.name == 'nt' else 'tesseract' +# Setting the OMP_THREAD_LIMIT=1 environment variable is the only way to make the calls to tesseract work correctly with multiprocessing +# For more info see: https://github.com/tesseract-ocr/tesseract/issues/898 +THREAD_LIMIT_CMD = 'OMP_THREAD_LIMIT=1' TESSDATA_EXTENSION = ".traineddata" @@ -230,7 +233,7 @@ def get_available_builders(): def run_tesseract(input_filename, output_filename_base, cwd=None, lang=None, - flags=None, configs=None): + flags=None, configs=None, parallel=True): ''' Runs Tesseract: `TESSERACT_CMD` \ @@ -254,7 +257,12 @@ def run_tesseract(input_filename, output_filename_base, cwd=None, lang=None, ''' _set_environment() - command = [TESSERACT_CMD, input_filename, output_filename_base] + tesseract_cmd = TESSERACT_CMD + + if parallel: + tesseract_cmd = " ".join([THREAD_LIMIT_CMD, TESSERACT_CMD]) + + command = [tesseract_cmd, input_filename, output_filename_base] if lang is not None: command += ['-l', lang] @@ -325,7 +333,7 @@ def temp_dir(): shutil.rmtree(path) -def image_to_string(image, lang=None, builder=None): +def image_to_string(image, lang=None, builder=None, parallel=True): ''' Runs tesseract on the specified image. First, the image is written to disk, and then the tesseract command is run on the image. 
Tesseract's result is @@ -353,7 +361,8 @@ def image_to_string(image, lang=None, builder=None): (status, errors) = run_tesseract("input.bmp", "output", cwd=tmpdir, lang=lang, flags=builder.tesseract_flags, - configs=builder.tesseract_configs) + configs=builder.tesseract_configs, + parallel=parallel) if status: raise TesseractError(status, errors)