diff --git a/src/training/language_specific.py b/src/training/language_specific.py index 8c7d27bfd2..388ec5c178 100644 --- a/src/training/language_specific.py +++ b/src/training/language_specific.py @@ -19,8 +19,8 @@ # Language specific info # ============================================================================= -import os import logging +import os log = logging.getLogger(__name__) @@ -875,6 +875,7 @@ FLAGS_webtext_prefix = os.environ.get("FLAGS_webtext_prefix", "") + # Set language-specific values for several global variables, including # ${TEXT_CORPUS} # holds the text corpus file for the language, used in phase F @@ -1079,15 +1080,15 @@ def set_lang_specific_parameters(ctx, lang): NUMBER_DAWG_FACTOR = 0.05 WORD_DAWG_SIZE = 1_000_000 elif lang in ( - "aze_cyrl", - "bel", - "bul", - "kaz", - "mkd", - "srp", - "tgk", - "ukr", - "uzb_cyrl", + "aze_cyrl", + "bel", + "bul", + "kaz", + "mkd", + "srp", + "tgk", + "ukr", + "uzb_cyrl", ): MIX_LANG = f"{lang}" if not FONTS: @@ -1326,44 +1327,44 @@ def set_lang_specific_parameters(ctx, lang): EXPOSURES = [0] # Set right-to-left and normalization mode. if lang in ( - "ara", - "div", - "fas", - "pus", - "snd", - "syr", - "uig", - "urd", - "kur_ara", - "heb", - "yid", + "ara", + "div", + "fas", + "pus", + "snd", + "syr", + "uig", + "urd", + "kur_ara", + "heb", + "yid", ): LANG_IS_RTL = True NORM_MODE = 2 elif lang in ( - "asm", - "ben", - "bih", - "hin", - "mar", - "nep", - "guj", - "kan", - "mal", - "tam", - "tel", - "pan", - "dzo", - "sin", - "san", - "bod", - "ori", - "khm", - "mya", - "tha", - "lao", - "jav ", - "jav_java", + "asm", + "ben", + "bih", + "hin", + "mar", + "nep", + "guj", + "kan", + "mal", + "tam", + "tel", + "pan", + "dzo", + "sin", + "san", + "bod", + "ori", + "khm", + "mya", + "tha", + "lao", + "jav ", + "jav_java", ): LANG_IS_RTL = False NORM_MODE = 2 @@ -1408,7 +1409,6 @@ def set_lang_specific_parameters(ctx, lang): return ctx - # ============================================================================= # END of Language specific info # ============================================================================= diff --git a/src/training/tesstrain.py b/src/training/tesstrain.py index 47e40135a8..bd734fff47 100755 --- a/src/training/tesstrain.py +++ b/src/training/tesstrain.py @@ -15,10 +15,10 @@ # This script provides an easy way to execute various phases of training # Tesseract. For a detailed description of the phases, see # https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract -# -import sys -import os + import logging +import os +import sys if (sys.version_info.major < 3) or (sys.version_info.major == 3 and sys.version_info.minor < 6): raise Exception("Must be using Python minimum version 3.6!") @@ -86,7 +86,6 @@ def main(): if __name__ == "__main__": main() - # _rc0 = subprocess.call(["tlog","\n=== Starting training for language '"+str(LANG_CODE.val)+"'"],shell=True) # _rc0 = subprocess.call(["source",os.popen("dirname "+__file__).read().rstrip("\n")+"/language-specific.sh"],shell=True) # _rc0 = subprocess.call(["set_lang_specific_parameters",str(LANG_CODE.val)],shell=True) diff --git a/src/training/tesstrain_utils.py b/src/training/tesstrain_utils.py index 9b35466a84..600acfb10a 100644 --- a/src/training/tesstrain_utils.py +++ b/src/training/tesstrain_utils.py @@ -14,19 +14,19 @@ # https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract # +import argparse +import atexit +import concurrent.futures +import logging import os +import pathlib import platform +import shutil +import subprocess import sys from datetime import date -from tempfile import TemporaryDirectory, mkdtemp -import pathlib -import logging -import subprocess -import argparse from operator import itemgetter -import concurrent.futures -import shutil -import atexit +from tempfile import TemporaryDirectory, mkdtemp from tqdm import tqdm @@ -247,18 +247,18 @@ def show_tmpdir_location(training_dir): # specified in the command-line. if not ctx.training_text: ctx.training_text = ( - pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text" + pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text" ) if not ctx.wordlist_file: ctx.wordlist_file = ( - pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist" + pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist" ) ctx.word_bigrams_file = ( - pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams" + pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams" ) ctx.numbers_file = ( - pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers" + pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers" ) ctx.punc_file = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc" ctx.bigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix( @@ -307,7 +307,6 @@ def make_outbase(ctx, fontname, exposure): # Helper function for phaseI_generate_image. Generates the image for a single # language/font combination in a way that can be run in parallel. def generate_font_image(ctx, font, exposure, char_spacing): - log.info(f"Rendering using {font}") fontname = make_fontname(font) outbase = make_outbase(ctx, fontname, exposure) @@ -358,7 +357,6 @@ def generate_font_image(ctx, font, exposure, char_spacing): # Phase I : Generate (I)mages from training text for each font. def phase_I_generate_image(ctx, par_factor): - if not par_factor or par_factor <= 0: par_factor = 1 @@ -387,8 +385,8 @@ def phase_I_generate_image(ctx, par_factor): check_file_readable(ctx.train_ngrams_file) with tqdm( - total=len(ctx.fonts) - ) as pbar, concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: + total=len(ctx.fonts) + ) as pbar, concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: futures = [ executor.submit(generate_font_image, ctx, font, exposure, char_spacing) for font in ctx.fonts @@ -533,7 +531,7 @@ def phase_E_extract_features(ctx, box_config, ext): log.info(f"Using TESSDATA_PREFIX={tessdata_environ['TESSDATA_PREFIX']}") with tqdm(total=len(img_files)) as pbar, concurrent.futures.ThreadPoolExecutor( - max_workers=2 + max_workers=2 ) as executor: futures = [] for img_file in img_files: @@ -693,7 +691,6 @@ def get_file_list(): dir_listing = (str(p) for p in path_output.glob(f"{ctx.lang_code}.*.lstmf")) pathlib.Path(lstm_list).write_text("\n".join(dir_listing)) - # make__traineddata() { # tlog "\n=== Making final traineddata file ===" # local lang_prefix={ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}