Skip to content

Commit

Permalink
python: optimize imports, reformat code
Browse files Browse the repository at this point in the history
  • Loading branch information
zdenop committed Mar 31, 2019
1 parent 2e9fd69 commit 5f06402
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 67 deletions.
90 changes: 45 additions & 45 deletions src/training/language_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
# Language specific info
# =============================================================================

import os
import logging
import os

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -875,6 +875,7 @@

FLAGS_webtext_prefix = os.environ.get("FLAGS_webtext_prefix", "")


# Set language-specific values for several global variables, including
# ${TEXT_CORPUS}
# holds the text corpus file for the language, used in phase F
Expand Down Expand Up @@ -1079,15 +1080,15 @@ def set_lang_specific_parameters(ctx, lang):
NUMBER_DAWG_FACTOR = 0.05
WORD_DAWG_SIZE = 1_000_000
elif lang in (
"aze_cyrl",
"bel",
"bul",
"kaz",
"mkd",
"srp",
"tgk",
"ukr",
"uzb_cyrl",
"aze_cyrl",
"bel",
"bul",
"kaz",
"mkd",
"srp",
"tgk",
"ukr",
"uzb_cyrl",
):
MIX_LANG = f"{lang}"
if not FONTS:
Expand Down Expand Up @@ -1326,44 +1327,44 @@ def set_lang_specific_parameters(ctx, lang):
EXPOSURES = [0]
# Set right-to-left and normalization mode.
if lang in (
"ara",
"div",
"fas",
"pus",
"snd",
"syr",
"uig",
"urd",
"kur_ara",
"heb",
"yid",
"ara",
"div",
"fas",
"pus",
"snd",
"syr",
"uig",
"urd",
"kur_ara",
"heb",
"yid",
):
LANG_IS_RTL = True
NORM_MODE = 2
elif lang in (
"asm",
"ben",
"bih",
"hin",
"mar",
"nep",
"guj",
"kan",
"mal",
"tam",
"tel",
"pan",
"dzo",
"sin",
"san",
"bod",
"ori",
"khm",
"mya",
"tha",
"lao",
"jav ",
"jav_java",
"asm",
"ben",
"bih",
"hin",
"mar",
"nep",
"guj",
"kan",
"mal",
"tam",
"tel",
"pan",
"dzo",
"sin",
"san",
"bod",
"ori",
"khm",
"mya",
"tha",
"lao",
"jav ",
"jav_java",
):
LANG_IS_RTL = False
NORM_MODE = 2
Expand Down Expand Up @@ -1408,7 +1409,6 @@ def set_lang_specific_parameters(ctx, lang):

return ctx


# =============================================================================
# END of Language specific info
# =============================================================================
7 changes: 3 additions & 4 deletions src/training/tesstrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
# This script provides an easy way to execute various phases of training
# Tesseract. For a detailed description of the phases, see
# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
#
import sys
import os

import logging
import os
import sys

if (sys.version_info.major < 3) or (sys.version_info.major == 3 and sys.version_info.minor < 6):
raise Exception("Must be using Python minimum version 3.6!")
Expand Down Expand Up @@ -86,7 +86,6 @@ def main():
if __name__ == "__main__":
main()


# _rc0 = subprocess.call(["tlog","\n=== Starting training for language '"+str(LANG_CODE.val)+"'"],shell=True)
# _rc0 = subprocess.call(["source",os.popen("dirname "+__file__).read().rstrip("\n")+"/language-specific.sh"],shell=True)
# _rc0 = subprocess.call(["set_lang_specific_parameters",str(LANG_CODE.val)],shell=True)
Expand Down
33 changes: 15 additions & 18 deletions src/training/tesstrain_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,19 @@
# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
#

import argparse
import atexit
import concurrent.futures
import logging
import os
import pathlib
import platform
import shutil
import subprocess
import sys
from datetime import date
from tempfile import TemporaryDirectory, mkdtemp
import pathlib
import logging
import subprocess
import argparse
from operator import itemgetter
import concurrent.futures
import shutil
import atexit
from tempfile import TemporaryDirectory, mkdtemp

from tqdm import tqdm

Expand Down Expand Up @@ -247,18 +247,18 @@ def show_tmpdir_location(training_dir):
# specified in the command-line.
if not ctx.training_text:
ctx.training_text = (
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
)
if not ctx.wordlist_file:
ctx.wordlist_file = (
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
)

ctx.word_bigrams_file = (
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
)
ctx.numbers_file = (
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
)
ctx.punc_file = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc"
ctx.bigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
Expand Down Expand Up @@ -307,7 +307,6 @@ def make_outbase(ctx, fontname, exposure):
# Helper function for phase_I_generate_image. Generates the image for a single
# language/font combination in a way that can be run in parallel.
def generate_font_image(ctx, font, exposure, char_spacing):

log.info(f"Rendering using {font}")
fontname = make_fontname(font)
outbase = make_outbase(ctx, fontname, exposure)
Expand Down Expand Up @@ -358,7 +357,6 @@ def generate_font_image(ctx, font, exposure, char_spacing):

# Phase I : Generate (I)mages from training text for each font.
def phase_I_generate_image(ctx, par_factor):

if not par_factor or par_factor <= 0:
par_factor = 1

Expand Down Expand Up @@ -387,8 +385,8 @@ def phase_I_generate_image(ctx, par_factor):
check_file_readable(ctx.train_ngrams_file)

with tqdm(
total=len(ctx.fonts)
) as pbar, concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
total=len(ctx.fonts)
) as pbar, concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
futures = [
executor.submit(generate_font_image, ctx, font, exposure, char_spacing)
for font in ctx.fonts
Expand Down Expand Up @@ -533,7 +531,7 @@ def phase_E_extract_features(ctx, box_config, ext):
log.info(f"Using TESSDATA_PREFIX={tessdata_environ['TESSDATA_PREFIX']}")

with tqdm(total=len(img_files)) as pbar, concurrent.futures.ThreadPoolExecutor(
max_workers=2
max_workers=2
) as executor:
futures = []
for img_file in img_files:
Expand Down Expand Up @@ -693,7 +691,6 @@ def get_file_list():
dir_listing = (str(p) for p in path_output.glob(f"{ctx.lang_code}.*.lstmf"))
pathlib.Path(lstm_list).write_text("\n".join(dir_listing))


# make__traineddata() {
# tlog "\n=== Making final traineddata file ==="
# local lang_prefix={ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}
Expand Down

0 comments on commit 5f06402

Please sign in to comment.