From 297d7d86ceb3b3365876b56f08a18cba066bf23f Mon Sep 17 00:00:00 2001 From: Robert Schubert Date: Fri, 15 Mar 2019 16:06:19 +0100 Subject: [PATCH] trying to add user words/patterns again: - pass in ParamsVectors from Tesseract (carrying values from langdata/config/api) into LSTMRecognizer::Load and LoadDictionary - after LSTMRecognizer's Dict is initialised (with default values), reset the variables user_{words,patterns}_{suffix,file} from the corresponding entries in the passed vector --- src/ccmain/tessedit.cpp | 2 +- src/ccutil/params.h | 34 +++++++++++++++++++++++++++++- src/dict/dict.cpp | 41 +++++++++++++++++++++++++++++++++++++ src/lstm/lstmrecognizer.cpp | 11 +++++++--- src/lstm/lstmrecognizer.h | 5 +++-- 5 files changed, 86 insertions(+), 7 deletions(-) diff --git a/src/ccmain/tessedit.cpp b/src/ccmain/tessedit.cpp index 1822d44905..30ee96e785 100644 --- a/src/ccmain/tessedit.cpp +++ b/src/ccmain/tessedit.cpp @@ -186,7 +186,7 @@ bool Tesseract::init_tesseract_lang_data( if (mgr->IsComponentAvailable(TESSDATA_LSTM)) { lstm_recognizer_ = new LSTMRecognizer; ASSERT_HOST( - lstm_recognizer_->Load(lstm_use_matrix ? language : nullptr, mgr)); + lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : nullptr, mgr)); } else { tprintf("Error: LSTM requested, but not present!! 
Loading tesseract.\n"); tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY); diff --git a/src/ccutil/params.h b/src/ccutil/params.h index 742c89848f..7e22785773 100644 --- a/src/ccutil/params.h +++ b/src/ccutil/params.h @@ -155,7 +155,15 @@ class IntParam : public Param { void ResetToDefault() { value_ = default_; } - + void ResetFrom(const ParamsVectors* vec) { + for (int i = 0; i < vec->int_params.size(); ++i) { + if (strcmp(vec->int_params[i]->name_str(), name_) == 0) { + // Same name: adopt the value from vec. Debug aid: printf("overriding param %s=%d by =%d\n", name_, value_, *vec->int_params[i]); + value_ = *vec->int_params[i]; + } + } + } + private: int32_t value_; int32_t default_; @@ -179,6 +187,14 @@ class BoolParam : public Param { void ResetToDefault() { value_ = default_; } + void ResetFrom(const ParamsVectors* vec) { + for (int i = 0; i < vec->bool_params.size(); ++i) { + if (strcmp(vec->bool_params[i]->name_str(), name_) == 0) { + // Same name: adopt the value from vec. Debug aid: printf("overriding param %s=%s by =%s\n", name_, value_ ? "true" : "false", *vec->bool_params[i] ? 
"true" : "false"); + value_ = *vec->bool_params[i]; + } + } + } private: BOOL8 value_; @@ -208,6 +224,14 @@ class StringParam : public Param { void ResetToDefault() { value_ = default_; } + void ResetFrom(const ParamsVectors* vec) { + for (int i = 0; i < vec->string_params.size(); ++i) { + if (strcmp(vec->string_params[i]->name_str(), name_) == 0) { + // Same name: adopt the value from vec. Debug aid: printf("overriding param %s=%s by =%s\n", name_, value_, vec->string_params[i]->c_str()); + value_ = *vec->string_params[i]; + } + } + } private: STRING value_; @@ -232,6 +256,14 @@ class DoubleParam : public Param { void ResetToDefault() { value_ = default_; } + void ResetFrom(const ParamsVectors* vec) { + for (int i = 0; i < vec->double_params.size(); ++i) { + if (strcmp(vec->double_params[i]->name_str(), name_) == 0) { + // Same name: adopt the value from vec. Debug aid: printf("overriding param %s=%f by =%f\n", name_, value_, *vec->double_params[i]); + value_ = *vec->double_params[i]; + } + } + } private: double value_; diff --git a/src/dict/dict.cpp b/src/dict/dict.cpp index 81eb3c7b5b..f2f1903a3c 100644 --- a/src/dict/dict.cpp +++ b/src/dict/dict.cpp @@ -316,6 +316,47 @@ void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) { lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file); if (number_dawg) dawgs_ += number_dawg; } + + // Load the user word and pattern lists as Dict::Load does; user_{words,patterns}_{file,suffix} must already hold the values from Tesseract's langdata/config/api: + STRING name; + if (((STRING &)user_words_suffix).length() > 0 || + ((STRING &)user_words_file).length() > 0) { + Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, + getUnicharset().size(), dawg_debug_level); + if (((STRING &)user_words_file).length() > 0) { + name = user_words_file; + } else { + name = getCCUtil()->language_data_path_prefix; + name += user_words_suffix; + } + if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(), + Trie::RRP_REVERSE_IF_HAS_RTL)) { + tprintf("Error: failed to load %s\n", name.string()); + delete trie_ptr; + } else { + dawgs_ += trie_ptr; + } + } + + if (((STRING 
&)user_patterns_suffix).length() > 0 || + ((STRING &)user_patterns_file).length() > 0) { + Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, + getUnicharset().size(), dawg_debug_level); + trie_ptr->initialize_patterns(&(getUnicharset())); + if (((STRING &)user_patterns_file).length() > 0) { + name = user_patterns_file; + } else { + name = getCCUtil()->language_data_path_prefix; + name += user_patterns_suffix; + } + if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) { + tprintf("Error: failed to load %s\n", name.string()); + delete trie_ptr; + } else { + dawgs_ += trie_ptr; + } + } + } // Completes the loading process after Load() and/or LoadLSTM(). diff --git a/src/lstm/lstmrecognizer.cpp b/src/lstm/lstmrecognizer.cpp index acbc36f12b..bf6ee98f40 100644 --- a/src/lstm/lstmrecognizer.cpp +++ b/src/lstm/lstmrecognizer.cpp @@ -66,13 +66,13 @@ LSTMRecognizer::~LSTMRecognizer() { } // Loads a model from mgr, including the dictionary only if lang is not null. -bool LSTMRecognizer::Load(const char* lang, TessdataManager* mgr) { +bool LSTMRecognizer::Load(const ParamsVectors* params, const char* lang, TessdataManager* mgr) { TFile fp; if (!mgr->GetComponent(TESSDATA_LSTM, &fp)) return false; if (!DeSerialize(mgr, &fp)) return false; if (lang == nullptr) return true; // Allow it to run without a dictionary. - LoadDictionary(lang, mgr); + LoadDictionary(params, lang, mgr); return true; } @@ -154,9 +154,14 @@ bool LSTMRecognizer::LoadRecoder(TFile* fp) { // on the unicharset matching. This enables training to deserialize a model // from checkpoint or restore without having to go back and reload the // dictionary. 
-bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) { +// params holds the variables set via langdata/config/api in Tesseract; the new Dict's user_{words,patterns}_{file,suffix} are reset from it before the dawgs are loaded. +bool LSTMRecognizer::LoadDictionary(const ParamsVectors* params, const char* lang, TessdataManager* mgr) { delete dict_; dict_ = new Dict(&ccutil_); + dict_->user_words_file.ResetFrom(params); + dict_->user_words_suffix.ResetFrom(params); + dict_->user_patterns_file.ResetFrom(params); + dict_->user_patterns_suffix.ResetFrom(params); dict_->SetupForLoad(Dict::GlobalDawgCache()); dict_->LoadLSTM(lang, mgr); if (dict_->FinishLoad()) return true; // Success. diff --git a/src/lstm/lstmrecognizer.h b/src/lstm/lstmrecognizer.h index 75054b8e8c..5c0ec53a14 100644 --- a/src/lstm/lstmrecognizer.h +++ b/src/lstm/lstmrecognizer.h @@ -25,6 +25,7 @@ #include "matrix.h" #include "network.h" #include "networkscratch.h" +#include "params.h" #include "recodebeam.h" #include "series.h" #include "strngs.h" @@ -154,7 +155,7 @@ class LSTMRecognizer { int null_char() const { return null_char_; } // Loads a model from mgr, including the dictionary only if lang is not null. - bool Load(const char* lang, TessdataManager* mgr); + bool Load(const ParamsVectors* params, const char* lang, TessdataManager* mgr); // Writes to the given file. Returns false in case of error. // If mgr contains a unicharset and recoder, then they are not encoded to fp. @@ -174,7 +175,7 @@ class LSTMRecognizer { // on the unicharset matching. This enables training to deserialize a model // from checkpoint or restore without having to go back and reload the // dictionary. - bool LoadDictionary(const char* lang, TessdataManager* mgr); + bool LoadDictionary(const ParamsVectors* params, const char* lang, TessdataManager* mgr); // Recognizes the line image, contained within image_data, returning the // recognized tesseract WERD_RES for the words.