diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp index 128114d3f2..741d46c643 100644 --- a/src/ccmain/tesseractclass.cpp +++ b/src/ccmain/tesseractclass.cpp @@ -619,6 +619,12 @@ void Tesseract::SetBlackAndWhitelist() { unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(), tessedit_char_whitelist.string(), tessedit_char_unblacklist.string()); + if (lstm_recognizer_) { + UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (lstm_recognizer_->GetUnicharset()); + lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(), + tessedit_char_whitelist.string(), + tessedit_char_unblacklist.string()); + } // Black and white lists should apply to all loaded classifiers. for (int i = 0; i < sub_langs_.size(); ++i) { sub_langs_[i]->unicharset.set_black_and_whitelist( diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index 7f5dbfda96..2635481de5 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -87,7 +87,7 @@ void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio, if (lstm_choice_mode) timesteps.clear(); for (int t = 0; t < width; ++t) { - ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]); + ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0], charset); DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert, charset); if (lstm_choice_mode) { @@ -102,7 +102,7 @@ void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output, beam_size_ = 0; int width = output.dim1(); for (int t = 0; t < width; ++t) { - ComputeTopN(output[t], output.dim2(), kBeamWidths[0]); + ComputeTopN(output[t], output.dim2(), kBeamWidths[0], charset); DecodeStep(output[t], t, dict_ratio, cert_offset, worst_dict_cert, charset); } } @@ -456,12 +456,19 @@ WERD_RES* RecodeBeamSearch::InitializeWord(bool leading_space, // Fills top_n_flags_ with bools that are true iff the corresponding output // is one of the top_n. 
void RecodeBeamSearch::ComputeTopN(const float* outputs, int num_outputs, - int top_n) { + int top_n, const UNICHARSET* charset) { top_n_flags_.init_to_size(num_outputs, TN_ALSO_RAN); top_code_ = -1; second_code_ = -1; top_heap_.clear(); for (int i = 0; i < num_outputs; ++i) { + // Decode label via recoder_. + RecodedCharID code; + code.Set(0, i); + int label = recoder_.DecodeUnichar(code); + if (label != INVALID_UNICHAR_ID && // not part of a bigger code. + !charset->get_enabled(label)) // disabled + continue; if (top_heap_.size() < top_n || outputs[i] > top_heap_.PeekTop().key) { TopPair entry(outputs[i], i); top_heap_.Push(&entry); diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h index 6bd44653b4..5db77b4b7c 100644 --- a/src/lstm/recodebeam.h +++ b/src/lstm/recodebeam.h @@ -293,7 +293,7 @@ class RecodeBeamSearch { // Fills top_n_flags_ with bools that are true iff the corresponding output // is one of the top_n. - void ComputeTopN(const float* outputs, int num_outputs, int top_n); + void ComputeTopN(const float* outputs, int num_outputs, int top_n, const UNICHARSET* unicharset); // Adds the computation for the current time-step to the beam. Call at each // time-step in sequence from left to right. outputs is the activation vector