trying to add tessedit_char_whitelist etc. again:

- ignore matrix outputs in ComputeTopN if they belong to a disabled unichar_id
- pass UNICHARSET refs to check that
- in SetBlackAndWhitelist, also update the unicharset of the lstm_recognizer_ instance, if any
tesseract-ocr · Mar 7, 2019 · 6ac2ff0 · 6ac2ff0
1 parent fe5c82f
commit 6ac2ff0
Show file tree

Hide file tree

Showing 3 changed files with 17 additions and 4 deletions.
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
@@ -619,6 +619,12 @@ void Tesseract::SetBlackAndWhitelist() {
   unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
                                      tessedit_char_whitelist.string(),
                                      tessedit_char_unblacklist.string());
+  if (lstm_recognizer_) {
+    UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (lstm_recognizer_->GetUnicharset());
+    lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
+                                            tessedit_char_whitelist.string(),
+                                            tessedit_char_unblacklist.string());
+  }
   // Black and white lists should apply to all loaded classifiers.
   for (int i = 0; i < sub_langs_.size(); ++i) {
     sub_langs_[i]->unicharset.set_black_and_whitelist(

diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp
@@ -87,7 +87,7 @@ void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio,
   if (lstm_choice_mode)
     timesteps.clear();
   for (int t = 0; t < width; ++t) {
-    ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]);
+    ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0], charset);
     DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert,
                charset);
     if (lstm_choice_mode) {
@@ -102,7 +102,7 @@ void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
   beam_size_ = 0;
   int width = output.dim1();
   for (int t = 0; t < width; ++t) {
-    ComputeTopN(output[t], output.dim2(), kBeamWidths[0]);
+    ComputeTopN(output[t], output.dim2(), kBeamWidths[0], charset);
     DecodeStep(output[t], t, dict_ratio, cert_offset, worst_dict_cert, charset);
   }
 }
@@ -456,12 +456,19 @@ WERD_RES* RecodeBeamSearch::InitializeWord(bool leading_space,
 // Fills top_n_flags_ with bools that are true iff the corresponding output
 // is one of the top_n.
 void RecodeBeamSearch::ComputeTopN(const float* outputs, int num_outputs,
-                                   int top_n) {
+                                   int top_n, const UNICHARSET* charset) {
   top_n_flags_.init_to_size(num_outputs, TN_ALSO_RAN);
   top_code_ = -1;
   second_code_ = -1;
   top_heap_.clear();
   for (int i = 0; i < num_outputs; ++i) {
+    // Decode label via recoder_.
+    RecodedCharID code;
+    code.Set(0, i);
+    int label = recoder_.DecodeUnichar(code);
+    if (label != INVALID_UNICHAR_ID && // not part of a bigger code.
+        !charset->get_enabled(label)) // disabled
+      continue;
     if (top_heap_.size() < top_n || outputs[i] > top_heap_.PeekTop().key) {
       TopPair entry(outputs[i], i);
       top_heap_.Push(&entry);

diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h
@@ -293,7 +293,7 @@ class RecodeBeamSearch {
 
   // Fills top_n_flags_ with bools that are true iff the corresponding output
   // is one of the top_n.
-  void ComputeTopN(const float* outputs, int num_outputs, int top_n);
+  void ComputeTopN(const float* outputs, int num_outputs, int top_n, const UNICHARSET* unicharset);
 
   // Adds the computation for the current time-step to the beam. Call at each
   // time-step in sequence from left to right. outputs is the activation vector