Fixed multilang for LSTM, pushed cube to one side without actually deleting it
tesseract-ocr · Dec 5, 2016 · 5deebe6 · tfmorris · Mar 24, 2017 · amitdo
1 parent 798d79a
commit 5deebe6
Show file tree

Hide file tree

Showing 14 changed files with 139 additions and 124 deletions.
diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp
@@ -123,10 +123,9 @@ void PrintHelpForOEM() {
   const char* msg =
       "OCR Engine modes:\n"
       "  0    Original Tesseract only.\n"
-      "  1    Cube only.\n"
-      "  2    Tesseract + cube.\n"
-      "  3    Default, based on what is available.\n"
-      "  4    Neural nets (LSTM) only.\n";
+      "  1    Neural nets LSTM only.\n"
+      "  2    Tesseract + LSTM.\n"
+      "  3    Default, based on what is available.\n";
 
   printf("%s", msg);
 }

diff --git a/ccmain/control.cpp b/ccmain/control.cpp
@@ -31,21 +31,22 @@
 #include <errno.h>
 #endif
 #include <ctype.h>
-#include "ocrclass.h"
-#include "werdit.h"
+#include "callcpp.h"
+#include "control.h"
+#include "docqual.h"
 #include "drawfx.h"
-#include "tessbox.h"
-#include "tessvars.h"
-#include "pgedit.h"
-#include "reject.h"
 #include "fixspace.h"
-#include "docqual.h"
-#include "control.h"
-#include "output.h"
-#include "callcpp.h"
 #include "globals.h"
+#include "lstmrecognizer.h"
+#include "ocrclass.h"
+#include "output.h"
+#include "pgedit.h"
+#include "reject.h"
 #include "sorthelper.h"
+#include "tessbox.h"
 #include "tesseractclass.h"
+#include "tessvars.h"
+#include "werdit.h"
 
 #define MIN_FONT_ROW_COUNT  8
 #define MAX_XHEIGHT_DIFF  3
@@ -192,8 +193,8 @@ void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
       WERD_RES* word_res = new WERD_RES;
       word_res->InitForRetryRecognition(*word->word);
       word->lang_words.push_back(word_res);
-      // Cube doesn't get setup for pass2.
-      if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {
+      // LSTM doesn't get setup for pass2.
+      if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
         word_res->SetupForRecognition(
               lang_t->unicharset, lang_t, BestPix(),
               lang_t->tessedit_ocr_engine_mode, NULL,
@@ -301,16 +302,6 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
                                 const TBOX* target_word_box,
                                 const char* word_config,
                                 int dopasses) {
-  // PSM_RAW_LINE is a special-case mode in which the layout analysis is
-  // completely ignored and LSTM is run on the raw image. There is no hope
-  // of running normal tesseract in this situation or of integrating output.
-#ifndef ANDROID_BUILD
-  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY &&
-      tessedit_pageseg_mode == PSM_RAW_LINE) {
-    RecogRawLine(page_res);
-    return true;
-  }
-#endif
   PAGE_RES_IT page_res_it(page_res);
 
   if (tessedit_minimal_rej_pass1) {
@@ -397,8 +388,7 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
     if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
   }
 
-  // The next passes can only be run if tesseract has been used, as cube
-  // doesn't set all the necessary outputs in WERD_RES.
+  // The next passes are only required for Tess-only.
   if (AnyTessLang() && !AnyLSTMLang()) {
     // ****************** Pass 3 *******************
     // Fix fuzzy spaces.
@@ -451,8 +441,13 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
   for (page_res_it.restart_page(); page_res_it.word() != NULL;
        page_res_it.forward()) {
     WERD_RES* word = page_res_it.word();
-    if (word->best_choice == NULL || word->best_choice->length() == 0)
+    POLY_BLOCK* pb = page_res_it.block()->block != NULL
+                         ? page_res_it.block()->block->poly_block()
+                         : NULL;
+    if (word->best_choice == NULL || word->best_choice->length() == 0 ||
+        (word->best_choice->IsAllSpaces() && (pb == NULL || pb->IsText()))) {
       page_res_it.DeleteCurrentWord();
+    }
   }
 
   if (monitor != NULL) {
@@ -1376,12 +1371,20 @@ void Tesseract::classify_word_pass1(const WordData& word_data,
     cube_word_pass1(block, row, *in_word);
     return;
   }
-  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
-    if (!(*in_word)->odd_size) {
+#endif
+#ifndef ANDROID_BUILD
+  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
+      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
+    if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
       LSTMRecognizeWord(*block, row, *in_word, out_words);
       if (!out_words->empty())
         return;  // Successful lstm recognition.
     }
+    if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
+      // No fallback allowed, so use a fake.
+      (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
+      return;
+    }
     // Fall back to tesseract for failed words or odd words.
     (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
                                     OEM_TESSERACT_ONLY, NULL,
@@ -1523,7 +1526,7 @@ void Tesseract::classify_word_pass2(const WordData& word_data,
                                     WERD_RES** in_word,
                                     PointerVector<WERD_RES>* out_words) {
   // Return if we do not want to run Tesseract.
-  if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
+  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
     return;
   }
   ROW* row = word_data.row;
@@ -1908,7 +1911,7 @@ static void find_modal_font(           //good chars in word
  * Get the fonts for the word.
  */
 void Tesseract::set_word_fonts(WERD_RES *word) {
-  // Don't try to set the word fonts for a cube word, as the configs
+  // Don't try to set the word fonts for an lstm word, as the configs
   // will be meaningless.
   if (word->chopped_word == NULL) return;
   ASSERT_HOST(word->best_choice != NULL);

diff --git a/ccmain/linerec.cpp b/ccmain/linerec.cpp
@@ -219,19 +219,6 @@ ImageData* Tesseract::GetRectImage(const TBOX& box, const BLOCK& block,
 }
 
 #ifndef ANDROID_BUILD
-// Top-level function recognizes a single raw line.
-void Tesseract::RecogRawLine(PAGE_RES* page_res) {
-  PAGE_RES_IT it(page_res);
-  PointerVector<WERD_RES> words;
-  LSTMRecognizeWord(*it.block()->block, it.row()->row, it.word(), &words);
-  if (getDict().stopper_debug_level >= 1) {
-    for (int w = 0; w < words.size(); ++w) {
-      words[w]->DebugWordChoices(true, NULL);
-    }
-  }
-  it.ReplaceCurrentWord(&words);
-}
-
 // Recognizes a word or group of words, converting to WERD_RES in *words.
 // Analogous to classify_word_pass1, but can handle a group of words as well.
 void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
@@ -268,7 +255,17 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
   // for each of the output words.
   // If we drop a word as junk, then there is always a space in front of the
   // next.
-  bool deleted_prev = false;
+  const Dict* stopper_dict = lstm_recognizer_->GetDict();
+  if (stopper_dict == nullptr) stopper_dict = &getDict();
+  bool any_nonspace_delimited = false;
+  for (int w = 0; w < words->size(); ++w) {
+    WERD_RES* word = (*words)[w];
+    if (word->best_choice != nullptr &&
+        word->best_choice->ContainsAnyNonSpaceDelimited()) {
+      any_nonspace_delimited = true;
+      break;
+    }
+  }
   for (int w = 0; w < words->size(); ++w) {
     WERD_RES* word = (*words)[w];
     if (word->best_choice == NULL) {
@@ -284,9 +281,7 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
     }
     if (word->best_choice == NULL) {
       // It is a dud.
-      words->remove(w);
-      --w;
-      deleted_prev = true;
+      word->SetupFake(lstm_recognizer_->GetUnicharset());
     } else {
       // Set the best state.
       for (int i = 0; i < word->best_choice->length(); ++i) {
@@ -314,22 +309,21 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
         word->best_choice->print();
       }
       // Discard words that are impossibly bad, but allow a bit more for
-      // dictionary words.
+      // dictionary words, and keep bad words in non-space-delimited langs.
       if (word_certainty >= RecodeBeamSearch::kMinCertainty ||
+          any_nonspace_delimited ||
           (word_certainty >= kWorstDictCertainty &&
            Dict::valid_word_permuter(word->best_choice->permuter(), true))) {
-        word->best_choice->set_certainty(word_certainty);
-        if (deleted_prev) word->word->set_blanks(1);
+        word->tess_accepted = stopper_dict->AcceptableResult(word);
       } else {
         if (getDict().stopper_debug_level >= 1) {
           tprintf("Deleting word with certainty %g\n", word_certainty);
           word->best_choice->print();
         }
         // It is a dud.
-        words->remove(w);
-        --w;
-        deleted_prev = true;
+        word->SetupFake(lstm_recognizer_->GetUnicharset());
       }
+      word->best_choice->set_certainty(word_certainty);
     }
   }
 }

diff --git a/ccmain/tessedit.cpp b/ccmain/tessedit.cpp
@@ -161,7 +161,7 @@ bool Tesseract::init_tesseract_lang_data(
   // Determine which ocr engine(s) should be loaded and used for recognition.
   if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
   if (tessdata_manager_debug_level) {
-    tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
+    tprintf("Loading Tesseract/LSTM with tessedit_ocr_engine_mode %d\n",
             static_cast<int>(tessedit_ocr_engine_mode));
   }
 
@@ -174,9 +174,37 @@ bool Tesseract::init_tesseract_lang_data(
     return true;
   }
 
+// The various OcrEngineMode settings (see publictypes.h) determine which
+// engine-specific data files need to be loaded. Currently everything needs
+// the base tesseract data, which supplies other useful information, but
+// alternative engines, such as LSTM are optional.
+#ifndef ANDROID_BUILD
+  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
+      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
+    if (tessdata_manager.swap()) {
+      tprintf("Error: LSTM requested on big-endian hardware!!\n");
+      tprintf("Big-endian not yet supported! Loading tesseract.\n");
+      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
+    } else if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
+      lstm_recognizer_ = new LSTMRecognizer;
+      TFile fp;
+      fp.Open(tessdata_manager.GetDataFilePtr(), -1);
+      ASSERT_HOST(lstm_recognizer_->DeSerialize(tessdata_manager.swap(), &fp));
+      if (lstm_use_matrix)
+        lstm_recognizer_->LoadDictionary(tessdata_path.string(), language);
+    } else {
+      tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
+      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
+    }
+  }
+#endif
+
   // Load the unicharset
-  if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
-      !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
+  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
+    // Avoid requiring a unicharset when we aren't running base tesseract.
+    unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
+  } else if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
+             !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
     return false;
   }
   if (unicharset.size() > MAX_NUM_CLASSES) {
@@ -203,11 +231,6 @@ bool Tesseract::init_tesseract_lang_data(
         ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
     if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
   }
-
-  // The various OcrEngineMode settings (see publictypes.h) determine which
-  // engine-specific data files need to be loaded. Currently everything needs
-  // the base tesseract data, which supplies other useful information, but
-  // alternative engines, such as cube and LSTM are optional.
 #ifndef NO_CUBE_BUILD
   if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
     ASSERT_HOST(init_cube_objects(false, &tessdata_manager));
@@ -217,22 +240,6 @@ bool Tesseract::init_tesseract_lang_data(
     ASSERT_HOST(init_cube_objects(true, &tessdata_manager));
     if (tessdata_manager_debug_level)
       tprintf("Loaded Cube with combiner\n");
-  } else if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
-    if (tessdata_manager.swap()) {
-      tprintf("Error: LSTM requested on big-endian hardware!!\n");
-      tprintf("Big-endian not yet supported! Loading tesseract.\n");
-      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
-    } else if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
-      lstm_recognizer_ = new LSTMRecognizer;
-      TFile fp;
-      fp.Open(tessdata_manager.GetDataFilePtr(), -1);
-      ASSERT_HOST(lstm_recognizer_->DeSerialize(tessdata_manager.swap(), &fp));
-      if (lstm_use_matrix)
-        lstm_recognizer_->LoadDictionary(tessdata_path.string(), language);
-    } else {
-      tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
-      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
-    }
   }
 #endif
   // Init ParamsModel.
@@ -425,16 +432,16 @@ int Tesseract::init_tesseract_internal(
     tessdata_manager.End();
     return 0;
   }
-  // If only Cube will be used, skip loading Tesseract classifier's
-  // pre-trained templates.
-  bool init_tesseract_classifier =
-    tessedit_ocr_engine_mode != OEM_CUBE_ONLY;
-  // If only Cube will be used and if it has its own Unicharset,
-  // skip initializing permuter and loading Tesseract Dawgs.
-  bool init_dict =
-    !(tessedit_ocr_engine_mode == OEM_CUBE_ONLY &&
-      tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET));
-  program_editup(textbase, init_tesseract_classifier, init_dict);
+  // If only LSTM will be used, skip loading Tesseract classifier's
+  // pre-trained templates and dictionary.
+  bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY &&
+                        tessedit_ocr_engine_mode != OEM_CUBE_ONLY;
+  bool init_dict = init_tesseract;
+  if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY &&
+      !tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET)) {
+    init_dict = true;
+  }
+  program_editup(textbase, init_tesseract, init_dict);
   tessdata_manager.End();
   return 0;                      //Normal exit
 }

diff --git a/ccmain/tesseract_cube_combiner.cpp b/ccmain/tesseract_cube_combiner.cpp
@@ -21,6 +21,8 @@
 // the recognition results of Tesseract and Cube at the word level
 
 #include <algorithm>
+#include <string>
+#include <vector>
 #include <wctype.h>
 
 #include "tesseract_cube_combiner.h"
@@ -125,12 +127,10 @@ bool TesseractCubeCombiner::ValidWord(const string &str) {
 // Public method for computing the combiner features. The agreement
 // output parameter will be true if both answers are identical,
 // and false otherwise.
-bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str,
-                                                    int tess_confidence,
-                                                    CubeObject *cube_obj,
-                                                    WordAltList *cube_alt_list,
-                                                    vector<double> *features,
-                                                    bool *agreement) {
+bool TesseractCubeCombiner::ComputeCombinerFeatures(
+    const string &tess_str, int tess_confidence, CubeObject *cube_obj,
+    WordAltList *cube_alt_list, std::vector<double> *features,
+    bool *agreement) {
   features->clear();
   *agreement = false;
   if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0)

diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp
@@ -81,9 +81,9 @@ Tesseract::Tesseract()
           " (Values from PageSegMode enum in publictypes.h)",
           this->params()),
       INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
-                      "Which OCR engine(s) to run (Tesseract, Cube, both)."
+                      "Which OCR engine(s) to run (Tesseract, LSTM, both)."
                       " Defaults to loading and running only Tesseract"
-                      " (no Cube,no combiner)."
+                      " (no LSTM,no combiner)."
                       " Values from OcrEngineMode enum in tesseractclass.h)",
                       this->params()),
       STRING_MEMBER(tessedit_char_blacklist, "",