Added the option to get the timesteps separated by the suggested segmentation

Signed-off-by: Noah Metzger <[email protected]>
tesseract-ocr · Mar 11, 2019 · 754e38d · 754e38d
1 parent d2c3309
commit 754e38d
Show file tree

Hide file tree

Showing 8 changed files with 98 additions and 15 deletions.
diff --git a/src/api/hocrrenderer.cpp b/src/api/hocrrenderer.cpp
@@ -130,7 +130,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
   if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
     return nullptr;
 
-  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, tcnt = 1, gcnt = 1;
+  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, gcnt = 1;
   int page_id = page_number + 1;  // hOCR uses 1-based page numbers.
   bool para_is_ltr = true;        // Default direction is LTR
   const char* paragraph_lang = nullptr;
@@ -215,8 +215,11 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
     // Now, process the word...
     std::vector<std::vector<std::pair<const char*, float>>>* confidencemap =
         nullptr;
+    std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
+        symbolMap = nullptr;
     if (tesseract_->lstm_choice_mode) {
       confidencemap = res_it->GetBestLSTMSymbolChoices();
+      symbolMap = res_it->GetBestSegmentedLSTMSymbolChoices();
     }
     hocr_str << "\n      <span class='ocrx_word'"
              << " id='"
@@ -324,6 +327,38 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
           tcnt++;
         }
       }
+    } else if (tesseract_->lstm_choice_mode == 3 && symbolMap != nullptr) {
+      for (size_t j = 0; j < symbolMap->size(); j++) {
+        std::vector<std::vector<std::pair<const char*, float>>> timesteps =
+            (*symbolMap)[j];
+        hocr_str << "\n       <span class='ocr_symbol'"
+                 << " id='"
+                 << "symbolstep_" << page_id << "_" << wcnt << "_" << scnt
+                 << "'>"
+                 << timesteps[0][0].first;
+        for (size_t i = 1; i < timesteps.size(); i++) {
+          hocr_str << "\n        <span class='ocrx_cinfo'"
+                   << " id='"
+                   << "timestep_" << page_id << "_" << wcnt << "_" << tcnt
+                   << "'"
+                   << ">";
+          std::vector<std::pair<const char*, float>> timestep =
+              timesteps[i];
+          for (std::pair<const char*, float> conf : timestep) {
+            hocr_str << "<span class='ocr_glyph'"
+                     << " id='"
+                     << "choice_" << page_id << "_" << wcnt << "_" << gcnt
+                     << "'"
+                     << " title='x_confs " << int(conf.second * 100) << "'>"
+                     << conf.first << "</span>";
+            gcnt++;
+          }
+          hocr_str << "</span>";
+          tcnt++;
+        }
+        hocr_str << "</span>";
+        scnt++;
+      }
     }
     hocr_str << "</span>";
     tcnt = 1;

diff --git a/src/ccmain/resultiterator.cpp b/src/ccmain/resultiterator.cpp
@@ -605,14 +605,24 @@ char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
   return result;
 }
 
-std::vector<std::vector<std::pair<const char*, float>>>* ResultIterator::GetBestLSTMSymbolChoices() const {
+std::vector<std::vector<std::pair<const char*, float>>>*
+  ResultIterator::GetBestLSTMSymbolChoices() const {
   if (it_->word() != nullptr) {
     return &it_->word()->timesteps;  // address of the word's own member, not a copy
   } else {
     return nullptr;  // the iterator has no current word
   }
 }
 
+std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
+  ResultIterator::GetBestSegmentedLSTMSymbolChoices() const {
+  if (it_->word() != nullptr) {
+    return &it_->word()->symbol_steps;  // address of the word's own member, not a copy
+  } else {
+    return nullptr;  // the iterator has no current word
+  }
+}
+
 void ResultIterator::AppendUTF8WordText(STRING *text) const {
   if (!it_->word()) return;
   ASSERT_HOST(it_->word()->best_choice != nullptr);

diff --git a/src/ccmain/resultiterator.h b/src/ccmain/resultiterator.h
@@ -100,7 +100,10 @@ class TESS_API ResultIterator : public LTRResultIterator {
   /**
    * Returns the LSTM choices for every LSTM timestep for the current word.
   */
-  virtual std::vector<std::vector<std::pair<const char*, float>>>* GetBestLSTMSymbolChoices() const;
+  virtual std::vector<std::vector<std::pair<const char*, float>>>*
+    GetBestLSTMSymbolChoices() const;  // nullptr when there is no current word
+  virtual std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
+    GetBestSegmentedLSTMSymbolChoices() const;  // current word's symbol_steps (timestep choices grouped per symbol); nullptr when there is no current word
 
   /**
    * Return whether the current paragraph's dominant reading direction

diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
@@ -526,7 +526,9 @@ Tesseract::Tesseract()
           "Allows to include alternative symbols choices in the hOCR output. "
           "Valid input values are 0, 1 and 2. 0 is the default value. "
           "With 1 the alternative symbol choices per timestep are included. "
-          "With 2 the alternative symbol choices are accumulated per character.",
+          "With 2 the alternative symbol choices are accumulated per character. "
+          "With 3 the alternative symbol choices per timestep are included and "
+          "separated by the suggested segmentation of Tesseract",
           this->params()),
 
       backup_config_file_(nullptr),

diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
@@ -1127,7 +1127,9 @@ class Tesseract : public Wordrec {
             "Allows to include alternative symbols choices in the hOCR output. "
             "Valid input values are 0, 1 and 2. 0 is the default value. "
             "With 1 the alternative symbol choices per timestep are included. "
-            "With 2 the alternative symbol choices are accumulated per character.");
+            "With 2 the alternative symbol choices are accumulated per character. "
+            "With 3 the alternative symbol choices per timestep are included and "
+            "separated by the suggested segmentation of Tesseract");
 
   //// ambigsrecog.cpp /////////////////////////////////////////////////////////
   FILE *init_recog_training(const STRING &fname);

diff --git a/src/ccstruct/pageres.h b/src/ccstruct/pageres.h
@@ -222,6 +222,8 @@ class WERD_RES : public ELIST_LINK {
   GenericVector<int> blob_gaps;
   // Stores the lstm choices of every timestep
   std::vector<std::vector<std::pair<const char*, float>>> timesteps;
+  std::vector<std::vector<std::vector<std::pair<const char*, float>>>>
+      symbol_steps;  // lstm timestep choices grouped per symbol; one outer entry per symbol (appended by the lstm_choice_mode == 3 path in recodebeam.cpp)
   // Ratings matrix contains classifier choices for each classified combination
   // of blobs. The dimension is the same as the number of blobs in chopped_word
   // and the leading diagonal corresponds to classifier results of the blobs

diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp
@@ -25,6 +25,7 @@
 #include <deque>
 #include <map>
 #include <set>
+#include <tuple>
 #include <vector>
 
 #include <algorithm>
@@ -187,7 +188,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
   GenericVector<int> xcoords;
   GenericVector<const RecodeNode*> best_nodes;
   GenericVector<const RecodeNode*> second_nodes;
-  std::deque<std::pair<int,int>> best_choices;
+  std::deque<std::tuple<int, int, double>> best_choices;
   ExtractBestPaths(&best_nodes, &second_nodes);
   if (debug) {
     DebugPath(unicharset, best_nodes);
@@ -201,13 +202,14 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
   int timestepEnd = 0;
   //if lstm choice mode is required in granularity level 2 it stores the x
   //Coordinates of every chosen character to match the alternative choices to it
-  if (lstm_choice_mode == 2) {
+  if (lstm_choice_mode == 2 || lstm_choice_mode == 3) {
     ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings,
                             &xcoords, &best_choices);
     if (best_choices.size() > 0) {
-      current_char = best_choices.front().first;
-      timestepEnd = best_choices.front().second;
-      best_choices.pop_front();
+      current_char = std::get<0>(best_choices.front());
+      timestepEnd = std::get<1>(best_choices.front());
+      if(lstm_choice_mode == 2)
+        best_choices.pop_front();
     }
   } else {
     ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings,
@@ -258,7 +260,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
             choice_pairs.push_back(choice);
           }
         }
-        if ((best_choices.size() > 0 && i == best_choices.front().second - 1)
+        if ((best_choices.size() > 0 && i == std::get<1>(best_choices.front()) - 1)
             || i == xcoords[word_end]-1) {
           std::map<const char*, float> summed_propabilities;
           for (auto it = choice_pairs.begin(); it != choice_pairs.end(); ++it) {
@@ -283,7 +285,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
                                         it->second));
           }
           if (best_choices.size() > 0) {
-            current_char = best_choices.front().first;
+            current_char = std::get<0>(best_choices.front());
             best_choices.pop_front();
           }
           choice_pairs.clear();
@@ -292,6 +294,25 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
         }
       }
       timestepEnd = xcoords[word_end];
+    } else if (lstm_choice_mode == 3) {
+      std::vector<std::vector<std::pair<const char*, float>>> currentSymbol;
+      for (size_t i = timestepEnd; i < xcoords[word_end]; i++) {
+        if (i == std::get<1>(best_choices.front())) {
+          if (currentSymbol.size() > 0) {
+            word_res->symbol_steps.push_back(currentSymbol);
+            currentSymbol.clear();
+          }
+          std::vector<std::pair<const char*, float>> choice_Header;
+          choice_Header.push_back(std::pair<const char*, float>(
+              unicharset->id_to_unichar_ext(std::get<0>(best_choices.front())),
+                                            2.0));
+          currentSymbol.push_back(choice_Header);
+          if(best_choices.size()>1) best_choices.pop_front();
+        }
+        currentSymbol.push_back(timesteps[i]);
+      }
+      word_res->symbol_steps.push_back(currentSymbol);
+      timestepEnd = xcoords[word_end];
     }
     for (int i = word_start; i < word_end; ++i) {
       BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
@@ -366,7 +387,7 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
     const GenericVector<const RecodeNode*>& best_nodes,
     GenericVector<int>* unichar_ids, GenericVector<float>* certs,
     GenericVector<float>* ratings, GenericVector<int>* xcoords,
-    std::deque<std::pair<int, int>>* best_choices) {
+    std::deque<std::tuple<int, int, double>>* best_choices) {
   unichar_ids->truncate(0);
   certs->truncate(0);
   ratings->truncate(0);
@@ -375,6 +396,8 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
   int t = 0;
   int width = best_nodes.size();
   while (t < width) {
+    int id;
+    int tposition;
     double certainty = 0.0;
     double rating = 0.0;
     while (t < width && best_nodes[t]->unichar_id == INVALID_UNICHAR_ID) {
@@ -396,7 +419,8 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
       unichar_ids->push_back(unichar_id);
       xcoords->push_back(t);
       if (best_choices != nullptr) {
-        best_choices->push_back(std::pair<int, int>(unichar_id, t));
+        tposition = t;
+        id = unichar_id;
       }
       do {
         double cert = best_nodes[t++]->certainty;
@@ -414,6 +438,10 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
       if (certainty < certs->back()) certs->back() = certainty;
       ratings->back() += rating;
     }
+    if (best_choices != nullptr) {
+      best_choices->push_back(
+          std::tuple<int, int, double>(id, tposition, rating));
+    }
   }
   xcoords->push_back(width);
 }

diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h
@@ -29,6 +29,7 @@
 #include "unicharcompress.h"
 #include <deque>
 #include <set>
+#include <tuple>
 #include <vector>
 
 namespace tesseract {
@@ -281,7 +282,7 @@ class RecodeBeamSearch {
       const GenericVector<const RecodeNode*>& best_nodes,
       GenericVector<int>* unichar_ids, GenericVector<float>* certs,
       GenericVector<float>* ratings, GenericVector<int>* xcoords,
-      std::deque<std::pair<int,int>>* best_choices = nullptr);
+      std::deque<std::tuple<int,int,double>>* best_choices = nullptr);
 
   // Sets up a word with the ratings matrix and fake blobs with boxes in the
   // right places.