Added a feature to enrich the hOCR output with glyph confidences

By passing the parameter -c glyph_confidences=true, the user can enrich the hOCR output with additional information. For every timestep of the LSTM, Tesseract then additionally lists all glyphs that were considered, together with their confidences. The format of the hOCR output changes slightly: there is now a line break after every word to make the output easier for humans to read. Signed-off-by: Noah Metzger <[email protected]>
tesseract-ocr · Jul 25, 2018 · 91c7504 · 91c7504
1 parent 607e8fd
commit 91c7504
Show file tree

Hide file tree

Showing 11 changed files with 138 additions and 16 deletions.
diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp
@@ -49,6 +49,8 @@
 #include <fstream>             // for size_t
 #include <iostream>            // for std::cin
 #include <memory>              // for std::unique_ptr
+#include <set>                 // for std::pair
+#include <vector>              // for std::vector
 #include "allheaders.h"        // for pixDestroy, boxCreate, boxaAddBox, box...
 #include "blobclass.h"         // for ExtractFontName
 #include "boxword.h"           // for BoxWord
@@ -398,6 +400,7 @@ int TessBaseAPI::Init(const char* data, int data_size, const char* language,
       return -1;
     }
   }
+
   PERF_COUNT_SUB("update tesseract_")
   // Update datapath and language requested for the last valid initialization.
   if (datapath_ == nullptr)
@@ -1389,6 +1392,17 @@ static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
   *hocr_str += "'";
 }
 
+// Appends an id attribute of the form " id='<base>_<num1>_<num2>_<num3>'"
+// to *hocr_str. The formatted id is truncated if it does not fit into the
+// fixed 64-byte scratch buffer.
+static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
+  int num2, int num3) {
+  const size_t kIdCapacity = 64;
+  char formatted_id[kIdCapacity];
+  // snprintf may write at most kIdCapacity - 1 bytes (terminator included);
+  // the final byte of the buffer is set to NUL explicitly afterwards.
+  snprintf(formatted_id, kIdCapacity - 1, "%s_%d_%d_%d", base.c_str(), num1,
+           num2, num3);
+  formatted_id[kIdCapacity - 1] = '\0';
+  STRING& out = *hocr_str;
+  out += " id='";
+  out += formatted_id;
+  out += "'";
+}
+
 static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
                          STRING* hocr_str) {
   int left, top, right, bottom;
@@ -1449,7 +1463,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
   if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
     return nullptr;
 
-  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
+  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, tcnt = 1, gcnt = 1;
   int page_id = page_number + 1;  // hOCR uses 1-based page numbers.
   bool para_is_ltr = true;        // Default direction is LTR
   const char* paragraph_lang = nullptr;
@@ -1529,7 +1543,11 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
     }
 
     // Now, process the word...
-    hocr_str += "<span class='ocrx_word'";
+    std::vector<std::vector<std::pair<const char*, float>>>* confidencemap = nullptr;
+    if (tesseract_->glyph_confidences) {
+      confidencemap = res_it->GetGlyphConfidences();
+    }
+    hocr_str += "\n      <span class='ocrx_word'";
     AddIdTohOCR(&hocr_str, "word", page_id, wcnt);
     int left, top, right, bottom;
     bool bold, italic, underlined, monospace, serif, smallcaps;
@@ -1587,7 +1605,32 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
     } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
     if (italic) hocr_str += "</em>";
     if (bold) hocr_str += "</strong>";
-    hocr_str += "</span> ";
+    // If glyph confidence is required it is added here
+    if (tesseract_->glyph_confidences && confidencemap != nullptr) {
+      for (size_t i = 0; i < confidencemap->size(); i++) {
+        hocr_str += "\n       <span class='ocrx_cinfo'";
+        AddIdTohOCR(&hocr_str, "timestep", page_id, wcnt, tcnt);
+        hocr_str += ">";
+        //*
+        std::vector<std::pair<const char*, float>> timestep = (*confidencemap)[i];
+        for (std::pair<const char*, float> conf : timestep) {
+          hocr_str += "<span class='ocr_glyph'";
+          AddIdTohOCR(&hocr_str, "glyph", page_id, wcnt, gcnt);
+          hocr_str.add_str_int(" title='x_confs ", int(conf.second * 100));
+          hocr_str += "'";
+          hocr_str += ">";
+          hocr_str += conf.first;
+          hocr_str += "</span>";
+          gcnt++;
+        }
+        //*/
+        hocr_str += "</span>";
+        tcnt++;
+      }
+    }
+    hocr_str += "</span>";
+    tcnt = 1;
+    gcnt = 1;
     wcnt++;
     // Close any ending block/paragraph/textline.
     if (last_word_in_line) {

diff --git a/src/ccmain/linerec.cpp b/src/ccmain/linerec.cpp
@@ -239,7 +239,7 @@ void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
   if (im_data == nullptr) return;
   lstm_recognizer_->RecognizeLine(*im_data, true, classify_debug_level > 0,
                                   kWorstDictCertainty / kCertaintyScale,
-                                  word_box, words);
+                                  word_box, words, glyph_confidences);
   delete im_data;
   SearchWords(words);
 }

diff --git a/src/ccmain/resultiterator.cpp b/src/ccmain/resultiterator.cpp
@@ -27,6 +27,8 @@
 #include "tesseractclass.h"
 #include "unicharset.h"
 #include "unicodes.h"
+#include <set>
+#include <vector>
 
 namespace tesseract {
 
@@ -602,6 +604,14 @@ char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
   return result;
 }
 
+// Returns a pointer to the per-timestep glyph confidences stored on the
+// word the iterator currently points at, or nullptr when there is no
+// current word. The returned pointer refers to the word's own `timesteps`
+// member; no copy is made.
+std::vector<std::vector<std::pair<const char*, float>>>* ResultIterator::GetGlyphConfidences() const {
+  if (it_->word() == nullptr) return nullptr;
+  return &it_->word()->timesteps;
+}
+
 void ResultIterator::AppendUTF8WordText(STRING *text) const {
   if (!it_->word()) return;
   ASSERT_HOST(it_->word()->best_choice != nullptr);

diff --git a/src/ccmain/resultiterator.h b/src/ccmain/resultiterator.h
@@ -22,6 +22,8 @@
 #ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
 #define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
 
+#include <set>                  // for std::pair
+#include <vector>               // for std::vector
 #include "ltrresultiterator.h"  // for LTRResultIterator
 #include "platform.h"           // for TESS_API, TESS_LOCAL
 #include "publictypes.h"        // for PageIteratorLevel
@@ -95,6 +97,11 @@ class TESS_API ResultIterator : public LTRResultIterator {
   */
   virtual char* GetUTF8Text(PageIteratorLevel level) const;
 
+  /**
+   * Returns the glyph confidences for every LSTM timestep for the current Word
+  */
+  virtual std::vector<std::vector<std::pair<const char*, float>>>* GetGlyphConfidences() const;
+
   /**
    * Return whether the current paragraph's dominant reading direction
    * is left-to-right (as opposed to right-to-left).

diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
@@ -508,6 +508,9 @@ Tesseract::Tesseract()
       STRING_MEMBER(page_separator, "\f",
                     "Page separator (default is form feed control character)",
                     this->params()),
+      BOOL_MEMBER(glyph_confidences, false,
+                  "Allows to include glyph confidences in the hOCR output",
+                   this->params()),
 
       backup_config_file_(nullptr),
       pix_binary_(nullptr),

diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
@@ -1114,6 +1114,7 @@ class Tesseract : public Wordrec {
              "Preserve multiple interword spaces");
   STRING_VAR_H(page_separator, "\f",
                "Page separator (default is form feed control character)");
+  BOOL_VAR_H(glyph_confidences, false, "Allows to include glyph confidences in the hOCR output");
 
   //// ambigsrecog.cpp /////////////////////////////////////////////////////////
   FILE *init_recog_training(const STRING &fname);

diff --git a/src/ccstruct/pageres.h b/src/ccstruct/pageres.h
@@ -21,6 +21,8 @@
 #define PAGERES_H
 
 #include <cstdint>             // for int32_t, int16_t
+#include <set>                 // for std::pair
+#include <vector>              // for std::vector
 #include <sys/types.h>         // for int8_t
 #include "blamer.h"            // for BlamerBundle (ptr only), IRR_NUM_REASONS
 #include "clst.h"              // for CLIST_ITERATOR, CLISTIZEH
@@ -218,6 +220,8 @@ class WERD_RES : public ELIST_LINK {
   // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
   // blob i and blob i+1.
   GenericVector<int> blob_gaps;
+  // Stores the glyph confidences of every timestep of the lstm
+  std::vector<std::vector<std::pair<const char*, float>>> timesteps;
   // Ratings matrix contains classifier choices for each classified combination
   // of blobs. The dimension is the same as the number of blobs in chopped_word
   // and the leading diagonal corresponds to classifier results of the blobs

diff --git a/src/lstm/lstmrecognizer.cpp b/src/lstm/lstmrecognizer.cpp
@@ -172,7 +172,7 @@ bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) {
 void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert,
                                    bool debug, double worst_dict_cert,
                                    const TBOX& line_box,
-                                   PointerVector<WERD_RES>* words) {
+                                   PointerVector<WERD_RES>* words, bool glyph_confidences) {
   NetworkIO outputs;
   float scale_factor;
   NetworkIO inputs;
@@ -183,9 +183,11 @@ void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert,
     search_ =
         new RecodeBeamSearch(recoder_, null_char_, SimpleTextOutput(), dict_);
   }
-  search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert, nullptr);
+  search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert,
+                  &GetUnicharset(), glyph_confidences);
   search_->ExtractBestPathAsWords(line_box, scale_factor, debug,
-                                  &GetUnicharset(), words);
+                                  &GetUnicharset(), words, 
+                                  glyph_confidences);
 }
 
 // Helper computes min and mean best results in the output.

diff --git a/src/lstm/lstmrecognizer.h b/src/lstm/lstmrecognizer.h
@@ -184,7 +184,8 @@ class LSTMRecognizer {
   // will be used in a dictionary word.
   void RecognizeLine(const ImageData& image_data, bool invert, bool debug,
                      double worst_dict_cert, const TBOX& line_box,
-                     PointerVector<WERD_RES>* words);
+                     PointerVector<WERD_RES>* words, 
+                     bool glyph_confidences = false);
 
   // Helper computes min and mean best results in the output.
   void OutputStats(const NetworkIO& outputs,

diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp
@@ -22,6 +22,8 @@
 #include "networkio.h"
 #include "pageres.h"
 #include "unicharcompress.h"
+#include <set>
+#include <vector>
 
 #include <algorithm>
 
@@ -77,13 +79,18 @@ RecodeBeamSearch::RecodeBeamSearch(const UnicharCompress& recoder,
 // Decodes the set of network outputs, storing the lattice internally.
 void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio,
                               double cert_offset, double worst_dict_cert,
-                              const UNICHARSET* charset) {
+                              const UNICHARSET* charset, bool glyph_confidence) {
   beam_size_ = 0;
   int width = output.Width();
+  if (glyph_confidence) 
+    timesteps.clear();
   for (int t = 0; t < width; ++t) {
     ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]);
     DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert,
                charset);
+    if (glyph_confidence) {
+      SaveMostCertainGlyphs(output.f(t), output.NumFeatures(), charset, t);
+    }
   }
 }
 void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
@@ -98,6 +105,35 @@ void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
   }
 }
 
+// Collects the glyph candidates of a single timestep and appends them, as
+// one entry, to `timesteps`. Every output whose score is at least 0.01 is
+// kept, paired with the text returned by charset->id_to_unichar_ext, and the
+// candidates are ordered by descending score. xCoord is currently unused.
+void RecodeBeamSearch::SaveMostCertainGlyphs(const float* outputs,
+                                             int num_outputs,
+                                             const UNICHARSET* charset,
+                                             int xCoord) {
+  typedef std::pair<const char*, float> GlyphScore;
+  std::vector<GlyphScore> ranked;
+  for (int index = 0; index < num_outputs; ++index) {
+    const float score = outputs[index];
+    // Written as a negated >= so that a NaN score is skipped as well.
+    if (!(score >= 0.01f)) continue;
+    // The last two outputs get an empty label; output 0 is looked up as-is
+    // and every other output is looked up at index + 2.
+    // NOTE(review): the +2 offset presumably skips special unichar ids -
+    // confirm against the model's output layout.
+    const char* label = "";
+    if (index + 2 < num_outputs) {
+      label = charset->id_to_unichar_ext(index > 0 ? index + 2 : index);
+    }
+    // First slot whose score is not greater than the new one, so an equal
+    // score is placed in front of the entries that were added earlier.
+    auto slot = std::lower_bound(
+        ranked.begin(), ranked.end(), score,
+        [](const GlyphScore& entry, float value) {
+          return entry.second > value;
+        });
+    ranked.insert(slot, GlyphScore(label, score));
+  }
+  timesteps.push_back(ranked);
+}
+
 // Returns the best path as labels/scores/xcoords similar to simple CTC.
 void RecodeBeamSearch::ExtractBestPathAsLabels(
     GenericVector<int>* labels, GenericVector<int>* xcoords) const {
@@ -140,7 +176,8 @@ void RecodeBeamSearch::ExtractBestPathAsUnicharIds(
 void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
                                               float scale_factor, bool debug,
                                               const UNICHARSET* unicharset,
-                                              PointerVector<WERD_RES>* words) {
+                                              PointerVector<WERD_RES>* words,
+                                              bool glyph_confidence) {
   words->truncate(0);
   GenericVector<int> unichar_ids;
   GenericVector<float> certs;
@@ -165,6 +202,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
   }
   // Convert labels to unichar-ids.
   int word_end = 0;
+  int timestepEnd = 0;
   float prev_space_cert = 0.0f;
   for (int word_start = 0; word_start < num_ids; word_start = word_end) {
     for (word_end = word_start + 1; word_end < num_ids; ++word_end) {
@@ -188,6 +226,12 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
     WERD_RES* word_res = InitializeWord(
         leading_space, line_box, word_start, word_end,
         std::min(space_cert, prev_space_cert), unicharset, xcoords, scale_factor);
+    if (glyph_confidence) {
+      for (size_t i = timestepEnd; i < xcoords[word_end]; i++) {
+        word_res->timesteps.push_back(timesteps[i]);
+      }
+      timestepEnd = xcoords[word_end];
+    }
     for (int i = word_start; i < word_end; ++i) {
       BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
       BLOB_CHOICE_IT bc_it(choices);
@@ -381,7 +425,7 @@ void RecodeBeamSearch::ComputeTopN(const float* outputs, int num_outputs,
 void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
                                   double dict_ratio, double cert_offset,
                                   double worst_dict_cert,
-                                  const UNICHARSET* charset) {
+                                  const UNICHARSET* charset, bool debug) {
   if (t == beam_.size()) beam_.push_back(new RecodeBeam);
   RecodeBeam* step = beam_[t];
   beam_size_ = t + 1;
@@ -396,7 +440,7 @@ void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
     }
   } else {
     RecodeBeam* prev = beam_[t - 1];
-    if (charset != nullptr) {
+    if (debug) {
       int beam_index = BeamIndex(true, NC_ANYTHING, 0);
       for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
         GenericVector<const RecodeNode*> path;

diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h
@@ -28,6 +28,8 @@
 #include "networkio.h"
 #include "ratngs.h"
 #include "unicharcompress.h"
+#include <set>
+#include <vector>
 
 namespace tesseract {
 
@@ -182,7 +184,8 @@ class RecodeBeamSearch {
   // Decodes the set of network outputs, storing the lattice internally.
   // If charset is not null, it enables detailed debugging of the beam search.
   void Decode(const NetworkIO& output, double dict_ratio, double cert_offset,
-              double worst_dict_cert, const UNICHARSET* charset);
+              double worst_dict_cert, const UNICHARSET* charset,
+              bool glyph_confidence = false);
   void Decode(const GENERIC_2D_ARRAY<float>& output, double dict_ratio,
               double cert_offset, double worst_dict_cert,
               const UNICHARSET* charset);
@@ -201,11 +204,12 @@ class RecodeBeamSearch {
   // Returns the best path as a set of WERD_RES.
   void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor,
                               bool debug, const UNICHARSET* unicharset,
-                              PointerVector<WERD_RES>* words);
+                              PointerVector<WERD_RES>* words, bool glyph_confidence);
 
   // Generates debug output of the content of the beams after a Decode.
   void DebugBeams(const UNICHARSET& unicharset) const;
-
+
+  std::vector< std::vector<std::pair<const char*, float>>> timesteps;
   // Clipping value for certainty inside Tesseract. Reflects the minimum value
   // of certainty that will be returned by ExtractBestPathAsUnicharIds.
   // Supposedly on a uniform scale that can be compared across languages and
@@ -291,7 +295,10 @@ class RecodeBeamSearch {
   // for the current timestep.
   void DecodeStep(const float* outputs, int t, double dict_ratio,
                   double cert_offset, double worst_dict_cert,
-                  const UNICHARSET* charset);
+                  const UNICHARSET* charset, bool debug = false);
+
+  //Saves the most certain glyphs for the current time-step
+  void SaveMostCertainGlyphs(const float* outputs, int num_outputs, const UNICHARSET* charset, int xCoord);
 
   // Adds to the appropriate beams the legal (according to recoder)
   // continuations of context prev, which is from the given index to beams_,