Skip to content

Commit

Permalink
Added a feature to enrich the hOCR output with glyph confidences
Browse files Browse the repository at this point in the history
By setting the parameter -c glyph_confidences=true, the user can enrich
the hOCR output with additional information. For every timestep of the LSTM,
Tesseract then additionally lists all glyphs that were considered, together
with their confidences.

The format of the hOCR output is slightly changed: There is now a linebreak
after every word for better readability by humans.

Signed-off-by: Noah Metzger <[email protected]>
  • Loading branch information
noahmetzger committed Jul 25, 2018
1 parent 607e8fd commit 91c7504
Show file tree
Hide file tree
Showing 11 changed files with 138 additions and 16 deletions.
49 changes: 46 additions & 3 deletions src/api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
#include <fstream> // for size_t
#include <iostream> // for std::cin
#include <memory> // for std::unique_ptr
#include <set> // for std::pair
#include <vector> // for std::vector
#include "allheaders.h" // for pixDestroy, boxCreate, boxaAddBox, box...
#include "blobclass.h" // for ExtractFontName
#include "boxword.h" // for BoxWord
Expand Down Expand Up @@ -398,6 +400,7 @@ int TessBaseAPI::Init(const char* data, int data_size, const char* language,
return -1;
}
}

PERF_COUNT_SUB("update tesseract_")
// Update datapath and language requested for the last valid initialization.
if (datapath_ == nullptr)
Expand Down Expand Up @@ -1389,6 +1392,17 @@ static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
*hocr_str += "'";
}

// Appends an id attribute of the form " id='<base>_<num1>_<num2>_<num3>'"
// to *hocr_str. The id text is formatted into a fixed-size scratch buffer,
// so an over-long id is truncated instead of overflowing the buffer.
static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
                        int num2, int num3) {
  const size_t kIdCapacity = 64;
  char id_text[kIdCapacity];
  // snprintf is limited to kIdCapacity - 1 bytes (terminator included).
  snprintf(id_text, kIdCapacity - 1, "%s_%d_%d_%d", base.c_str(), num1, num2,
           num3);
  // Keep the last slot of the buffer terminated regardless of the above.
  id_text[kIdCapacity - 1] = '\0';
  // Emit the attribute prefix, the formatted id and the closing quote.
  const char* const pieces[] = {" id='", id_text, "'"};
  for (const char* piece : pieces) {
    *hocr_str += piece;
  }
}

static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
STRING* hocr_str) {
int left, top, right, bottom;
Expand Down Expand Up @@ -1449,7 +1463,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
return nullptr;

int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, tcnt = 1, gcnt = 1;
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
bool para_is_ltr = true; // Default direction is LTR
const char* paragraph_lang = nullptr;
Expand Down Expand Up @@ -1529,7 +1543,11 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
}

// Now, process the word...
hocr_str += "<span class='ocrx_word'";
std::vector<std::vector<std::pair<const char*, float>>>* confidencemap = nullptr;
if (tesseract_->glyph_confidences) {
confidencemap = res_it->GetGlyphConfidences();
}
hocr_str += "\n <span class='ocrx_word'";
AddIdTohOCR(&hocr_str, "word", page_id, wcnt);
int left, top, right, bottom;
bool bold, italic, underlined, monospace, serif, smallcaps;
Expand Down Expand Up @@ -1587,7 +1605,32 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
if (italic) hocr_str += "</em>";
if (bold) hocr_str += "</strong>";
hocr_str += "</span> ";
// If glyph confidence is required it is added here
if (tesseract_->glyph_confidences && confidencemap != nullptr) {
for (size_t i = 0; i < confidencemap->size(); i++) {
hocr_str += "\n <span class='ocrx_cinfo'";
AddIdTohOCR(&hocr_str, "timestep", page_id, wcnt, tcnt);
hocr_str += ">";
//*
std::vector<std::pair<const char*, float>> timestep = (*confidencemap)[i];
for (std::pair<const char*, float> conf : timestep) {
hocr_str += "<span class='ocr_glyph'";
AddIdTohOCR(&hocr_str, "glyph", page_id, wcnt, gcnt);
hocr_str.add_str_int(" title='x_confs ", int(conf.second * 100));
hocr_str += "'";
hocr_str += ">";
hocr_str += conf.first;
hocr_str += "</span>";
gcnt++;
}
//*/
hocr_str += "</span>";
tcnt++;
}
}
hocr_str += "</span>";
tcnt = 1;
gcnt = 1;
wcnt++;
// Close any ending block/paragraph/textline.
if (last_word_in_line) {
Expand Down
2 changes: 1 addition & 1 deletion src/ccmain/linerec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
if (im_data == nullptr) return;
lstm_recognizer_->RecognizeLine(*im_data, true, classify_debug_level > 0,
kWorstDictCertainty / kCertaintyScale,
word_box, words);
word_box, words, glyph_confidences);
delete im_data;
SearchWords(words);
}
Expand Down
10 changes: 10 additions & 0 deletions src/ccmain/resultiterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include "tesseractclass.h"
#include "unicharset.h"
#include "unicodes.h"
#include <set>
#include <vector>

namespace tesseract {

Expand Down Expand Up @@ -602,6 +604,14 @@ char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
return result;
}

// Returns a pointer to the glyph confidences stored in the `timesteps`
// member of the word at the current iterator position, or nullptr when the
// iterator has no current word. The returned pointer refers to the word's
// own data; nothing is copied.
std::vector<std::vector<std::pair<const char*, float>>>*
ResultIterator::GetGlyphConfidences() const {
  auto* current_word = it_->word();
  if (current_word == nullptr) return nullptr;
  return &current_word->timesteps;
}

void ResultIterator::AppendUTF8WordText(STRING *text) const {
if (!it_->word()) return;
ASSERT_HOST(it_->word()->best_choice != nullptr);
Expand Down
7 changes: 7 additions & 0 deletions src/ccmain/resultiterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_

#include <set> // for std::pair
#include <vector> // for std::vector
#include "ltrresultiterator.h" // for LTRResultIterator
#include "platform.h" // for TESS_API, TESS_LOCAL
#include "publictypes.h" // for PageIteratorLevel
Expand Down Expand Up @@ -95,6 +97,11 @@ class TESS_API ResultIterator : public LTRResultIterator {
*/
virtual char* GetUTF8Text(PageIteratorLevel level) const;

/**
* Returns the glyph confidences for every LSTM timestep for the current Word
*/
virtual std::vector<std::vector<std::pair<const char*, float>>>* GetGlyphConfidences() const;

/**
* Return whether the current paragraph's dominant reading direction
* is left-to-right (as opposed to right-to-left).
Expand Down
3 changes: 3 additions & 0 deletions src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,9 @@ Tesseract::Tesseract()
STRING_MEMBER(page_separator, "\f",
"Page separator (default is form feed control character)",
this->params()),
BOOL_MEMBER(glyph_confidences, false,
"Allows to include glyph confidences in the hOCR output",
this->params()),

backup_config_file_(nullptr),
pix_binary_(nullptr),
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -1114,6 +1114,7 @@ class Tesseract : public Wordrec {
"Preserve multiple interword spaces");
STRING_VAR_H(page_separator, "\f",
"Page separator (default is form feed control character)");
BOOL_VAR_H(glyph_confidences, false, "Allows to include glyph confidences in the hOCR output");

//// ambigsrecog.cpp /////////////////////////////////////////////////////////
FILE *init_recog_training(const STRING &fname);
Expand Down
4 changes: 4 additions & 0 deletions src/ccstruct/pageres.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#define PAGERES_H

#include <cstdint> // for int32_t, int16_t
#include <set> // for std::pair
#include <vector> // for std::vector
#include <sys/types.h> // for int8_t
#include "blamer.h" // for BlamerBundle (ptr only), IRR_NUM_REASONS
#include "clst.h" // for CLIST_ITERATOR, CLISTIZEH
Expand Down Expand Up @@ -218,6 +220,8 @@ class WERD_RES : public ELIST_LINK {
// Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
// blob i and blob i+1.
GenericVector<int> blob_gaps;
// Stores the glyph confidences of every timestep of the lstm
std::vector<std::vector<std::pair<const char*, float>>> timesteps;
// Ratings matrix contains classifier choices for each classified combination
// of blobs. The dimension is the same as the number of blobs in chopped_word
// and the leading diagonal corresponds to classifier results of the blobs
Expand Down
8 changes: 5 additions & 3 deletions src/lstm/lstmrecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) {
void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert,
bool debug, double worst_dict_cert,
const TBOX& line_box,
PointerVector<WERD_RES>* words) {
PointerVector<WERD_RES>* words, bool glyph_confidences) {
NetworkIO outputs;
float scale_factor;
NetworkIO inputs;
Expand All @@ -183,9 +183,11 @@ void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert,
search_ =
new RecodeBeamSearch(recoder_, null_char_, SimpleTextOutput(), dict_);
}
search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert, nullptr);
search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert,
&GetUnicharset(), glyph_confidences);
search_->ExtractBestPathAsWords(line_box, scale_factor, debug,
&GetUnicharset(), words);
&GetUnicharset(), words,
glyph_confidences);
}

// Helper computes min and mean best results in the output.
Expand Down
3 changes: 2 additions & 1 deletion src/lstm/lstmrecognizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,8 @@ class LSTMRecognizer {
// will be used in a dictionary word.
void RecognizeLine(const ImageData& image_data, bool invert, bool debug,
double worst_dict_cert, const TBOX& line_box,
PointerVector<WERD_RES>* words);
PointerVector<WERD_RES>* words,
bool glyph_confidences = false);

// Helper computes min and mean best results in the output.
void OutputStats(const NetworkIO& outputs,
Expand Down
52 changes: 48 additions & 4 deletions src/lstm/recodebeam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include "networkio.h"
#include "pageres.h"
#include "unicharcompress.h"
#include <set>
#include <vector>

#include <algorithm>

Expand Down Expand Up @@ -77,13 +79,18 @@ RecodeBeamSearch::RecodeBeamSearch(const UnicharCompress& recoder,
// Decodes the set of network outputs, storing the lattice internally.
// For every timestep t in [0, output.Width()) this runs ComputeTopN followed
// by DecodeStep on output.f(t).
// If glyph_confidence is true, the timesteps member is cleared up front and
// SaveMostCertainGlyphs is called after each DecodeStep, so that timesteps
// receives one entry per decoded timestep.
// charset is forwarded unchanged to DecodeStep and SaveMostCertainGlyphs.
void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio,
double cert_offset, double worst_dict_cert,
const UNICHARSET* charset) {
const UNICHARSET* charset, bool glyph_confidence) {
beam_size_ = 0;
int width = output.Width();
// Discard glyph candidates recorded by an earlier Decode call.
if (glyph_confidence)
timesteps.clear();
for (int t = 0; t < width; ++t) {
ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]);
// DecodeStep's trailing debug argument is not passed here.
DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert,
charset);
if (glyph_confidence) {
// Record this timestep's glyph candidates alongside the beam update.
SaveMostCertainGlyphs(output.f(t), output.NumFeatures(), charset, t);
}
}
}
void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
Expand All @@ -98,6 +105,35 @@ void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
}
}

// Saves the glyph candidates of one timestep and appends them as a new entry
// to the timesteps member.
//
// outputs      Network outputs for this timestep (num_outputs values).
// num_outputs  Number of entries in outputs.
// charset      Used to look up the text of each glyph; it is dereferenced
//              unconditionally, so callers must pass a non-null pointer.
// xCoord       Currently unused.
//
// Only outputs with a value of at least 0.01 are recorded. The recorded
// (text, value) pairs are ordered by descending value; among equal values the
// one with the higher output index comes first.
void RecodeBeamSearch::SaveMostCertainGlyphs(const float* outputs,
                                             int num_outputs,
                                             const UNICHARSET* charset,
                                             int xCoord) {
  (void)xCoord;  // Kept for interface compatibility; not needed here.
  // Outputs below this value are not recorded.
  const float kMinGlyphScore = 0.01f;
  std::vector<std::pair<const char*, float>> glyphs;
  for (int i = 0; i < num_outputs; ++i) {
    const float score = outputs[i];
    // Written as a negated >= so that NaN values are skipped as well.
    if (!(score >= kMinGlyphScore)) continue;

    // Map the output index to a glyph string: index 0 maps to unichar id 0,
    // any other index i maps to id i + 2, and the last two indices map to
    // the empty string.
    // NOTE(review): presumably this offset compensates for special entries in
    // the unicharset versus the network output layout - confirm.
    const char* glyph_text;
    if (i + 2 >= num_outputs) {
      glyph_text = "";
    } else if (i > 0) {
      glyph_text = charset->id_to_unichar_ext(i + 2);
    } else {
      glyph_text = charset->id_to_unichar_ext(i);
    }

    // Keep glyphs ordered by descending score: insert in front of the first
    // entry whose score is not strictly greater than the new one.
    auto insert_at = std::lower_bound(
        glyphs.begin(), glyphs.end(), score,
        [](const std::pair<const char*, float>& entry, float value) {
          return entry.second > value;
        });
    glyphs.emplace(insert_at, glyph_text, score);
  }
  // Hand the vector over without copying its elements.
  timesteps.push_back(std::move(glyphs));
}

// Returns the best path as labels/scores/xcoords similar to simple CTC.
void RecodeBeamSearch::ExtractBestPathAsLabels(
GenericVector<int>* labels, GenericVector<int>* xcoords) const {
Expand Down Expand Up @@ -140,7 +176,8 @@ void RecodeBeamSearch::ExtractBestPathAsUnicharIds(
void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
float scale_factor, bool debug,
const UNICHARSET* unicharset,
PointerVector<WERD_RES>* words) {
PointerVector<WERD_RES>* words,
bool glyph_confidence) {
words->truncate(0);
GenericVector<int> unichar_ids;
GenericVector<float> certs;
Expand All @@ -165,6 +202,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
}
// Convert labels to unichar-ids.
int word_end = 0;
int timestepEnd = 0;
float prev_space_cert = 0.0f;
for (int word_start = 0; word_start < num_ids; word_start = word_end) {
for (word_end = word_start + 1; word_end < num_ids; ++word_end) {
Expand All @@ -188,6 +226,12 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
WERD_RES* word_res = InitializeWord(
leading_space, line_box, word_start, word_end,
std::min(space_cert, prev_space_cert), unicharset, xcoords, scale_factor);
if (glyph_confidence) {
for (size_t i = timestepEnd; i < xcoords[word_end]; i++) {
word_res->timesteps.push_back(timesteps[i]);
}
timestepEnd = xcoords[word_end];
}
for (int i = word_start; i < word_end; ++i) {
BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
BLOB_CHOICE_IT bc_it(choices);
Expand Down Expand Up @@ -381,7 +425,7 @@ void RecodeBeamSearch::ComputeTopN(const float* outputs, int num_outputs,
void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
double dict_ratio, double cert_offset,
double worst_dict_cert,
const UNICHARSET* charset) {
const UNICHARSET* charset, bool debug) {
if (t == beam_.size()) beam_.push_back(new RecodeBeam);
RecodeBeam* step = beam_[t];
beam_size_ = t + 1;
Expand All @@ -396,7 +440,7 @@ void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
}
} else {
RecodeBeam* prev = beam_[t - 1];
if (charset != nullptr) {
if (debug) {
int beam_index = BeamIndex(true, NC_ANYTHING, 0);
for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
GenericVector<const RecodeNode*> path;
Expand Down
15 changes: 11 additions & 4 deletions src/lstm/recodebeam.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
#include "networkio.h"
#include "ratngs.h"
#include "unicharcompress.h"
#include <set>
#include <vector>

namespace tesseract {

Expand Down Expand Up @@ -182,7 +184,8 @@ class RecodeBeamSearch {
// Decodes the set of network outputs, storing the lattice internally.
// If charset is not null, it enables detailed debugging of the beam search.
void Decode(const NetworkIO& output, double dict_ratio, double cert_offset,
double worst_dict_cert, const UNICHARSET* charset);
double worst_dict_cert, const UNICHARSET* charset,
bool glyph_confidence = false);
void Decode(const GENERIC_2D_ARRAY<float>& output, double dict_ratio,
double cert_offset, double worst_dict_cert,
const UNICHARSET* charset);
Expand All @@ -201,11 +204,12 @@ class RecodeBeamSearch {
// Returns the best path as a set of WERD_RES.
void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor,
bool debug, const UNICHARSET* unicharset,
PointerVector<WERD_RES>* words);
PointerVector<WERD_RES>* words, bool glyph_confidence);

// Generates debug output of the content of the beams after a Decode.
void DebugBeams(const UNICHARSET& unicharset) const;


std::vector< std::vector<std::pair<const char*, float>>> timesteps;
// Clipping value for certainty inside Tesseract. Reflects the minimum value
// of certainty that will be returned by ExtractBestPathAsUnicharIds.
// Supposedly on a uniform scale that can be compared across languages and
Expand Down Expand Up @@ -291,7 +295,10 @@ class RecodeBeamSearch {
// for the current timestep.
void DecodeStep(const float* outputs, int t, double dict_ratio,
double cert_offset, double worst_dict_cert,
const UNICHARSET* charset);
const UNICHARSET* charset, bool debug = false);

//Saves the most certain glyphs for the current time-step
void SaveMostCertainGlyphs(const float* outputs, int num_outputs, const UNICHARSET* charset, int xCoord);

// Adds to the appropriate beams the legal (according to recoder)
// continuations of context prev, which is from the given index to beams_,
Expand Down

0 comments on commit 91c7504

Please sign in to comment.