Skip to content

Commit

Permalink
trying to add tessedit_char_whitelist etc. again:
Browse files Browse the repository at this point in the history
- ignore matrix outputs in ComputeTopN if they
  belong to a disabled unichar_id
- pass the UNICHARSET pointer into ComputeTopN so it can check that
- in SetBlackAndWhitelist, also update the unicharset
  of the lstm_recognizer_ instance, if any
  • Loading branch information
bertsky committed Mar 7, 2019
1 parent fe5c82f commit 6ac2ff0
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 4 deletions.
6 changes: 6 additions & 0 deletions src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,12 @@ void Tesseract::SetBlackAndWhitelist() {
unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
tessedit_char_whitelist.string(),
tessedit_char_unblacklist.string());
if (lstm_recognizer_) {
UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (lstm_recognizer_->GetUnicharset());
lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
tessedit_char_whitelist.string(),
tessedit_char_unblacklist.string());
}
// Black and white lists should apply to all loaded classifiers.
for (int i = 0; i < sub_langs_.size(); ++i) {
sub_langs_[i]->unicharset.set_black_and_whitelist(
Expand Down
13 changes: 10 additions & 3 deletions src/lstm/recodebeam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio,
if (lstm_choice_mode)
timesteps.clear();
for (int t = 0; t < width; ++t) {
ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]);
ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0], charset);
DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert,
charset);
if (lstm_choice_mode) {
Expand All @@ -102,7 +102,7 @@ void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
beam_size_ = 0;
int width = output.dim1();
for (int t = 0; t < width; ++t) {
ComputeTopN(output[t], output.dim2(), kBeamWidths[0]);
ComputeTopN(output[t], output.dim2(), kBeamWidths[0], charset);
DecodeStep(output[t], t, dict_ratio, cert_offset, worst_dict_cert, charset);
}
}
Expand Down Expand Up @@ -456,12 +456,19 @@ WERD_RES* RecodeBeamSearch::InitializeWord(bool leading_space,
// Fills top_n_flags_ with bools that are true iff the corresponding output
// is one of the top_n.
void RecodeBeamSearch::ComputeTopN(const float* outputs, int num_outputs,
int top_n) {
int top_n, const UNICHARSET* charset) {
top_n_flags_.init_to_size(num_outputs, TN_ALSO_RAN);
top_code_ = -1;
second_code_ = -1;
top_heap_.clear();
for (int i = 0; i < num_outputs; ++i) {
// Decode label via recoder_.
RecodedCharID code;
code.Set(0, i);
int label = recoder_.DecodeUnichar(code);
if (label != INVALID_UNICHAR_ID && // not part of a bigger code.
!charset->get_enabled(label)) // disabled
continue;
if (top_heap_.size() < top_n || outputs[i] > top_heap_.PeekTop().key) {
TopPair entry(outputs[i], i);
top_heap_.Push(&entry);
Expand Down
2 changes: 1 addition & 1 deletion src/lstm/recodebeam.h
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ class RecodeBeamSearch {

// Fills top_n_flags_ with bools that are true iff the corresponding output
// is one of the top_n.
void ComputeTopN(const float* outputs, int num_outputs, int top_n);
void ComputeTopN(const float* outputs, int num_outputs, int top_n, const UNICHARSET* unicharset);

// Adds the computation for the current time-step to the beam. Call at each
// time-step in sequence from left to right. outputs is the activation vector
Expand Down

0 comments on commit 6ac2ff0

Please sign in to comment.