Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

trying to add tessedit_char_whitelist etc. again: #2294

Merged
merged 8 commits on Apr 6, 2019
6 changes: 6 additions & 0 deletions src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,12 @@ void Tesseract::SetBlackAndWhitelist() {
unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
tessedit_char_whitelist.string(),
tessedit_char_unblacklist.string());
if (lstm_recognizer_) {
bertsky marked this conversation as resolved.
Show resolved Hide resolved
UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (lstm_recognizer_->GetUnicharset());
lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
tessedit_char_whitelist.string(),
tessedit_char_unblacklist.string());
}
// Black and white lists should apply to all loaded classifiers.
for (int i = 0; i < sub_langs_.size(); ++i) {
sub_langs_[i]->unicharset.set_black_and_whitelist(
Expand Down
13 changes: 10 additions & 3 deletions src/lstm/recodebeam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio,
if (lstm_choice_mode)
timesteps.clear();
for (int t = 0; t < width; ++t) {
ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]);
ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0], charset);
DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert,
charset);
if (lstm_choice_mode) {
Expand All @@ -102,7 +102,7 @@ void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
beam_size_ = 0;
int width = output.dim1();
for (int t = 0; t < width; ++t) {
ComputeTopN(output[t], output.dim2(), kBeamWidths[0]);
ComputeTopN(output[t], output.dim2(), kBeamWidths[0], charset);
DecodeStep(output[t], t, dict_ratio, cert_offset, worst_dict_cert, charset);
}
}
Expand Down Expand Up @@ -456,12 +456,19 @@ WERD_RES* RecodeBeamSearch::InitializeWord(bool leading_space,
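// Fills top_n_flags_ with bools that are true iff the corresponding output
// is one of the top_n. An output that decodes on its own (via recoder_) to a
// valid label which is not enabled in charset is skipped and therefore never
// counted among the top_n.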
void RecodeBeamSearch::ComputeTopN(const float* outputs, int num_outputs,
int top_n) {
int top_n, const UNICHARSET* charset) {
top_n_flags_.init_to_size(num_outputs, TN_ALSO_RAN);
top_code_ = -1;
second_code_ = -1;
top_heap_.clear();
for (int i = 0; i < num_outputs; ++i) {
// Decode label via recoder_.
RecodedCharID code;
code.Set(0, i);
int label = recoder_.DecodeUnichar(code);
if (label != INVALID_UNICHAR_ID && // not part of a bigger code.
!charset->get_enabled(label)) // disabled
continue;
if (top_heap_.size() < top_n || outputs[i] > top_heap_.PeekTop().key) {
TopPair entry(outputs[i], i);
top_heap_.Push(&entry);
Expand Down
2 changes: 1 addition & 1 deletion src/lstm/recodebeam.h
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ class RecodeBeamSearch {

// Fills top_n_flags_ with bools that are true iff the corresponding output
// is one of the top_n. Outputs whose decoded label is not enabled in the
// given unicharset are left out of the top_n.
void ComputeTopN(const float* outputs, int num_outputs, int top_n);
void ComputeTopN(const float* outputs, int num_outputs, int top_n, const UNICHARSET* unicharset);

// Adds the computation for the current time-step to the beam. Call at each
// time-step in sequence from left to right. outputs is the activation vector
Expand Down