diff --git a/ccmain/Makefile.am b/ccmain/Makefile.am index 9d7326cc89..8fdebedbed 100644 --- a/ccmain/Makefile.am +++ b/ccmain/Makefile.am @@ -9,10 +9,11 @@ AM_CPPFLAGS = \ -I$(top_srcdir)/textord include_HEADERS = \ - control.h cube_reco_context.h \ - docqual.h fixspace.h \ - imgscale.h osdetect.h output.h \ - paramsd.h pgedit.h reject.h scaleimg.h \ + control.h cube_reco_context.h cubeclassifier.h \ + docqual.h equationdetect.h fixspace.h \ + imgscale.h ltrresultiterator.h mutableiterator.h osdetect.h output.h \ + pageiterator.h paragraphs.h paragraphs_internal.h paramsd.h pgedit.h \ + reject.h resultiterator.h scaleimg.h \ tessbox.h tessedit.h tesseractclass.h \ tesseract_cube_combiner.h \ tessvars.h tfacep.h tfacepp.h thresholder.h \ @@ -38,11 +39,13 @@ endif libtesseract_main_la_SOURCES = \ adaptions.cpp applybox.cpp \ - control.cpp cube_control.cpp cube_reco_context.cpp \ - docqual.cpp fixspace.cpp fixxht.cpp \ - imgscale.cpp osdetect.cpp output.cpp pagesegmain.cpp \ - pagewalk.cpp paramsd.cpp pgedit.cpp reject.cpp scaleimg.cpp \ - recogtraining.cpp tesseract_cube_combiner.cpp \ + control.cpp cube_control.cpp cube_reco_context.cpp cubeclassifier.cpp \ + docqual.cpp equationdetect.cpp fixspace.cpp fixxht.cpp \ + imgscale.cpp ltrresultiterator.cpp \ + osdetect.cpp output.cpp pageiterator.cpp pagesegmain.cpp \ + pagewalk.cpp paragraphs.cpp paramsd.cpp pgedit.cpp recogtraining.cpp \ + reject.cpp resultiterator.cpp scaleimg.cpp \ + tesseract_cube_combiner.cpp \ tessbox.cpp tessedit.cpp tesseractclass.cpp tessvars.cpp \ tfacepp.cpp thresholder.cpp \ werdit.cpp diff --git a/ccmain/applybox.cpp b/ccmain/applybox.cpp index 051507f814..2d183eb3ff 100644 --- a/ccmain/applybox.cpp +++ b/ccmain/applybox.cpp @@ -35,9 +35,13 @@ #include "unichar.h" #include "unicharset.h" #include "tesseractclass.h" +#include "genericvector.h" // Max number of blobs to classify together in FindSegmentation. const int kMaxGroupSize = 4; +// Max fraction of median allowed as deviation in xheight before switching +// to median. +const double kMaxXHeightDeviationFraction = 0.125; /************************************************************************* * The box file is assumed to contain box definitions, one per line, of the @@ -107,69 +111,93 @@ static void clear_any_old_text(BLOCK_LIST *block_list) { PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname, bool find_segmentation, BLOCK_LIST *block_list) { - // In word mode, we use the boxes to make a word for each box, but - // in blob mode we use the existing words and maximally chop them first. - PAGE_RES* page_res = find_segmentation ? NULL : SetupApplyBoxes(block_list); int box_count = 0; int box_failures = 0; FILE* box_file = OpenBoxFile(fname); - clear_any_old_text(block_list); - TBOX prev_box, box, next_box; - bool found_box = false; - char text[kBoxReadBufSize]; - do { - prev_box = box; - box = next_box; + TBOX box; + GenericVector<TBOX> boxes; + GenericVector<STRING> texts, full_texts; + + bool found_box = true; + while (found_box) { int line_number = 0; // Line number of the box file. - int x_min; - int y_min; - int x_max; - int y_max; - char next_text[kBoxReadBufSize]; - // Keep a look-ahead box, so we can pass the next box into the resegment - functions.
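// A minimal standalone sketch (not part of this commit) of the box-file
// line format that ReadNextBox() and MakeBoxFileStr() handle above: each
// line is "<utf8 text> <left> <bottom> <right> <top> <page>", with the
// origin at the bottom-left of the image. BoxLine and ParseBoxLine are
// hypothetical names, for illustration only.
#include <cstdio>
#include <string>

struct BoxLine {
  std::string text;              // UTF-8 ground-truth text for the box.
  int left, bottom, right, top;  // Box corners, bottom-left origin.
  int page;                      // Page index in a multi-page image.
};

static bool ParseBoxLine(const char* line, BoxLine* out) {
  char text[256];
  // %255s stops at the first space, matching the "text, then 5 ints" layout.
  if (sscanf(line, "%255s %d %d %d %d %d", text, &out->left, &out->bottom,
             &out->right, &out->top, &out->page) != 6)
    return false;
  out->text = text;
  return true;
}

int main() {
  BoxLine box;
  if (ParseBoxLine("m 85 1005 128 1039 0", &box))
    printf("'%s' page %d: (%d,%d)-(%d,%d)\n", box.text.c_str(), box.page,
           box.left, box.bottom, box.right, box.top);
  return 0;
}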
- found_box = read_next_box(applybox_page, &line_number, box_file, next_text, - &x_min, &y_min, &x_max, &y_max); + STRING text, full_text; + found_box = ReadNextBox(applybox_page, &line_number, box_file, &text, &box); if (found_box) { - next_box = TBOX(ICOORD(x_min, y_min), ICOORD (x_max, y_max)); ++box_count; + MakeBoxFileStr(text.string(), box, applybox_page, &full_text); } else { - next_box = TBOX(); - next_text[0] = '\0'; + full_text = ""; } - if (!box.null_box()) { - bool foundit = false; - if (page_res != NULL) - foundit = ResegmentCharBox(page_res, box, next_box, text); - else - foundit = ResegmentWordBox(block_list, box, next_box, text); - if (!foundit) { - box_failures++; - ReportFailedBox(box_count, box, text, - "FAILURE! Couldn't find a matching blob"); + boxes.push_back(box); + texts.push_back(text); + full_texts.push_back(full_text); + } + + // In word mode, we use the boxes to make a word for each box, but + // in blob mode we use the existing words and maximally chop them first. + PAGE_RES* page_res = find_segmentation ? + NULL : SetupApplyBoxes(boxes, block_list); + clear_any_old_text(block_list); + + for (int i = 0; i < boxes.size() - 1; i++) { + bool foundit = false; + if (page_res != NULL) { + if (i == 0) { + foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1], + full_texts[i].string()); + } else { + foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i], + boxes[i + 1], full_texts[i].string()); } + } else { + foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1], + texts[i].string()); + } + if (!foundit) { + box_failures++; + ReportFailedBox(box_count, boxes[i], texts[i].string(), + "FAILURE! Couldn't find a matching blob"); } - strcpy(text, next_text); - } while (found_box); + } + if (page_res == NULL) { // In word/line mode, we now maximally chop all the words and resegment // them with the classifier. - page_res = SetupApplyBoxes(block_list); + page_res = SetupApplyBoxes(boxes, block_list); ReSegmentByClassification(page_res); } if (applybox_debug > 0) { tprintf("APPLY_BOXES:\n"); tprintf(" Boxes read from boxfile: %6d\n", box_count); - tprintf(" Boxes failed resegmentation: %6d\n", box_failures); + if (box_failures > 0) + tprintf(" Boxes failed resegmentation: %6d\n", box_failures); } TidyUp(page_res); return page_res; } +// Helper computes median xheight in the image. +static double MedianXHeight(BLOCK_LIST *block_list) { + BLOCK_IT block_it(block_list); + STATS xheights(0, block_it.data()->bounding_box().height()); + for (block_it.mark_cycle_pt(); + !block_it.cycled_list(); block_it.forward()) { + ROW_IT row_it(block_it.data()->row_list()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + xheights.add(IntCastRounded(row_it.data()->x_height()), 1); + } + } + return xheights.median(); +} + // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: // All fuzzy spaces are removed, and all the words are maximally chopped. -PAGE_RES* Tesseract::SetupApplyBoxes(BLOCK_LIST *block_list) { +PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes, + BLOCK_LIST *block_list) { + double median_xheight = MedianXHeight(block_list); + double max_deviation = kMaxXHeightDeviationFraction * median_xheight; // Strip all fuzzy space markers to simplify the PAGE_RES.
BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { @@ -177,6 +205,14 @@ PAGE_RES* Tesseract::SetupApplyBoxes(BLOCK_LIST *block_list) { ROW_IT r_it(block->row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { ROW* row = r_it.data(); + float diff = fabs(row->x_height() - median_xheight); + if (diff > max_deviation) { + if (applybox_debug) { + tprintf("row xheight=%g, but median xheight = %g\n", + row->x_height(), median_xheight); + } + row->set_x_height(static_cast<float>(median_xheight)); + } WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); @@ -193,7 +229,8 @@ PAGE_RES* Tesseract::SetupApplyBoxes(BLOCK_LIST *block_list) { PAGE_RES_IT pr_it(page_res); WERD_RES* word_res; while ((word_res = pr_it.word()) != NULL) { - MaximallyChopWord(pr_it.block()->block, pr_it.row()->row, word_res); + MaximallyChopWord(boxes, pr_it.block()->block, + pr_it.row()->row, word_res); pr_it.forward(); } return page_res; @@ -204,6 +241,7 @@ PAGE_RES* Tesseract::SetupApplyBoxes(BLOCK_LIST *block_list) { static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices, const UNICHARSET& unicharset, WERD_CHOICE* word_choice) { + *word_choice = WERD_CHOICE(&unicharset); // clear the word choice. word_choice->make_bad(); for (int i = 0; i < char_choices.size(); ++i) { BLOB_CHOICE_IT it(char_choices[i]); @@ -211,15 +249,21 @@ static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices, word_choice->append_unichar_id(bc->unichar_id(), 1, bc->rating(), bc->certainty()); } - word_choice->populate_unichars(unicharset); + word_choice->populate_unichars(); } // Tests the chopper by exhaustively running chop_one_blob. // The word_res will contain filled chopped_word, seam_array, denorm, // box_word and best_state for the maximally chopped word. -void Tesseract::MaximallyChopWord(BLOCK* block, ROW* row, WERD_RES* word_res) { - if (!word_res->SetupForRecognition(unicharset, false, row, block)) +void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes, + BLOCK* block, ROW* row, + WERD_RES* word_res) { + if (!word_res->SetupForTessRecognition(unicharset, this, BestPix(), false, + this->textord_use_cjk_fp_model, + row, block)) { + word_res->CloneChoppedToRebuild(); return; + } if (chop_debug) { tprintf("Maximally chopping word at:"); word_res->word->bounding_box().print(); @@ -227,7 +271,6 @@ void Tesseract::MaximallyChopWord(BLOCK* block, ROW* row, WERD_RES* word_res) { blob_match_table.init_match_table(); BLOB_CHOICE_LIST *match_result; BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR(); - set_denorm(&word_res->denorm); ASSERT_HOST(word_res->chopped_word->blobs != NULL); float rating = static_cast<float>(MAX_INT8); for (TBLOB* blob = word_res->chopped_word->blobs; blob != NULL; @@ -248,8 +291,16 @@ void Tesseract::MaximallyChopWord(BLOCK* block, ROW* row, WERD_RES* word_res) { inT32 blob_number; int right_chop_index = 0; - while (chop_one_blob(word_res->chopped_word, char_choices, - &blob_number, &word_res->seam_array, &right_chop_index)); + if (!assume_fixed_pitch_char_segment) { + // We only chop if the language is not fixed pitch like CJK.
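// Standalone illustration (assumed, not from the commit) of the xheight
// clamp that SetupApplyBoxes() now applies above: a row whose xheight
// deviates from the page median by more than kMaxXHeightDeviationFraction
// (12.5%) of that median is reset to the median.
#include <cmath>
#include <cstdio>

static float ClampedXHeight(float row_xheight, double median_xheight) {
  const double kMaxDeviationFraction = 0.125;  // Mirrors the new constant.
  double max_deviation = kMaxDeviationFraction * median_xheight;
  if (fabs(row_xheight - median_xheight) > max_deviation)
    return static_cast<float>(median_xheight);
  return row_xheight;
}

int main() {
  // With a median of 32, anything more than 4.0 away gets clamped:
  printf("%g\n", ClampedXHeight(34.0f, 32.0));  // kept: 34
  printf("%g\n", ClampedXHeight(48.0f, 32.0));  // clamped: 32
  return 0;
}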
+ if (prioritize_division) { + while (chop_one_blob2(boxes, word_res, &word_res->seam_array)); + } else { + while (chop_one_blob(word_res->chopped_word, char_choices, + &blob_number, &word_res->seam_array, + &right_chop_index)); + } + } MakeWordChoice(*char_choices, unicharset, word_res->best_choice); MakeWordChoice(*char_choices, unicharset, word_res->raw_choice); word_res->CloneChoppedToRebuild(); @@ -288,7 +339,7 @@ static double BoxMissMetric(const TBOX& box1, const TBOX& box2) { // failing to find an appropriate blob for a box. // This means that occasionally, blobs may be incorrectly segmented if the // chopper fails to find a suitable chop point. -bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, +bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box, const TBOX& box, const TBOX& next_box, const char* correct_text) { if (applybox_debug > 1) { @@ -306,6 +357,7 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, } int word_len = word_res->box_word->length(); for (int i = 0; i < word_len; ++i) { + TBOX char_box = TBOX(); int blob_count = 0; for (blob_count = 0; i + blob_count < word_len; ++blob_count) { TBOX blob_box = word_res->box_word->BlobBox(i + blob_count); @@ -323,8 +375,17 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, } if (current_box_miss_metric > next_box_miss_metric) break; // Blob is a better match for next box. + char_box += blob_box; } if (blob_count > 0) { + if (applybox_debug > 1) { + tprintf("Index [%d, %d) seem good.\n", i, i + blob_count); + } + if (!char_box.almost_equal(box, 3) && + (box.x_gap(next_box) < -3 || + (prev_box != NULL && prev_box->x_gap(box) < -3))) { + return false; + } // We refine just the box_word, best_state and correct_text here. // The rebuild_word is made in TidyUp. // blob_count blobs are put together to match the box. Merge the @@ -354,11 +415,19 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, tprintf("%d ", word_res->best_state[j]); } tprintf("\n"); + tprintf("Correct text = [[ "); + for (int j = 0; j < word_res->correct_text.size(); ++j) { + tprintf("%s ", word_res->correct_text[j].string()); + } + tprintf("]]\n"); } return true; } } } + if (applybox_debug > 0) { + tprintf("FAIL!\n"); + } return false; // Failure. } @@ -433,6 +502,7 @@ bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, } } } + if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n"); return new_word != NULL; } @@ -498,8 +568,8 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text, for (int i = 0; i < word_length; ++i) { for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) { BLOB_CHOICE_LIST* match_result = classify_piece( - word_res->chopped_word->blobs, word_res->seam_array, - i, i + j - 1); + word_res->chopped_word->blobs, word_res->denorm, word_res->seam_array, + i, i + j - 1, word_res->blamer_bundle); if (applybox_debug > 2) { tprintf("%d+%d:", i, j); print_ratings_list("Segment:", match_result, unicharset); @@ -629,7 +699,7 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices, // Counts up the labelled words and the blobs within. // Deletes all unused or emptied words, counting the unused ones. // Resets W_BOL and W_EOL flags correctly. -// Builds the rebuild_word and rebuilds the box_word. +// Builds the rebuild_word and rebuilds the box_word and the best_choice.
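// The loop above decides blob ownership by comparing BoxMissMetric(blob_box,
// box) against BoxMissMetric(blob_box, next_box); the metric's body lies
// outside this hunk. The sketch below shows one plausible area-overlap form
// (the product of the fraction of each box missed by the other): 0 when one
// box covers the other, 1 when they are disjoint. Illustrative only.
#include <algorithm>
#include <cstdio>

struct Box { int left, bottom, right, top; };

static int Area(const Box& b) {
  return std::max(0, b.right - b.left) * std::max(0, b.top - b.bottom);
}

static int OverlapArea(const Box& a, const Box& b) {
  Box o = { std::max(a.left, b.left), std::max(a.bottom, b.bottom),
            std::min(a.right, b.right), std::min(a.top, b.top) };
  return Area(o);
}

static double MissMetric(const Box& a, const Box& b) {
  double overlap = OverlapArea(a, b);
  return (1.0 - overlap / Area(a)) * (1.0 - overlap / Area(b));
}

int main() {
  Box blob = {10, 0, 20, 10}, truth = {12, 0, 22, 10}, next = {22, 0, 30, 10};
  // Smaller is better: the blob matches 'truth' far better than 'next'.
  printf("truth=%g next=%g\n", MissMetric(blob, truth), MissMetric(blob, next));
  return 0;
}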
void Tesseract::TidyUp(PAGE_RES* page_res) { int ok_blob_count = 0; int bad_blob_count = 0; @@ -639,14 +709,21 @@ void Tesseract::TidyUp(PAGE_RES* page_res) { WERD_RES* word_res; for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { int ok_in_word = 0; - for (int i = 0; i < word_res->correct_text.size(); ++i) { + BLOB_CHOICE_LIST_VECTOR char_choices; + for (int i = word_res->correct_text.size() - 1; i >= 0; i--) { if (word_res->correct_text[i].length() > 0) { ++ok_in_word; } + // Since we only need a fake word_res->best_choice, the actual + // unichar_ids do not matter. Which is fortunate, since TidyUp() + // can be called while training Tesseract, at the stage where + // unicharset is not meaningful yet. + char_choices += fake_classify_blob(INVALID_UNICHAR_ID, 1.0, -1.0); } if (ok_in_word > 0) { ok_blob_count += ok_in_word; bad_blob_count += word_res->correct_text.size() - ok_in_word; + MakeWordChoice(char_choices, unicharset, word_res->best_choice); } else { ++unlabelled_words; if (applybox_debug > 0) { @@ -655,6 +732,7 @@ void Tesseract::TidyUp(PAGE_RES* page_res) { } pr_it.DeleteCurrentWord(); } + char_choices.delete_data_pointers(); } pr_it.restart_page(); for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { @@ -665,9 +743,13 @@ void Tesseract::TidyUp(PAGE_RES* page_res) { word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row()); } if (applybox_debug > 0) { - tprintf(" Found %d good blobs and %d unlabelled blobs in %d words.\n", - ok_blob_count, bad_blob_count, ok_word_count); - tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words); + tprintf(" Found %d good blobs.\n", ok_blob_count); + if (bad_blob_count > 0) { + tprintf(" Leaving %d unlabelled blobs in %d words.\n", + bad_blob_count, ok_word_count); + } + if (unlabelled_words > 0) + tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words); } } @@ -684,13 +766,17 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) { PAGE_RES_IT pr_it(page_res); for (WERD_RES *word_res = pr_it.word(); word_res != NULL; word_res = pr_it.forward()) { - WERD_CHOICE* choice = new WERD_CHOICE(word_res->correct_text.size()); + WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set, + word_res->correct_text.size()); for (int i = 0; i < word_res->correct_text.size(); ++i) { - UNICHAR_ID char_id = unicharset.unichar_to_id( - word_res->correct_text[i].string()); + // The part before the first space is the real ground truth, and the + // rest is the bounding box location and page number. 
+ GenericVector<STRING> tokens; + word_res->correct_text[i].split(' ', &tokens); + UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string()); choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f); } - choice->populate_unichars(unicharset); + choice->populate_unichars(); if (word_res->best_choice != NULL) delete word_res->best_choice; word_res->best_choice = choice; @@ -707,7 +793,7 @@ void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) { LearnWord(filename.string(), NULL, word_res); ++word_count; } - tprintf ("Generated training data for %d words\n", word_count); + tprintf("Generated training data for %d words\n", word_count); } diff --git a/ccmain/control.cpp b/ccmain/control.cpp index 1f12eb8d5c..8313e9d717 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -98,7 +98,8 @@ BOOL8 Tesseract::recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res) { inT16 char_qual; inT16 good_char_qual; - classify_word_pass2(word_res, block, row); + classify_word_and_language(&Tesseract::classify_word_pass2, + block, row, word_res); if (tessedit_debug_quality_metrics) { word_char_quality(word_res, row, &char_qual, &good_char_qual); tprintf @@ -121,6 +122,9 @@ BOOL8 Tesseract::recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res) { // Note that this function uses a fixed temporary file for storing the previous // configs, so it is neither thread-safe, nor process-safe, but the assumption // is that it will only be used for one debug window at a time. +// +// Since this function is used for debugging (and not to change OCR results) +// set only debug params from the word config file. bool Tesseract::ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box, const char* word_config, @@ -132,11 +136,15 @@ bool Tesseract::ProcessTargetWord(const TBOX& word_box, FILE* config_fp = fopen(backup_config_file_, "wb"); ParamUtils::PrintParams(config_fp, params()); fclose(config_fp); - ParamUtils::ReadParamsFile(word_config, false, params()); + ParamUtils::ReadParamsFile(word_config, + SET_PARAM_CONSTRAINT_DEBUG_ONLY, + params()); } } else { if (backup_config_file_ != NULL) { - ParamUtils::ReadParamsFile(backup_config_file_, false, params()); + ParamUtils::ReadParamsFile(backup_config_file_, + SET_PARAM_CONSTRAINT_DEBUG_ONLY, + params()); backup_config_file_ = NULL; } } @@ -158,6 +166,7 @@ bool Tesseract::ProcessTargetWord(const TBOX& word_box, * if word_config is not null, the word config file is read for just the * target word(s), otherwise, on pass 2 and beyond ONLY the target words * are processed (Jetsoft modification.) + * Returns false if we cancelled prematurely. * * @param page_res page structure * @param monitor progress monitor * * @param dopasses 0 - all, 1 just pass 1, 2 passes 2 and higher */ -void Tesseract::recog_all_words(PAGE_RES* page_res, +bool Tesseract::recog_all_words(PAGE_RES* page_res, ETEXT_DESC* monitor, const TBOX* target_word_box, const char* word_config, int dopasses) { - // TODO(rays): Normalize the "classify word" interface. For instance: - // (1) word.denorm gets set in word->SetupForRecognition() but does - // not get invoked for cube alone. Maybe it should? - // (2) run_cube() checks whether word->best_choice is NULL, and if - // so determines that "neither cube nor tess have an answer." - // However, if tess gets run at all, the first thing it does is - // call word->SetupForRecognition which inserts a poorly scoring - // best_answer.
So what is the way that an engine (tess or cube) - // says "I don't have an answer": an empty list or a single - // poorly scoring best_answer? - - // reset page iterator - // If we only intend to run cube - run it and return. - if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { - PrepareForCubeOCR(); - mutable_splitter()->Clear(); - run_cube(page_res); - return; - } - // Return if we do not want to run Tesseract. - if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY && - tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED) return; - PAGE_RES_IT page_res_it; - inT16 chars_in_word; - inT16 rejects_in_word; - inT16 blob_quality = 0; - inT16 outline_errs = 0; - inT16 all_char_quality; - inT16 accepted_all_char_quality; inT32 word_index; // current word - int i; if (tessedit_minimal_rej_pass1) { tessedit_test_adaption.set_value (TRUE); tessedit_minimal_rejection.set_value (TRUE); } + // Before the main recognition loop below, walk through the whole page and set + // up fake words. That way, if we run out of time a user will still get the + // expected best_choice and box_words out the end; they'll just be empty. + page_res_it.page_res = page_res; + for (page_res_it.restart_page(); page_res_it.word() != NULL; + page_res_it.forward()) { + page_res_it.word()->SetupFake(unicharset); + } + if (dopasses==0 || dopasses==1) { page_res_it.page_res=page_res; page_res_it.restart_page(); @@ -216,11 +204,16 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, // Clear adaptive classifier at the beginning of the page if it is full. // This is done only at the beginning of the page to ensure that the - // classifier is not reset at an arbitraty point while processing the page, + // classifier is not reset at an arbitrary point while processing the page, // which would cripple Passes 2+ if the reset happens towards the end of - // Pass 1 on a page with very difficul text. + // Pass 1 on a page with very difficult text. // TODO(daria): preemptively clear the classifier if it is almost full. - if (AdaptiveClassifierIsFull()) ResetAdaptiveClassifier(); + if (AdaptiveClassifierIsFull()) ResetAdaptiveClassifierInternal(); + // Now check the sub-langs as well. 
+ for (int i = 0; i < sub_langs_.size(); ++i) { + if (sub_langs_[i]->AdaptiveClassifierIsFull()) + sub_langs_[i]->ResetAdaptiveClassifierInternal(); + } stats_.word_count = 0; if (monitor != NULL) { @@ -243,6 +236,7 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, stats_.good_char_count = 0; stats_.doc_good_char_quality = 0; + most_recently_used_ = this; while (page_res_it.word() != NULL) { set_global_loc_code(LOC_PASS1); word_index++; @@ -252,7 +246,7 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, if (monitor->deadline_exceeded() || (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) - return; + return false; } if (target_word_box && !ProcessTargetWord(page_res_it.word()->word->bounding_box(), @@ -260,8 +254,10 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, page_res_it.forward(); continue; } - classify_word_pass1(page_res_it.word(), page_res_it.row()->row, - page_res_it.block()->block); + classify_word_and_language(&Tesseract::classify_word_pass1, + page_res_it.block()->block, + page_res_it.row()->row, + page_res_it.word()); if (page_res_it.word()->word->flag(W_REP_CHAR)) { fix_rep_char(&page_res_it); page_res_it.forward(); @@ -271,8 +267,7 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, word_dumper(NULL, page_res_it.row()->row, page_res_it.word()); tprintf("Pass1: %s [%s]\n", page_res_it.word()->best_choice->unichar_string().string(), - page_res_it.word()->best_choice-> - debug_string(unicharset).string()); + page_res_it.word()->best_choice->debug_string().string()); } // tessedit_test_adaption enables testing of the accuracy of the @@ -285,7 +280,7 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, } else { // Override rejection mechanisms for this word. UNICHAR_ID space = unicharset.unichar_to_id(" "); - for (i = 0; i < page_res_it.word()->best_choice->length(); i++) { + for (int i = 0; i < page_res_it.word()->best_choice->length(); i++) { if ((page_res_it.word()->best_choice->unichar_id(i) != space) && page_res_it.word()->reject_map[i].rejected()) page_res_it.word()->reject_map[i].setrej_minimal_rej_accept(); @@ -296,15 +291,25 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, // Count dict words. if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) ++(stats_.dict_words); + + // Update misadaption log (we only need to do it on pass 1, since + // adaption only happens on this pass). 
+ if (page_res_it.word()->blamer_bundle != NULL && + page_res_it.word()->blamer_bundle->misadaption_debug.length() > 0) { + page_res->misadaption_log.push_back( + page_res_it.word()->blamer_bundle->misadaption_debug); + } + page_res_it.forward(); } } - if (dopasses == 1) return; + if (dopasses == 1) return true; // ****************** Pass 2 ******************* page_res_it.restart_page(); word_index = 0; + most_recently_used_ = this; while (!tessedit_test_adaption && page_res_it.word() != NULL) { set_global_loc_code(LOC_PASS2); word_index++; @@ -314,7 +319,7 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, if (monitor->deadline_exceeded() || (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) - return; + return false; } // changed by jetsoft @@ -327,8 +332,10 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, } // end jetsoft - classify_word_pass2(page_res_it.word(), page_res_it.block()->block, - page_res_it.row()->row); + classify_word_and_language(&Tesseract::classify_word_pass2, + page_res_it.block()->block, + page_res_it.row()->row, + page_res_it.word()); if (page_res_it.word()->word->flag(W_REP_CHAR) && !page_res_it.word()->done) { fix_rep_char(&page_res_it); @@ -339,37 +346,279 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, word_dumper(NULL, page_res_it.row()->row, page_res_it.word()); tprintf("Pass2: %s [%s]\n", page_res_it.word()->best_choice->unichar_string().string(), - page_res_it.word()->best_choice-> - debug_string(unicharset).string()); + page_res_it.word()->best_choice->debug_string().string()); } page_res_it.forward(); } - // ****************** Pass 3 ******************* - // Fix fuzzy spaces. - set_global_loc_code(LOC_FUZZY_SPACE); + // The next passes can only be run if tesseract has been used, as cube + // doesn't set all the necessary outputs in WERD_RES. + if (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY || + tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { + // ****************** Pass 3 ******************* + // Fix fuzzy spaces. + set_global_loc_code(LOC_FUZZY_SPACE); + + if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces + && !tessedit_word_for_word && !right_to_left()) + fix_fuzzy_spaces(monitor, stats_.word_count, page_res); + + // ****************** Pass 4 ******************* + if (tessedit_enable_bigram_correction) bigram_correction_pass(page_res); + + // ****************** Pass 5,6 ******************* + rejection_passes(page_res, monitor, target_word_box, word_config); + + // ****************** Pass 7 ******************* + // Cube combiner. + // If cube is loaded and its combiner is present, run it. + if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { + run_cube_combiner(page_res); + } + + // ****************** Pass 8 ******************* + font_recognition_pass(page_res); + + // ****************** Pass 9 ******************* + // Check the correctness of the final results. + blamer_pass(page_res); + } + + if (!save_blob_choices) { + // We aren't saving the blob choices so get rid of them now. + // set_blob_choices() does a deep clear. + page_res_it.restart_page(); + while (page_res_it.word() != NULL) { + WERD_RES* word = page_res_it.word(); + word->best_choice->set_blob_choices(NULL); + page_res_it.forward(); + } + } + + // Write results pass. + set_global_loc_code(LOC_WRITE_RESULTS); + // This is now redundant, but retained commented so show how to obtain + // bounding boxes and style information. 
- if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces - && !tessedit_word_for_word && !right_to_left()) - fix_fuzzy_spaces(monitor, stats_.word_count, page_res); + // changed by jetsoft + // needed for dll to output memory structure + if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) + output_pass(page_res_it, target_word_box); + // end jetsoft + PageSegMode pageseg_mode = static_cast<PageSegMode>( + static_cast<int>(tessedit_pageseg_mode)); + textord_.CleanupSingleRowResult(pageseg_mode, page_res); - // ****************** Pass 4 ******************* + if (monitor != NULL) { + monitor->progress = 100; + } + return true; +} + +void Tesseract::bigram_correction_pass(PAGE_RES *page_res) { + PAGE_RES_IT word_it(page_res); + + WERD_RES *w_prev = NULL; + WERD_RES *w = word_it.word(); + if (w && w->best_choice) w->best_choice->populate_unichars(); + while (1) { + w_prev = w; + while (word_it.forward() != NULL && + (!word_it.word() || word_it.word()->part_of_combo)) { + // advance word_it, skipping over parts of combos + } + if (!word_it.word()) break; + w = word_it.word(); + if (w && w->best_choice) + w->best_choice->populate_unichars(); + if (!w || !w_prev || w->uch_set != w_prev->uch_set) { + continue; + } + if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) { + if (tessedit_bigram_debug) { + tprintf("Skipping because one of the words is W_REP_CHAR\n"); + } + continue; + } + // Two words sharing the same language model, excellent! + if (w->alt_choices.empty()) { + if (tessedit_bigram_debug) { + tprintf("Alt choices not set up for word choice: %s\n", + w->best_choice->unichar_string().string()); + } + continue; + } + if (w_prev->alt_choices.empty()) { + if (tessedit_bigram_debug) { + tprintf("Alt choices not set up for word choice: %s\n", + w_prev->best_choice->unichar_string().string()); + } + continue; + } + + // We saved alternate choices, excellent!
+ GenericVector<WERD_CHOICE *> overrides_word1; + GenericVector<GenericVector<int> *> overrides_word1_state; + GenericVector<WERD_CHOICE *> overrides_word2; + GenericVector<GenericVector<int> *> overrides_word2_state; + + STRING orig_w1_str = w_prev->best_choice->unichar_string(); + STRING orig_w2_str = w->best_choice->unichar_string(); + WERD_CHOICE prev_best(w->uch_set); + { + int w1start, w1end; + w_prev->WithoutFootnoteSpan(&w1start, &w1end); + prev_best = w_prev->best_choice->shallow_copy(w1start, w1end); + } + WERD_CHOICE this_best(w->uch_set); + { + int w2start, w2end; + w->WithoutFootnoteSpan(&w2start, &w2end); + this_best = w->best_choice->shallow_copy(w2start, w2end); + } + + if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) { + if (tessedit_bigram_debug) { + tprintf("Top choice \"%s %s\" verified by bigram model.\n", + orig_w1_str.string(), orig_w2_str.string()); + } + continue; + } + if (tessedit_bigram_debug > 2) { + tprintf("Examining alt choices for \"%s %s\".\n", + orig_w1_str.string(), orig_w2_str.string()); + } + if (tessedit_bigram_debug > 1) { + if (w_prev->alt_choices.size() > 1) { + print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices, + false); + } + if (w->alt_choices.size() > 1) { + print_word_alternates_list(w->best_choice, &w->alt_choices, false); + } + } + float best_rating = 0.0; + int best_idx = 0; + for (int i = 0; i < w_prev->alt_choices.size(); i++) { + WERD_CHOICE *p1 = w_prev->alt_choices.get(i); + WERD_CHOICE strip1(w->uch_set); + { + int p1start, p1end; + w_prev->WithoutFootnoteSpan(*p1, w_prev->alt_states.get(i), + &p1start, &p1end); + strip1 = p1->shallow_copy(p1start, p1end); + } + for (int j = 0; j < w->alt_choices.size(); j++) { + WERD_CHOICE *p2 = w->alt_choices.get(j); + WERD_CHOICE strip2(w->uch_set); + { + int p2start, p2end; + w->WithoutFootnoteSpan(*p2, w->alt_states.get(j), &p2start, &p2end); + strip2 = p2->shallow_copy(p2start, p2end); + } + if (w->tesseract->getDict().valid_bigram(strip1, strip2)) { + overrides_word1.push_back(p1); + overrides_word1_state.push_back(&w_prev->alt_states.get(i)); + overrides_word2.push_back(p2); + overrides_word2_state.push_back(&w->alt_states.get(j)); + if (overrides_word1.size() == 1 || + p1->rating() + p2->rating() < best_rating) { + best_rating = p1->rating() + p2->rating(); + best_idx = overrides_word1.size() - 1; + } + } + } + } + if (overrides_word1.size() >= 1) { + // Excellent, we have some bigram matches.
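// The nested search below tries every pair of saved alternate choices and,
// among the pairs the dictionary accepts as a bigram, keeps the one with the
// lowest summed rating (ratings are costs, so lower is better). A compact
// standalone rehearsal of that selection with a stubbed dictionary check
// standing in for Dict::valid_bigram():
#include <cstdio>
#include <string>
#include <vector>

struct Alt { std::string text; float rating; };  // rating is a cost.

static bool ValidBigramStub(const Alt& a, const Alt& b) {
  return a.text == "of" && b.text == "the";  // Toy dictionary.
}

int main() {
  std::vector<Alt> w1 = {{"ot", 9.0f}, {"of", 10.0f}};
  std::vector<Alt> w2 = {{"tho", 8.0f}, {"the", 8.5f}};
  int best_i = -1, best_j = -1;
  float best_rating = 0.0f;
  for (int i = 0; i < static_cast<int>(w1.size()); ++i) {
    for (int j = 0; j < static_cast<int>(w2.size()); ++j) {
      if (!ValidBigramStub(w1[i], w2[j])) continue;
      float rating = w1[i].rating + w2[j].rating;
      if (best_i < 0 || rating < best_rating) {
        best_rating = rating;
        best_i = i;
        best_j = j;
      }
    }
  }
  if (best_i >= 0)
    printf("override: \"%s %s\" (cost %g)\n",
           w1[best_i].text.c_str(), w2[best_j].text.c_str(), best_rating);
  return 0;
}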
+ if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, + *overrides_word1[best_idx]) && + EqualIgnoringCaseAndTerminalPunct(*w->best_choice, + *overrides_word2[best_idx])) { + tprintf("Top choice \"%s %s\" verified (sans case) by bigram model.\n", + orig_w1_str.string(), orig_w2_str.string()); + continue; + } + STRING new_w1_str = overrides_word1[best_idx]->unichar_string(); + STRING new_w2_str = overrides_word2[best_idx]->unichar_string(); + if (new_w1_str != orig_w1_str) { + w_prev->ReplaceBestChoice(*overrides_word1[best_idx], + *overrides_word1_state[best_idx]); + } + if (new_w2_str != orig_w2_str) { + w->ReplaceBestChoice(*overrides_word2[best_idx], + *overrides_word2_state[best_idx]); + } + if (tessedit_bigram_debug > 0) { + STRING choices_description; + int num_bigram_choices + = overrides_word1.size() * overrides_word2.size(); + if (num_bigram_choices == 1) { + choices_description = "This was the unique bigram choice."; + } else { + if (tessedit_bigram_debug > 1) { + STRING bigrams_list; + const int kMaxChoicesToPrint = 20; + int num_choices_printed = 0; + for (int i = 0; i < overrides_word1.size() && + num_choices_printed < kMaxChoicesToPrint; i++) { + for (int j = 0; j < overrides_word2.size() && + num_choices_printed < kMaxChoicesToPrint; j++) { + if (i > 0 || j > 0) { bigrams_list += ", "; } + WERD_CHOICE *p1 = overrides_word1[i]; + WERD_CHOICE *p2 = overrides_word2[j]; + bigrams_list += + p1->unichar_string() + " " + p2->unichar_string(); + num_choices_printed++; + if (num_choices_printed == kMaxChoicesToPrint) { + bigrams_list += " ..."; + } + } + } + choices_description = "There were many choices: {"; + choices_description += bigrams_list; + choices_description += "}"; + } else { + choices_description.add_str_int("There were ", num_bigram_choices); + choices_description += " compatible bigrams."; + } + } + tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", + orig_w1_str.string(), orig_w2_str.string(), + new_w1_str.string(), new_w2_str.string(), + choices_description.string()); + } + } + } +} + +void Tesseract::rejection_passes(PAGE_RES* page_res, + ETEXT_DESC* monitor, + const TBOX* target_word_box, + const char* word_config) { + PAGE_RES_IT page_res_it(page_res); + // ****************** Pass 5 ******************* // Gather statistics on rejects. - page_res_it.restart_page(); - word_index = 0; + int word_index = 0; while (!tessedit_test_adaption && page_res_it.word() != NULL) { set_global_loc_code(LOC_MM_ADAPT); + WERD_RES* word = page_res_it.word(); word_index++; if (monitor != NULL) { monitor->ocr_alive = TRUE; monitor->progress = 95 + 5 * word_index / stats_.word_count; } - check_debug_pt(page_res_it.word(), 70); + if (word->rebuild_word == NULL) { + // Word was not processed by tesseract. 
+ page_res_it.forward(); + continue; + } + check_debug_pt(word, 70); // changed by jetsoft // specific to its needs to extract one word when need if (target_word_box && - !ProcessTargetWord(page_res_it.word()->word->bounding_box(), + !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) { page_res_it.forward(); continue; @@ -377,41 +626,33 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, // end jetsoft page_res_it.rej_stat_word(); - chars_in_word = page_res_it.word()->reject_map.length(); - rejects_in_word = page_res_it.word()->reject_map.reject_count(); + int chars_in_word = word->reject_map.length(); + int rejects_in_word = word->reject_map.reject_count(); - blob_quality = word_blob_quality(page_res_it.word(), - page_res_it.row()->row); + int blob_quality = word_blob_quality(word, page_res_it.row()->row); stats_.doc_blob_quality += blob_quality; - outline_errs = word_outline_errs(page_res_it.word()); + int outline_errs = word_outline_errs(word); stats_.doc_outline_errs += outline_errs; - word_char_quality(page_res_it.word(), - page_res_it.row()->row, + inT16 all_char_quality; + inT16 accepted_all_char_quality; + word_char_quality(word, page_res_it.row()->row, &all_char_quality, &accepted_all_char_quality); stats_.doc_char_quality += all_char_quality; - uinT8 permuter_type = page_res_it.word()->best_choice->permuter(); + uinT8 permuter_type = word->best_choice->permuter(); if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) || (permuter_type == USER_DAWG_PERM)) { stats_.good_char_count += chars_in_word - rejects_in_word; stats_.doc_good_char_quality += accepted_all_char_quality; } - check_debug_pt(page_res_it.word(), 80); + check_debug_pt(word, 80); if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) - page_res_it.word()->reject_map.rej_word_bad_quality(); - check_debug_pt(page_res_it.word(), 90); + word->reject_map.rej_word_bad_quality(); + check_debug_pt(word, 90); page_res_it.forward(); } - // ****************** Pass 5 ******************* - // If cube is loaded and its combiner is present, run it. - if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { - PrepareForCubeOCR(); - mutable_splitter()->Clear(); - run_cube(page_res); - } - if (tessedit_debug_quality_metrics) { tprintf ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f" @@ -445,29 +686,177 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, set_global_loc_code(LOC_DOC_BLK_REJ); quality_based_rejection(page_res_it, good_quality_doc); } +} - // ****************** Pass 7 ******************* - font_recognition_pass(page_res_it); - - // Write results pass. - set_global_loc_code(LOC_WRITE_RESULTS); - // This is now redundant, but retained commented so show how to obtain - // bounding boxes and style information. 
+void Tesseract::blamer_pass(PAGE_RES* page_res) { + if (!wordrec_run_blamer) return; + PAGE_RES_IT page_res_it(page_res); + for (page_res_it.restart_page(); page_res_it.word() != NULL; + page_res_it.forward()) { + WERD_RES *word = page_res_it.word(); + if (word->blamer_bundle == NULL) { + word->blamer_bundle = new BlamerBundle(); + word->blamer_bundle->incorrect_result_reason = IRR_PAGE_LAYOUT; + word->blamer_bundle->debug = word->blamer_bundle->IncorrectReason(); + word->blamer_bundle->debug += " to blame"; + } else if (word->blamer_bundle->incorrect_result_reason == + IRR_NO_TRUTH) { + word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth", + word->best_choice, wordrec_debug_blamer); + } else { + bool correct = ChoiceIsCorrect(*word->uch_set, word->best_choice, + word->blamer_bundle->truth_text); + IncorrectResultReason irr = + word->blamer_bundle->incorrect_result_reason; + if (irr == IRR_CORRECT && !correct) { + STRING debug = "Choice is incorrect after recognition"; + word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug, + word->best_choice, + wordrec_debug_blamer); + } else if (irr != IRR_CORRECT && correct) { + if (wordrec_debug_blamer) { + tprintf("Corrected %s\n", word->blamer_bundle->debug.string()); + } + word->blamer_bundle->incorrect_result_reason = IRR_CORRECT; + word->blamer_bundle->debug = ""; + } + } + page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason]++; + } + tprintf("Blame reasons:\n"); + for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) { + tprintf("%s %d\n", BlamerBundle::IncorrectReasonName( + static_cast<IncorrectResultReason>(bl)), + page_res->blame_reasons[bl]); + } + if (page_res->misadaption_log.length() > 0) { + tprintf("Misadaption log:\n"); + for (int i = 0; i < page_res->misadaption_log.length(); ++i) { + tprintf("%s\n", page_res->misadaption_log[i].string()); + } + } +} - +// Helper returns true if the new_word is better than the word, using a +// simple test of better certainty AND rating (to reduce false positives +// from cube) or a dictionary vs non-dictionary word. +static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word) { + if (new_word.best_choice == NULL) { + return false; // New one no good. + } + if (word.best_choice == NULL) { + return true; // Old one no good. + } + if (new_word.best_choice->certainty() > word.best_choice->certainty() && + new_word.best_choice->rating() < word.best_choice->rating()) { + return true; // New word has better confidence. + } + if (!Dict::valid_word_permuter(word.best_choice->permuter(), false) && + Dict::valid_word_permuter(new_word.best_choice->permuter(), false)) { + return true; // New word is from a dictionary. + } + return false; // New word is no better. +} - if (monitor != NULL) { - monitor->progress = 100; +// Helper to recognize the word using the given (language-specific) tesseract. +// Returns true if the result was better than previously. +bool Tesseract::RetryWithLanguage(WERD_RES *word, BLOCK* block, ROW *row, + WordRecognizer recognizer) { + if (classify_debug_level || cube_debug_level) { + tprintf("Retrying word using lang %s, oem %d\n", + lang.string(), static_cast<int>(tessedit_ocr_engine_mode)); } + // Setup a trial WERD_RES in which to classify.
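// A short rehearsal (illustrative, not from the commit) of the acceptance
// policy NewWordBetter() implements above: a retried result replaces the
// original only if it wins on BOTH certainty (higher is better) and rating
// (a cost, lower is better), or if it is a dictionary word when the
// original is not.
#include <cstdio>

struct Result { float certainty; float rating; bool dict_word; bool valid; };

static bool NewBetter(const Result& old_r, const Result& new_r) {
  if (!new_r.valid) return false;  // New one no good.
  if (!old_r.valid) return true;   // Old one no good.
  if (new_r.certainty > old_r.certainty && new_r.rating < old_r.rating)
    return true;                   // Better confidence on both axes.
  if (!old_r.dict_word && new_r.dict_word)
    return true;                   // Dictionary beats non-dictionary.
  return false;
}

int main() {
  Result old_r = {-4.0f, 20.0f, false, true};
  Result new_r = {-2.5f, 15.0f, false, true};
  printf("replace=%d\n", NewBetter(old_r, new_r));  // 1: wins both axes.
  new_r.rating = 25.0f;  // Better certainty but worse rating, no dict help.
  printf("replace=%d\n", NewBetter(old_r, new_r));  // 0
  return 0;
}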
+ WERD_RES lang_word; + lang_word.InitForRetryRecognition(*word); + // Run the recognizer on the word. + // Initial version is a bit of a hack based on better certainty and rating + // (to reduce false positives from cube) or a dictionary vs non-dictionary + // word. + (this->*recognizer)(block, row, &lang_word); + bool new_is_better = NewWordBetter(*word, lang_word); + if (classify_debug_level || cube_debug_level) { + if (lang_word.best_choice == NULL) { + tprintf("New result %s better:%s\n", + new_is_better ? "IS" : "NOT"); + } else { + tprintf("New result %s better:%s, r=%g, c=%g\n", + new_is_better ? "IS" : "NOT", + lang_word.best_choice->unichar_string().string(), + lang_word.best_choice->rating(), + lang_word.best_choice->certainty()); + } + } + if (new_is_better) { + word->ConsumeWordResults(&lang_word); + } + return new_is_better; } +// Generic function for classifying a word. Can be used either for pass1 or +// pass2 according to the function passed to recognizer. +// word block and row are the current location in the document's PAGE_RES. +// Recognizes in the current language, and if successful that is all. +// If recognition was not successful, tries all available languages until +// it gets a successful result or runs out of languages. Keeps the best result. +void Tesseract::classify_word_and_language(WordRecognizer recognizer, + BLOCK* block, + ROW *row, + WERD_RES *word) { + if (classify_debug_level || cube_debug_level) { + tprintf("Processing word with lang %s at:", + most_recently_used_->lang.string()); + word->word->bounding_box().print(); + } + const char* result_type = "Initial"; + bool initially_done = !word->tess_failed && word->done; + if (initially_done) { + // If done on pass1, we reuse the tesseract that did it, and don't try + // any more. The only need to call the classifier at all is for the + // cube combiner and xheight fixing (which may be bogus on a done word.) + most_recently_used_ = word->tesseract; + result_type = "Already done"; + } + (most_recently_used_->*recognizer)(block, row, word); + if (!word->tess_failed && word->tess_accepted) + result_type = "Accepted"; + if (classify_debug_level || cube_debug_level) { + tprintf("%s result: %s r=%g, c=%g, accepted=%d, adaptable=%d\n", + result_type, + word->best_choice->unichar_string().string(), + word->best_choice->rating(), + word->best_choice->certainty(), + word->tess_accepted, word->tess_would_adapt); + } + if (word->tess_failed || !word->tess_accepted) { + // Try all the other languages to see if they are any better. + Tesseract* previous_used = most_recently_used_; + if (most_recently_used_ != this) { + if (classify_debug_level) { + tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string()); + } + if (RetryWithLanguage(word, block, row, recognizer)) { + most_recently_used_ = this; + if (!word->tess_failed && word->tess_accepted) + return; // No need to look at the others. + } + } + + for (int i = 0; i < sub_langs_.size(); ++i) { + if (sub_langs_[i] != previous_used) { + if (classify_debug_level) { + tprintf("Retrying with sub-Tesseract[%d] lang: %s\n", + i, sub_langs_[i]->lang.string()); + } + if (sub_langs_[i]->RetryWithLanguage(word, block, row, recognizer)) { + most_recently_used_ = sub_langs_[i]; + if (!word->tess_failed && word->tess_accepted) + return; // No need to look at the others. + } + } + } + } +} /** * classify_word_pass1 @@ -475,9 +864,13 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, * Baseline normalize the word and pass it to Tess. 
*/ -void Tesseract::classify_word_pass1(WERD_RES *word, // word to do - ROW *row, - BLOCK* block) { +void Tesseract::classify_word_pass1(BLOCK* block, ROW *row, WERD_RES *word) { + // If we only intend to run cube - run it and return. + if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { + cube_word_pass1(block, row, word); + return; + } + BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST(); BOOL8 adapt_ok; const char *rejmap; @@ -485,8 +878,10 @@ void Tesseract::classify_word_pass1(WERD_RES *word, // word to do STRING mapstr = ""; check_debug_pt(word, 0); - if (word->SetupForRecognition(unicharset, classify_bln_numeric_mode, - row, block)) + if (word->SetupForTessRecognition(unicharset, this, BestPix(), + classify_bln_numeric_mode, + this->textord_use_cjk_fp_model, + row, block)) tess_segment_pass1(word, blob_choices); if (!word->tess_failed) { /* @@ -502,12 +897,12 @@ void Tesseract::classify_word_pass1(WERD_RES *word, // word to do if (!word->word->flag(W_REP_CHAR)) { // TODO(daria) delete these hacks when replaced by more generic code. // Convert '' (double single) to " (single double). - fix_quotes(word, blob_choices); + word->fix_quotes(blob_choices); if (tessedit_fix_hyphens) // turn -- to - - fix_hyphens(word, blob_choices); + word->fix_hyphens(blob_choices); word->tess_accepted = tess_acceptable_word(word->best_choice, - word->raw_choice); + word->raw_choice); word->tess_would_adapt = word->best_choice && word->raw_choice && AdaptableWord(word->rebuild_word, @@ -534,9 +929,24 @@ void Tesseract::classify_word_pass1(WERD_RES *word, // word to do rejmap = mapstr.string(); } // Send word to adaptive classifier for training. - word->BestChoiceToCorrectText(unicharset); + word->BestChoiceToCorrectText(); set_word_fonts(word, blob_choices); LearnWord(NULL, rejmap, word); + // Mark misadaptions if running blamer. + if (word->blamer_bundle != NULL && + word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH && + !ChoiceIsCorrect(*word->uch_set, word->best_choice, + word->blamer_bundle->truth_text)) { + word->blamer_bundle->misadaption_debug ="misadapt to word ("; + word->blamer_bundle->misadaption_debug += + word->best_choice->permuter_name(); + word->blamer_bundle->misadaption_debug += "): "; + word->blamer_bundle->FillDebugString( + "", word->best_choice, &(word->blamer_bundle->misadaption_debug)); + if (wordrec_debug_blamer) { + tprintf("%s\n", word->blamer_bundle->misadaption_debug.string()); + } + } } if (tessedit_enable_doc_dict) @@ -548,52 +958,16 @@ void Tesseract::classify_word_pass1(WERD_RES *word, // word to do word->best_choice->set_blob_choices(blob_choices); } -// Helper to switch between the original and new xht word or to discard -// the new xht word, according to accept_new_word. -static void SwitchWordOrDiscard(bool accept_new_word, WERD_RES* word, - WERD_RES* new_word) { - if (accept_new_word) { - // The new_word is deemed superior so put the final results in the real - // word and destroy the old results. 
- word->denorm = new_word->denorm; - delete word->chopped_word; - word->chopped_word = new_word->chopped_word; - new_word->chopped_word = NULL; - delete word->rebuild_word; - word->rebuild_word = new_word->rebuild_word; - new_word->rebuild_word = NULL; - delete word->box_word; - word->box_word = new_word->box_word; - new_word->box_word = NULL; - free_seam_list(word->seam_array); - word->seam_array = new_word->seam_array; - new_word->seam_array = NULL; - word->best_state.move(&new_word->best_state); - word->correct_text.move(&new_word->correct_text); - delete word->best_choice; - word->best_choice = new_word->best_choice; - new_word->best_choice = NULL; - delete word->raw_choice; - word->raw_choice = new_word->raw_choice; - new_word->raw_choice = NULL; - word->reject_map = new_word->reject_map; - word->CopySimpleFields(*new_word); - } else { - // The new_word is no better, so destroy it and cleanup. - new_word->ClearResults(); - } -} - // Helper to report the result of the xheight fix. void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES* word, WERD_RES* new_word) { tprintf("New XHT Match:%s = %s ", word->best_choice->unichar_string().string(), - word->best_choice->debug_string(unicharset).string()); + word->best_choice->debug_string().string()); word->reject_map.print(debug_fp); tprintf(" -> %s = %s ", new_word->best_choice->unichar_string().string(), - new_word->best_choice->debug_string(unicharset).string()); + new_word->best_choice->debug_string().string()); new_word->reject_map.print(debug_fp); tprintf(" %s->%s %s %s\n", word->guessed_x_ht ? "GUESS" : "CERT", @@ -614,6 +988,10 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) { float new_x_ht = ComputeCompatibleXheight(word); if (new_x_ht > 0.0f) { WERD_RES new_x_ht_word(word->word); + if (word->blamer_bundle != NULL) { + new_x_ht_word.blamer_bundle = new BlamerBundle(); + new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle)); + } new_x_ht_word.x_height = new_x_ht; new_x_ht_word.caps_height = 0.0; match_word_pass2(&new_x_ht_word, row, block); @@ -638,9 +1016,10 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) { ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word); } } - SwitchWordOrDiscard(accept_new_x_ht, word, &new_x_ht_word); - if (accept_new_x_ht) + if (accept_new_x_ht) { + word->ConsumeWordResults(&new_x_ht_word); return true; + } } return false; } @@ -651,7 +1030,12 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) { * Control what to do with the word in pass 2 */ -void Tesseract::classify_word_pass2(WERD_RES *word, BLOCK* block, ROW *row) { +void Tesseract::classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word) { + // Return if we do not want to run Tesseract. 
+ if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY && + tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED) + return; + bool done_this_pass = false; set_global_subloc_code(SUBLOC_NORM); check_debug_pt(word, 30); @@ -691,12 +1075,12 @@ void Tesseract::classify_word_pass2(WERD_RES *word, BLOCK* block, ROW *row) { if (num_upper > 0 && num_lower == 0) word->small_caps = true; } - word->SetScriptPositions(unicharset); + word->SetScriptPositions(); set_global_subloc_code(SUBLOC_NORM); } #ifndef GRAPHICS_DISABLED - if (tessedit_draw_outwords) { + if (tessedit_display_outwords) { if (fx_win == NULL) create_fx_win(); clear_fx_win(); @@ -707,7 +1091,6 @@ void Tesseract::classify_word_pass2(WERD_RES *word, BLOCK* block, ROW *row) { ScrollView::Update(); } #endif - set_global_subloc_code(SUBLOC_NORM); check_debug_pt(word, 50); } @@ -724,21 +1107,23 @@ void Tesseract::match_word_pass2(WERD_RES *word, //word to do BLOCK* block) { BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST(); - if (word->SetupForRecognition(unicharset, classify_bln_numeric_mode, - row, block)) + if (word->SetupForTessRecognition(unicharset, this, BestPix(), + classify_bln_numeric_mode, + this->textord_use_cjk_fp_model, + row, block)) tess_segment_pass2(word, blob_choices); if (!word->tess_failed) { if (!word->word->flag (W_REP_CHAR)) { - fix_quotes(word, blob_choices); + word->fix_quotes(blob_choices); if (tessedit_fix_hyphens) - fix_hyphens(word, blob_choices); + word->fix_hyphens(blob_choices); /* Dont trust fix_quotes! - though I think I've fixed the bug */ if (word->best_choice->length() != word->box_word->length() || word->best_choice->length() != blob_choices->length()) { tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;" " #Blobs=%d; #Choices=%d\n", - word->best_choice->debug_string(unicharset).string(), + word->best_choice->debug_string().string(), word->best_choice->length(), word->box_word->length(), blob_choices->length()); @@ -752,6 +1137,7 @@ void Tesseract::match_word_pass2(WERD_RES *word, //word to do // Save best choices in the WERD_CHOICE if needed word->best_choice->set_blob_choices(blob_choices); + set_word_fonts(word, blob_choices); assert (word->raw_choice != NULL); } @@ -823,7 +1209,7 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) { const WERD_CHOICE &word = *(word_res->best_choice); // Find the frequency of each unique character in the word. - UNICHAR_ID space = unicharset.unichar_to_id(" "); + UNICHAR_ID space = word_res->uch_set->unichar_to_id(" "); SortHelper rep_ch(word.length()); for (int i = 0; i < word.length(); ++i) { if (word.unichar_id(i) != space) @@ -837,7 +1223,7 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) { BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res); if (best_choice == NULL) { tprintf("Failed to find a choice for %s, occurring %d times\n", - unicharset.debug_str(maxch_id).string(), max_count); + word_res->uch_set->debug_str(maxch_id).string(), max_count); return; } word_res->done = TRUE; @@ -862,7 +1248,7 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) { } else { // Just correct existing classification. 
CorrectRepcharChoices(best_choice, word_res); - word_res->best_choice->populate_unichars(unicharset); + word_res->best_choice->populate_unichars(); word_res->reject_map.initialise(word.length()); } } @@ -884,109 +1270,27 @@ void Tesseract::ExplodeRepeatedWord(BLOB_CHOICE* best_choice, bool last_blob = blob_it.at_last(); WERD* blob_word = werd->ConstructFromSingleBlob(first_blob, last_blob, blob_it.extract()); - WERD_RES* rep_word = page_res_it->InsertCloneWord(*word_res, blob_word); + // Note that blamer_bundle (truth information) is not copied, which is + // desirable, since the newly inserted words would not have the original + // bounding box corresponding to the one recorded in truth fields. + WERD_RES* rep_word = + page_res_it->InsertSimpleCloneWord(*word_res, blob_word); // Setup the single char WERD_RES - rep_word->SetupForRecognition(unicharset, false, page_res_it->row()->row, - page_res_it->block()->block); - rep_word->CloneChoppedToRebuild(); - BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice); - rep_word->FakeClassifyWord(unicharset, 1, &blob_choice); + if (rep_word->SetupForTessRecognition(*word_res->uch_set, this, BestPix(), + false, + this->textord_use_cjk_fp_model, + page_res_it->row()->row, + page_res_it->block()->block)) { + rep_word->CloneChoppedToRebuild(); + BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice); + rep_word->FakeClassifyWord(1, &blob_choice); + } } page_res_it->DeleteCurrentWord(); } -// TODO(tkielbus) Decide between keeping this behavior here or modifying the -// training data. - -// Utility function for fix_quotes -// Return true if the next character in the string (given the UTF8 length in -// bytes) is a quote character. -static int is_simple_quote(const char* signed_str, int length) { - const unsigned char* str = - reinterpret_cast<const unsigned char*>(signed_str); - //standard 1 byte quotes - return (length == 1 && (*str == '\'' || *str == '`')) || - //utf8 3 bytes curved quotes - (length == 3 && ((*str == 0xe2 && - *(str + 1) == 0x80 && - *(str + 2) == 0x98) || - (*str == 0xe2 && - *(str + 1) == 0x80 && - *(str + 2) == 0x99))); -} - -// Callback helper for fix_quotes returns a double quote if both -// arguments are quote, otherwise INVALID_UNICHAR_ID. -UNICHAR_ID Tesseract::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) { - const char *ch = unicharset.id_to_unichar(id1); - const char *next_ch = unicharset.id_to_unichar(id2); - if (is_simple_quote(ch, strlen(ch)) && - is_simple_quote(next_ch, strlen(next_ch))) - return unicharset.unichar_to_id("\""); - return INVALID_UNICHAR_ID; -} - -/** - * fix_quotes - * - * Change pairs of quotes to double quotes. - */ -void Tesseract::fix_quotes(WERD_RES* word_res, - BLOB_CHOICE_LIST_CLIST* blob_choices) { - if (!unicharset.contains_unichar("\"") || - !unicharset.get_enabled(unicharset.unichar_to_id("\""))) - return; // Don't create it if it is disallowed. - - word_res->ConditionalBlobMerge( - unicharset, - NewPermanentTessCallback(this, &Tesseract::BothQuotes), - NULL, - blob_choices); -} - -// Callback helper for fix_hyphens returns UNICHAR_ID of - if both -// arguments are hyphen, otherwise INVALID_UNICHAR_ID.
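// The removed is_simple_quote() above matches ASCII ' and ` plus the UTF-8
// curly single quotes: U+2018 and U+2019 encode to the three-byte sequences
// E2 80 98 and E2 80 99, which is exactly what the byte comparisons test.
// A short self-check of those encodings (assumes a UTF-8 execution charset):
#include <cstdio>
#include <cstring>

int main() {
  const unsigned char left[] = {0xe2, 0x80, 0x98};   // U+2018
  const unsigned char right[] = {0xe2, 0x80, 0x99};  // U+2019
  printf("left ok=%d right ok=%d\n",
         memcmp(left, "\u2018", 3) == 0, memcmp(right, "\u2019", 3) == 0);
  return 0;
}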
-UNICHAR_ID Tesseract::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) { - const char *ch = unicharset.id_to_unichar(id1); - const char *next_ch = unicharset.id_to_unichar(id2); - if (strlen(ch) == 1 && strlen(next_ch) == 1 && - (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~')) - return unicharset.unichar_to_id("-"); - return INVALID_UNICHAR_ID; -} - -// Callback helper for fix_hyphens returns true if box1 and box2 overlap -// (assuming both on the same textline, are in order and a chopped em dash.) -bool Tesseract::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) { - return box1.right() >= box2.left(); -} - -/** - * fix_hyphens - * - * Change pairs of hyphens to a single hyphen if the bounding boxes touch - * Typically a long dash which has been segmented. - */ -void Tesseract::fix_hyphens(WERD_RES *word_res, - BLOB_CHOICE_LIST_CLIST *blob_choices) { - if (!unicharset.contains_unichar("-") || - !unicharset.get_enabled(unicharset.unichar_to_id("-"))) - return; // Don't create it if it is disallowed. - - word_res->ConditionalBlobMerge( - unicharset, - NewPermanentTessCallback(this, &Tesseract::BothHyphens), - NewPermanentTessCallback(this, &Tesseract::HyphenBoxesOverlap), - blob_choices); -} -} // namespace tesseract - - - -namespace tesseract { - -ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const char *s, - const char *lengths) { +ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string( + const UNICHARSET& char_set, const char *s, const char *lengths) { int i = 0; int offset = 0; int leading_punct_count; @@ -999,22 +1303,20 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const char *s, /* Single Leading punctuation char*/ - if ((s[offset] != '\0') && (STRING (chs_leading_punct).contains (s[offset]))) + if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset])) offset += lengths[i++]; leading_punct_count = i; /* Initial cap */ - while ((s[offset] != '\0') && - unicharset.get_isupper(s + offset, lengths[i])) { + while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) { offset += lengths[i++]; upper_count++; } - if (upper_count > 1) + if (upper_count > 1) { word_type = AC_UPPER_CASE; - else { + } else { /* Lower case word, possibly with an initial cap */ - while ((s[offset] != '\0') && - unicharset.get_islower (s + offset, lengths[i])) { + while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) { offset += lengths[i++]; } if (i - leading_punct_count < quality_min_initial_alphas_reqd) @@ -1028,14 +1330,13 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const char *s, offset += lengths[i++]; if (s[offset] != '\0') { while ((s[offset] != '\0') && - unicharset.get_islower(s + offset, lengths[i])) { + char_set.get_islower(s + offset, lengths[i])) { offset += lengths[i++]; } if (i < hyphen_pos + 3) goto not_a_word; } - } - else { + } else { /* Allow "'s" in NON hyphenated lower case words */ if (lengths[i] == 1 && (s[offset] == '\'') && lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) { @@ -1050,12 +1351,12 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const char *s, } /* Up to two different, constrained trailing punctuation chars */ - if (lengths[i] == 1 && (s[offset] != '\0') && - (STRING (chs_trailing_punct1).contains (s[offset]))) + if (lengths[i] == 1 && s[offset] != '\0' && + STRING(chs_trailing_punct1).contains(s[offset])) offset += lengths[i++]; - if (lengths[i] == 1 && (s[offset] != '\0') && i > 0 && - (s[offset - lengths[i - 1]] != s[offset]) && - (STRING 
(chs_trailing_punct2).contains (s[offset]))) + if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && + s[offset - lengths[i - 1]] != s[offset] && + STRING(chs_trailing_punct2).contains (s[offset])) offset += lengths[i++]; if (s[offset] != '\0') @@ -1067,20 +1368,20 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const char *s, /* Look for abbreviation string */ i = 0; offset = 0; - if (s[0] != '\0' && unicharset.get_isupper (s, lengths[0])) { + if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) { word_type = AC_UC_ABBREV; - while ((s[offset] != '\0') && - unicharset.get_isupper(s + offset, lengths[i]) && - (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) { + while (s[offset] != '\0' && + char_set.get_isupper(s + offset, lengths[i]) && + lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { offset += lengths[i++]; offset += lengths[i++]; } } - else if (s[0] != '\0' && unicharset.get_islower (s, lengths[0])) { + else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) { word_type = AC_LC_ABBREV; - while ((s[offset] != '\0') && - unicharset.get_islower(s + offset, lengths[i]) && - (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) { + while (s[offset] != '\0' && + char_set.get_islower(s + offset, lengths[i]) && + lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { offset += lengths[i++]; offset += lengths[i++]; } @@ -1229,12 +1530,10 @@ void Tesseract::set_word_fonts(WERD_RES *word, for (char_it.mark_cycle_pt(), index = 0; !char_it.cycled_list(); ++index, char_it.forward()) { UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index); - if (word_ch_id >= PreTrainedTemplates->NumClasses) - return; // This must be a cube word. choice_it.set_to_list(char_it.data()); if (tessedit_debug_fonts) { - tprintf("Examining fonts in %s\n", word->best_choice->debug_string( - getDict().getUnicharset()).string()); + tprintf("Examining fonts in %s\n", + word->best_choice->debug_string().string()); } for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { @@ -1242,7 +1541,7 @@ void Tesseract::set_word_fonts(WERD_RES *word, if (blob_ch_id == word_ch_id) { if (tessedit_debug_fonts) { tprintf("%s font %s (%d) font2 %s (%d)\n", - getDict().getUnicharset().id_to_unichar(blob_ch_id), + word->uch_set->id_to_unichar(blob_ch_id), choice_it.data()->fontinfo_id() < 0 ? "unknown" : fontinfo_table_.get(choice_it.data()->fontinfo_id()).name, choice_it.data()->fontinfo_id(), @@ -1261,19 +1560,22 @@ void Tesseract::set_word_fonts(WERD_RES *word, } } } - find_modal_font(&fonts, &word->fontinfo_id, &word->fontinfo_id_count); - find_modal_font(&fonts, &word->fontinfo_id2, &word->fontinfo_id2_count); + inT16 font_id1, font_id2; + find_modal_font(&fonts, &font_id1, &word->fontinfo_id_count); + find_modal_font(&fonts, &font_id2, &word->fontinfo_id2_count); + word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL; + word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL; // All the blobs get the word's best choice font. 
for (int i = 0; i < word->best_choice->length(); ++i) { - word->best_choice_fontinfo_ids.push_back(word->fontinfo_id); + word->best_choice_fontinfo_ids.push_back(font_id1); } if (word->fontinfo_id_count > 0) { - FontInfo fi = fontinfo_table_.get(word->fontinfo_id); + FontInfo fi = fontinfo_table_.get(font_id1); if (tessedit_debug_fonts) { if (word->fontinfo_id2_count > 0) { tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", fi.name, word->fontinfo_id_count, - fontinfo_table_.get(word->fontinfo_id2).name, + fontinfo_table_.get(font_id2).name, word->fontinfo_id2_count); } else { tprintf("Word modal font=%s, score=%d. No 2nd choice\n", @@ -1293,47 +1595,58 @@ void Tesseract::set_word_fonts(WERD_RES *word, * Smooth the fonts for the document. */ -void Tesseract::font_recognition_pass( //good chars in word - PAGE_RES_IT &page_res_it) { - inT32 length; //of word - inT32 count; //of a feature - inT16 doc_font; //modal font - inT8 doc_font_count; //modal font - WERD_RES *word; //current word - STATS doc_fonts (0, get_fontinfo_table().size() ? - get_fontinfo_table().size() : 32); // font counters +void Tesseract::font_recognition_pass(PAGE_RES* page_res) { + PAGE_RES_IT page_res_it(page_res); + WERD_RES *word; // current word + STATS doc_fonts(0, font_table_size_); // font counters - page_res_it.restart_page(); - while (page_res_it.word() != NULL) { + // Gather font id statistics. + for (page_res_it.restart_page(); page_res_it.word() != NULL; + page_res_it.forward()) { word = page_res_it.word(); - set_word_fonts(word, word->best_choice->blob_choices()); - if (!save_best_choices) { // set_blob_choices() does a deep clear - word->best_choice->set_blob_choices(NULL); + if (word->fontinfo != NULL) { + doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count); + } + if (word->fontinfo2 != NULL) { + doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count); } - doc_fonts.add(word->fontinfo_id, word->fontinfo_id_count); - doc_fonts.add(word->fontinfo_id2, word->fontinfo_id2_count); - page_res_it.forward(); } + inT16 doc_font; // modal font + inT8 doc_font_count; // modal font find_modal_font(&doc_fonts, &doc_font, &doc_font_count); if (doc_font_count == 0) return; - FontInfo fi = fontinfo_table_.get(doc_font); + // Get the modal font pointer. + const FontInfo* modal_font = NULL; + for (page_res_it.restart_page(); page_res_it.word() != NULL; + page_res_it.forward()) { + word = page_res_it.word(); + if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) { + modal_font = word->fontinfo; + break; + } + if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) { + modal_font = word->fontinfo2; + break; + } + } + ASSERT_HOST(modal_font != NULL); - page_res_it.restart_page (); - while (page_res_it.word () != NULL) { - word = page_res_it.word (); - length = word->best_choice->length(); + // Assign modal font to weak words. + for (page_res_it.restart_page(); page_res_it.word() != NULL; + page_res_it.forward()) { + word = page_res_it.word(); + int length = word->best_choice->length(); // 1st choices got 2 pts, so we need to halve the score for the mode. - count = (word->fontinfo_id_count + 1) / 2; + int count = (word->fontinfo_id_count + 1) / 2; if (!(count == length || (length > 3 && count >= length * 3 / 4))) { - word->fontinfo_id = doc_font; + word->fontinfo = modal_font; // Counts only get 1 as it came from the doc. word->fontinfo_id_count = 1; - word->italic = fi.is_italic() ? 1 : -1; - word->bold = fi.is_bold() ? 
1 : -1; + word->italic = modal_font->is_italic() ? 1 : -1; + word->bold = modal_font->is_bold() ? 1 : -1; } - page_res_it.forward(); } } diff --git a/ccmain/cube_control.cpp b/ccmain/cube_control.cpp index 305581a4c8..cd5cc74191 100644 --- a/ccmain/cube_control.cpp +++ b/ccmain/cube_control.cpp @@ -157,13 +157,16 @@ static WERD_CHOICE *create_werd_choice( CharSet* cube_char_set ) { // Insert unichar ids into WERD_CHOICE - WERD_CHOICE *werd_choice = new WERD_CHOICE(num_chars); + WERD_CHOICE *werd_choice = new WERD_CHOICE(&unicharset, num_chars); + // within a word, cube recognizes the word in reading order. + werd_choice->set_unichars_in_script_order(true); ASSERT_HOST(werd_choice != NULL); UNICHAR_ID uch_id; for (int i = 0; i < num_chars; ++i) { uch_id = cube_char_set->UnicharID(char_samples[i]->StrLabel()); if (uch_id != INVALID_UNICHAR_ID) - werd_choice->append_unichar_id_space_allocated(uch_id, 1, 0.0, certainty); + werd_choice->append_unichar_id_space_allocated( + uch_id, 1, 0.0, certainty); } BLOB_CHOICE *blob_choice; @@ -179,12 +182,12 @@ static WERD_CHOICE *create_werd_choice( choices_list_it.set_to_list(choices_list); // Add a single BLOB_CHOICE to the list blob_choice = new BLOB_CHOICE(werd_choice->unichar_id(i), - 0.0, certainty, -1, -1, 0); + 0.0, certainty, -1, -1, 0, 0, 0, false); choices_list_it.add_after_then_move(blob_choice); // Add list to the clist blob_choices_it.add_to_end(choices_list); } - werd_choice->populate_unichars(unicharset); + werd_choice->populate_unichars(); werd_choice->set_certainty(certainty); werd_choice->set_blob_choices(blob_choices); return werd_choice; @@ -231,115 +234,153 @@ bool Tesseract::init_cube_objects(bool load_combiner, } /********************************************************************** - * run_cube + * run_cube_combiner * - * Iterate through tesseract's results and call cube on each word. - * If the combiner is present, optionally run the tesseract-cube - * combiner on each word. + * Iterates through tesseract's results and calls cube on each word, + * combining the results with the existing tesseract result. **********************************************************************/ -void Tesseract::run_cube( - PAGE_RES *page_res // page structure - ) { - ASSERT_HOST(cube_cntxt_ != NULL); - if (!pix_binary_) { - if (cube_debug_level > 0) - tprintf("Tesseract::run_cube(): NULL binary image.\n"); - return; - } - if (!page_res) +void Tesseract::run_cube_combiner(PAGE_RES *page_res) { + if (page_res == NULL || tess_cube_combiner_ == NULL) return; PAGE_RES_IT page_res_it(page_res); - page_res_it.restart_page(); - // Iterate through the word results and call cube on each word. - CubeObject *cube_obj; for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) { WERD_RES* word = page_res_it.word(); - TBOX word_box = word->word->bounding_box(); - // TODO(rays): Instead of page_res_it.block()->block maybe use - // word->denorm.block() once TODO in - // Tesseract::recog_all_words() is addressed. - const BLOCK* block = page_res_it.block()->block; - if (block != NULL && (block->re_rotation().x() != 1.0f || - block->re_rotation().y() != 0.0f)) { - // TODO(rays) We have to rotate the bounding box to get the true coords. - // This will be achieved in the future via DENORM. - // In the mean time, cube can't process this word. - if (cube_debug_level > 0) { - tprintf("Cube can't process rotated word at:"); - word_box.print(); - } - if (word->best_choice == NULL) - page_res_it.DeleteCurrentWord(); // Nobody has an answer. 
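The rewritten loop body that follows first gates cube on tesseract's own confidence: cube and the combiner only run when the existing result is weak. A minimal sketch of that gate, assuming a simple affine probability-to-certainty mapping (the exact form of convert_prob_to_tess_certainty is defined elsewhere in cube_control.cpp and may differ):

    // Illustrative only: the affine form and the constant 20.0 are
    // assumptions for this sketch, not values quoted from the patch.
    static float ProbToTessCertaintySketch(double prob) {
      return static_cast<float>((prob - 1.0) * 20.0);  // prob 1.0 maps to 0.0.
    }
    // Under that mapping, a CombinerRunThresh() of 0.8 becomes a gate of
    // -4.0: any word whose best_choice->certainty() is already >= -4.0
    // skips cube and the combiner entirely.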
+    // Skip cube entirely if tesseract's certainty is greater than threshold.
+    int combiner_run_thresh = convert_prob_to_tess_certainty(
+        cube_cntxt_->Params()->CombinerRunThresh());
+    if (word->best_choice->certainty() >= combiner_run_thresh) {
       continue;
     }
-    cube_obj = new tesseract::CubeObject(cube_cntxt_, pix_binary_,
-                                         word_box.left(),
-                                         pix_binary_->h - word_box.top(),
-                                         word_box.width(), word_box.height());
-    cube_recognize(cube_obj, &page_res_it);
+    // Use the same language as Tesseract used for the word.
+    Tesseract* lang_tess = word->tesseract;
+
+    // Setup a trial WERD_RES in which to classify with cube.
+    WERD_RES cube_word;
+    cube_word.InitForRetryRecognition(*word);
+    CubeObject *cube_obj = lang_tess->cube_recognize_word(
+        page_res_it.block()->block, &cube_word);
+    if (cube_obj != NULL)
+      lang_tess->cube_combine_word(cube_obj, &cube_word, word);
     delete cube_obj;
   }
 }

 /**********************************************************************
- * cube_recognize
+ * cube_word_pass1
  *
- * Call cube on the current word, optionally run the tess-cube combiner, and
- * modify the tesseract result if cube wins. If cube fails to run, or
- * if tesseract wins, leave the tesseract result unchanged. If the
- * combiner is not instantiated, always use cube's result.
+ * Recognizes a single word using (only) cube. Compatible with
+ * Tesseract's classify_word_pass1/classify_word_pass2.
+ **********************************************************************/
+void Tesseract::cube_word_pass1(BLOCK* block, ROW *row, WERD_RES *word) {
+  CubeObject *cube_obj = cube_recognize_word(block, word);
+  delete cube_obj;
+}
+
+/**********************************************************************
+ * cube_recognize_word
  *
+ * Recognizes a single word with cube, as classify_word_pass1 does, but
+ * also returns the CubeObject in case the combiner is needed.
 **********************************************************************/
-void Tesseract::cube_recognize(
-    CubeObject *cube_obj,
-    PAGE_RES_IT *page_res_it
-    ) {
-  // Retrieve tesseract's data structure for the current word.
-  WERD_RES *tess_werd_res = page_res_it->word();
-  if (!tess_werd_res->best_choice && tess_cube_combiner_ != NULL) {
-    if (cube_debug_level > 0)
-      tprintf("Cube WARNING (Tesseract::cube_recognize): Cannot run combiner "
-              "without a tess result.\n");
-    return;
+CubeObject* Tesseract::cube_recognize_word(BLOCK* block, WERD_RES* word) {
+  if (!cube_binary_ || !cube_cntxt_) {
+    if (cube_debug_level > 0 && !cube_binary_)
+      tprintf("Tesseract::cube_recognize_word(): NULL binary image.\n");
+    word->SetupFake(unicharset);
+    return NULL;
   }
+  TBOX word_box = word->word->bounding_box();
+  if (block != NULL && (block->re_rotation().x() != 1.0f ||
+                        block->re_rotation().y() != 0.0f)) {
+    // TODO(rays) We have to rotate the bounding box to get the true coords.
+    // This will be achieved in the future via DENORM.
+    // In the meantime, cube can't process this word.
+    if (cube_debug_level > 0) {
+      tprintf("Cube can't process rotated word at:");
+      word_box.print();
+    }
+    word->SetupFake(unicharset);
+    return NULL;
+  }
+  CubeObject* cube_obj = new tesseract::CubeObject(
+      cube_cntxt_, cube_binary_, word_box.left(),
+      pixGetHeight(cube_binary_) - word_box.top(),
+      word_box.width(), word_box.height());
+  if (!cube_recognize(cube_obj, block, word)) {
+    delete cube_obj;
+    return NULL;
+  }
+  return cube_obj;
+}

-  // Skip cube entirely if combiner is present but tesseract's
-  // certainty is greater than threshold.
- int combiner_run_thresh = convert_prob_to_tess_certainty( - cube_cntxt_->Params()->CombinerRunThresh()); - if (tess_cube_combiner_ != NULL && - (tess_werd_res->best_choice->certainty() >= combiner_run_thresh)) { +/********************************************************************** + * cube_combine_word + * + * Combines the cube and tesseract results for a single word, leaving the + * result in tess_word. + **********************************************************************/ +void Tesseract::cube_combine_word(CubeObject* cube_obj, WERD_RES* cube_word, + WERD_RES* tess_word) { + float combiner_prob = tess_cube_combiner_->CombineResults(tess_word, + cube_obj); + // If combiner probability is greater than tess/cube combiner + // classifier threshold, i.e. tesseract wins, then just return the + // tesseract result unchanged, as the combiner knows nothing about how + // correct the answer is. If cube and tesseract agree, then improve the + // scores before returning. + WERD_CHOICE* tess_best = tess_word->best_choice; + WERD_CHOICE* cube_best = cube_word->best_choice; + if (cube_debug_level || classify_debug_level) { + tprintf("Combiner prob = %g vs threshold %g\n", + combiner_prob, cube_cntxt_->Params()->CombinerClassifierThresh()); + } + if (combiner_prob >= + cube_cntxt_->Params()->CombinerClassifierThresh()) { + if (tess_best->unichar_string() == cube_best->unichar_string()) { + // Cube and tess agree, so improve the scores. + tess_best->set_rating(tess_best->rating() / 2); + tess_best->set_certainty(tess_best->certainty() / 2); + } return; } + // Cube wins. + // It is better for the language combiner to have all tesseract scores, + // so put them in the cube result. + cube_best->set_rating(tess_best->rating()); + cube_best->set_certainty(tess_best->certainty()); + if (cube_debug_level || classify_debug_level) { + tprintf("Cube INFO: tesseract result replaced by cube: %s -> %s\n", + tess_best->unichar_string().string(), + cube_best->unichar_string().string()); + } + tess_word->ConsumeWordResults(cube_word); +} + +/********************************************************************** + * cube_recognize + * + * Call cube on the current word, and write the result to word. + * Sets up a fake result and returns false if something goes wrong. + **********************************************************************/ +bool Tesseract::cube_recognize(CubeObject *cube_obj, BLOCK* block, + WERD_RES *word) { + if (!word->SetupForCubeRecognition(unicharset, this, block)) { + return false; // Graphics block. + } // Run cube WordAltList *cube_alt_list = cube_obj->RecognizeWord(); if (!cube_alt_list || cube_alt_list->AltCount() <= 0) { if (cube_debug_level > 0) { tprintf("Cube returned nothing for word at:"); - tess_werd_res->word->bounding_box().print(); - } - if (tess_werd_res->best_choice == NULL) { - // Nobody has recognized it, so pretend it doesn't exist. - if (cube_debug_level > 0) { - tprintf("Deleted word not recognized by cube and/or tesseract at:"); - tess_werd_res->word->bounding_box().print(); - } - page_res_it->DeleteCurrentWord(); + word->word->bounding_box().print(); } - return; + word->SetupFake(unicharset); + return false; } - // At this point we *could* run the combiner and bail out if - // Tesseract wins, but that would require instantiating a new - // CubeObject to avoid losing the original recognition results - // (e.g., beam search lattice) stored with the CubeObject. 
Instead, - // we first extract the state we need from the current recognition - // and then reuse the CubeObject so that the combiner does not need - // to recompute the image's connected components, segmentation, etc. - // Get cube's best result and its probability, mapped to tesseract's // certainty range char_32 *cube_best_32 = cube_alt_list->Alt(0); @@ -357,14 +398,15 @@ void Tesseract::cube_recognize( && cube_debug_level > 0) { tprintf("Cube WARNING (Tesseract::cube_recognize): Cannot extract " "cube state.\n"); - return; + word->SetupFake(unicharset); + return false; } // Convert cube's character bounding boxes to a BoxWord. BoxWord cube_box_word; - TBOX tess_word_box = tess_werd_res->word->bounding_box(); - if (tess_werd_res->denorm.block() != NULL) - tess_word_box.rotate(tess_werd_res->denorm.block()->re_rotation()); + TBOX tess_word_box = word->word->bounding_box(); + if (word->denorm.block() != NULL) + tess_word_box.rotate(word->denorm.block()->re_rotation()); bool box_word_success = create_cube_box_word(char_boxes, num_chars, tess_word_box, &cube_box_word); @@ -374,7 +416,8 @@ void Tesseract::cube_recognize( tprintf("Cube WARNING (Tesseract::cube_recognize): Could not " "create cube BoxWord\n"); } - return; + word->SetupFake(unicharset); + return false; } // Create cube's best choice. @@ -388,36 +431,19 @@ void Tesseract::cube_recognize( tprintf("Cube WARNING (Tesseract::cube_recognize): Could not " "create cube WERD_CHOICE\n"); } - return; + word->SetupFake(unicharset); + return false; } - - // Run combiner if present, now that we're free to reuse the CubeObject. - if (tess_cube_combiner_ != NULL) { - float combiner_prob = tess_cube_combiner_->CombineResults(tess_werd_res, - cube_obj); - // If combiner probability is greater than tess/cube combiner - // classifier threshold, i.e. tesseract wins, then reset the WERD_RES - // certainty to the combiner certainty and return. Note that when - // tesseract and cube agree, the combiner probability is 1.0, so - // the final WERD_RES certainty will be maximized to 0.0. 
- if (combiner_prob >= - cube_cntxt_->Params()->CombinerClassifierThresh()) { - float combiner_certainty = convert_prob_to_tess_certainty(combiner_prob); - tess_werd_res->best_choice->set_certainty(combiner_certainty); - delete cube_werd_choice; - return; - } - if (cube_debug_level > 5) { - tprintf("Cube INFO: tesseract result replaced by cube: " - "%s -> %s\n", - tess_werd_res->best_choice->unichar_string().string(), - cube_best_str.c_str()); - } + if (cube_debug_level || classify_debug_level) { + tprintf("Cube result: %s r=%g, c=%g\n", + cube_werd_choice->unichar_string().string(), + cube_werd_choice->rating(), + cube_werd_choice->certainty()); } // Fill tesseract result's fields with cube results - fill_werd_res(cube_box_word, cube_werd_choice, cube_best_str.c_str(), - page_res_it); + fill_werd_res(cube_box_word, cube_werd_choice, cube_best_str.c_str(), word); + return true; } /********************************************************************** @@ -429,16 +455,14 @@ void Tesseract::cube_recognize( void Tesseract::fill_werd_res(const BoxWord& cube_box_word, WERD_CHOICE* cube_werd_choice, const char* cube_best_str, - PAGE_RES_IT *page_res_it) { - WERD_RES *tess_werd_res = page_res_it->word(); - + WERD_RES* tess_werd_res) { // Replace tesseract results's best choice with cube's - delete tess_werd_res->best_choice; tess_werd_res->best_choice = cube_werd_choice; + tess_werd_res->raw_choice = new WERD_CHOICE(*cube_werd_choice); delete tess_werd_res->box_word; tess_werd_res->box_word = new BoxWord(cube_box_word); - tess_werd_res->box_word->ClipToOriginalWord(page_res_it->block()->block, + tess_werd_res->box_word->ClipToOriginalWord(tess_werd_res->denorm.block(), tess_werd_res->word); // Fill text and remaining fields tess_werd_res->word->set_text(cube_best_str); diff --git a/ccmain/cube_reco_context.cpp b/ccmain/cube_reco_context.cpp index 0f2ff63df4..f6d960bb85 100644 --- a/ccmain/cube_reco_context.cpp +++ b/ccmain/cube_reco_context.cpp @@ -109,6 +109,7 @@ bool CubeRecoContext::GetDataFilePath(string *path) const { bool CubeRecoContext::Load(TessdataManager *tessdata_manager, UNICHARSET *tess_unicharset) { ASSERT_HOST(tess_obj_ != NULL); + tess_unicharset_ = tess_unicharset; string data_file_path; // Get the data file path. diff --git a/ccmain/cube_reco_context.h b/ccmain/cube_reco_context.h index 822ef62ce7..811a6308be 100644 --- a/ccmain/cube_reco_context.h +++ b/ccmain/cube_reco_context.h @@ -56,6 +56,7 @@ class CubeRecoContext { // accessor functions inline const string & Lang() const { return lang_; } inline CharSet *CharacterSet() const { return char_set_; } + const UNICHARSET *TessUnicharset() const { return tess_unicharset_; } inline CharClassifier *Classifier() const { return char_classifier_; } inline WordSizeModel *SizeModel() const { return word_size_model_; } inline CharBigrams *Bigrams() const { return char_bigrams_; } @@ -135,6 +136,7 @@ class CubeRecoContext { bool loaded_; string lang_; CharSet *char_set_; + UNICHARSET *tess_unicharset_; WordSizeModel *word_size_model_; CharClassifier *char_classifier_; CharBigrams *char_bigrams_; diff --git a/ccmain/cubeclassifier.cpp b/ccmain/cubeclassifier.cpp new file mode 100644 index 0000000000..12d57f17d0 --- /dev/null +++ b/ccmain/cubeclassifier.cpp @@ -0,0 +1,136 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// Author: rays@google.com (Ray Smith) +/////////////////////////////////////////////////////////////////////// +// File: cubeclassifier.cpp +// Description: Cube implementation of a ShapeClassifier. 
+// Author: Ray Smith
+// Created: Wed Nov 23 10:39:45 PST 2011
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "cubeclassifier.h"
+
+#include "char_altlist.h"
+#include "char_set.h"
+#include "cube_object.h"
+#include "cube_reco_context.h"
+#include "tessclassifier.h"
+#include "tesseractclass.h"
+#include "trainingsample.h"
+#include "unicharset.h"
+
+namespace tesseract {
+
+CubeClassifier::CubeClassifier(tesseract::Tesseract* tesseract)
+    : cube_cntxt_(tesseract->GetCubeRecoContext()),
+      shape_table_(*tesseract->shape_table()) {
+}
+CubeClassifier::~CubeClassifier() {
+}
+
+// Classifies the given [training] sample, writing to results.
+// See ShapeClassifier for a full description.
+int CubeClassifier::ClassifySample(const TrainingSample& sample,
+                                   Pix* page_pix, int debug, int keep_this,
+                                   GenericVector<ShapeRating>* results) {
+  results->clear();
+  if (page_pix == NULL) return 0;
+
+  ASSERT_HOST(cube_cntxt_ != NULL);
+  const TBOX& char_box = sample.bounding_box();
+  CubeObject* cube_obj = new tesseract::CubeObject(
+      cube_cntxt_, page_pix, char_box.left(),
+      pixGetHeight(page_pix) - char_box.top(),
+      char_box.width(), char_box.height());
+  CharAltList* alt_list = cube_obj->RecognizeChar();
+  CharSet* char_set = cube_cntxt_->CharacterSet();
+  if (alt_list != NULL) {
+    // RecognizeChar can return NULL, so only sort a list we actually have.
+    alt_list->Sort();
+    for (int i = 0; i < alt_list->AltCount(); ++i) {
+      // Convert cube representation to a shape_id.
+      int alt_id = alt_list->Alt(i);
+      int unichar_id = char_set->UnicharID(char_set->ClassString(alt_id));
+      int shape_id = shape_table_.FindShape(unichar_id, -1);
+      if (shape_id >= 0)
+        results->push_back(ShapeRating(shape_id, alt_list->AltProb(i)));
+    }
+    delete alt_list;
+  }
+  delete cube_obj;
+  return results->size();
+}
+
+// Provides access to the ShapeTable that this classifier works with.
+const ShapeTable* CubeClassifier::GetShapeTable() const {
+  return &shape_table_;
+}
+
+CubeTessClassifier::CubeTessClassifier(tesseract::Tesseract* tesseract)
+    : cube_cntxt_(tesseract->GetCubeRecoContext()),
+      shape_table_(*tesseract->shape_table()),
+      pruner_(new TessClassifier(true, tesseract)) {
+}
+CubeTessClassifier::~CubeTessClassifier() {
+  delete pruner_;
+}
+
+// Classifies the given [training] sample, writing to results.
+// See ShapeClassifier for a full description.
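+// In outline: the TessClassifier pruner proposes shape candidates first;
+// each candidate's rating is then replaced by the best cube probability
+// over the unichars its shape contains, and the list is re-sorted.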
+int CubeTessClassifier::ClassifySample(const TrainingSample& sample,
+                                       Pix* page_pix, int debug, int keep_this,
+                                       GenericVector<ShapeRating>* results) {
+  int num_results = pruner_->ClassifySample(sample, page_pix, debug, keep_this,
+                                            results);
+  if (page_pix == NULL) return num_results;
+
+  ASSERT_HOST(cube_cntxt_ != NULL);
+  const TBOX& char_box = sample.bounding_box();
+  CubeObject* cube_obj = new tesseract::CubeObject(
+      cube_cntxt_, page_pix, char_box.left(),
+      pixGetHeight(page_pix) - char_box.top(),
+      char_box.width(), char_box.height());
+  CharAltList* alt_list = cube_obj->RecognizeChar();
+  CharSet* char_set = cube_cntxt_->CharacterSet();
+  if (alt_list != NULL) {
+    for (int r = 0; r < num_results; ++r) {
+      const Shape& shape = shape_table_.GetShape((*results)[r].shape_id);
+      // Get the best cube probability of all unichars in the shape.
+      double best_prob = 0.0;
+      for (int i = 0; i < alt_list->AltCount(); ++i) {
+        int alt_id = alt_list->Alt(i);
+        int unichar_id = char_set->UnicharID(char_set->ClassString(alt_id));
+        if (shape.ContainsUnichar(unichar_id) &&
+            alt_list->AltProb(i) > best_prob) {
+          best_prob = alt_list->AltProb(i);
+        }
+      }
+      (*results)[r].rating = best_prob;
+    }
+    delete alt_list;
+    // Re-sort by rating.
+    results->sort(&ShapeRating::SortDescendingRating);
+  }
+  delete cube_obj;
+  return results->size();
+}
+
+// Provides access to the ShapeTable that this classifier works with.
+const ShapeTable* CubeTessClassifier::GetShapeTable() const {
+  return &shape_table_;
+}
+
+}  // namespace tesseract
+
+
+
diff --git a/ccmain/cubeclassifier.h b/ccmain/cubeclassifier.h
new file mode 100644
index 0000000000..6359cc51f7
--- /dev/null
+++ b/ccmain/cubeclassifier.h
@@ -0,0 +1,79 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+///////////////////////////////////////////////////////////////////////
+// File: cubeclassifier.h
+// Description: Cube implementation of a ShapeClassifier.
+// Author: Ray Smith
+// Created: Wed Nov 23 10:36:32 PST 2011
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef THIRD_PARTY_TESSERACT_CCMAIN_CUBECLASSIFIER_H_
+#define THIRD_PARTY_TESSERACT_CCMAIN_CUBECLASSIFIER_H_
+
+#include "shapeclassifier.h"
+
+namespace tesseract {
+
+class Classify;
+class CubeRecoContext;
+class ShapeTable;
+class TessClassifier;
+class Tesseract;
+class TrainingSample;
+
+// Cube implementation of a ShapeClassifier.
+class CubeClassifier : public ShapeClassifier {
+ public:
+  explicit CubeClassifier(Tesseract* tesseract);
+  virtual ~CubeClassifier();
+
+  // Classifies the given [training] sample, writing to results.
+  // See ShapeClassifier for a full description.
+  virtual int ClassifySample(const TrainingSample& sample, Pix* page_pix,
+                             int debug, int keep_this,
+                             GenericVector<ShapeRating>* results);
+  // Provides access to the ShapeTable that this classifier works with.
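+  // The table is owned by the Tesseract instance passed to the
+  // constructor, so it must outlive this classifier.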
+  virtual const ShapeTable* GetShapeTable() const;
+
+ private:
+  // Cube objects.
+  CubeRecoContext* cube_cntxt_;
+  const ShapeTable& shape_table_;
+};
+
+// Combination of Tesseract class pruner with scoring by cube.
+class CubeTessClassifier : public ShapeClassifier {
+ public:
+  explicit CubeTessClassifier(Tesseract* tesseract);
+  virtual ~CubeTessClassifier();
+
+  // Classifies the given [training] sample, writing to results.
+  // See ShapeClassifier for a full description.
+  virtual int ClassifySample(const TrainingSample& sample, Pix* page_pix,
+                             int debug, int keep_this,
+                             GenericVector<ShapeRating>* results);
+  // Provides access to the ShapeTable that this classifier works with.
+  virtual const ShapeTable* GetShapeTable() const;
+
+ private:
+  // Cube objects.
+  CubeRecoContext* cube_cntxt_;
+  const ShapeTable& shape_table_;
+  TessClassifier* pruner_;
+};
+
+}  // namespace tesseract
+
+#endif  /* THIRD_PARTY_TESSERACT_CCMAIN_CUBECLASSIFIER_H_ */
diff --git a/ccmain/docqual.cpp b/ccmain/docqual.cpp
index adcb697768..c8fed20fc9 100644
--- a/ccmain/docqual.cpp
+++ b/ccmain/docqual.cpp
@@ -81,12 +81,13 @@ inT16 Tesseract::word_outline_errs(WERD_RES *word) {
   inT16 i = 0;
   inT16 err_count = 0;

-  TBLOB* blob = word->rebuild_word->blobs;
-
-  for (; blob != NULL; blob = blob->next) {
-    err_count += count_outline_errs(word->best_choice->unichar_string()[i],
-                                    blob->NumOutlines());
-    i++;
+  if (word->rebuild_word != NULL) {
+    TBLOB* blob = word->rebuild_word->blobs;
+    for (; blob != NULL; blob = blob->next) {
+      err_count += count_outline_errs(word->best_choice->unichar_string()[i],
+                                      blob->NumOutlines());
+      i++;
+    }
   }
   return err_count;
 }
@@ -185,12 +186,13 @@ void Tesseract::unrej_good_quality_words(  //unreject potential
           (float) page_res_it.row ()->char_count) <= quality_rowrej_pc)) {
       word = page_res_it.word ();
-      if (word->reject_map.quality_recoverable_rejects () &&
-        (tessedit_unrej_any_wd ||
-        acceptable_word_string (word->best_choice->unichar_string().string(),
-                                word->best_choice->unichar_lengths().string())
-         != AC_UNACCEPTABLE)) {
-        unrej_good_chs (word, page_res_it.row ()->row);
+      if (word->reject_map.quality_recoverable_rejects() &&
+          (tessedit_unrej_any_wd ||
+           acceptable_word_string(*word->uch_set,
+                                  word->best_choice->unichar_string().string(),
+                                  word->best_choice->unichar_lengths().string())
+           != AC_UNACCEPTABLE)) {
+        unrej_good_chs(word, page_res_it.row ()->row);
       }
       page_res_it.forward ();
     }
@@ -246,68 +248,57 @@ void Tesseract::doc_and_block_rejection(  //reject big chunks
   inT16 char_quality = 0;
   inT16 accepted_char_quality;

-  if ((page_res_it.page_res->rej_count * 100.0 /
-       page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
+  if (page_res_it.page_res->rej_count * 100.0 /
+      page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
     reject_whole_page(page_res_it);
-    #ifndef SECURE_NAMES
     if (tessedit_debug_doc_rejection) {
-      tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
-               page_res_it.page_res->char_count,
-               page_res_it.page_res->rej_count);
+      tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
+              page_res_it.page_res->char_count,
+              page_res_it.page_res->rej_count);
+    }
+  } else {
+    if (tessedit_debug_doc_rejection) {
+      tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
+              page_res_it.page_res->char_count,
+              page_res_it.page_res->rej_count);
     }
-    #endif
-  }
-  else {
-    #ifndef SECURE_NAMES
-    if (tessedit_debug_doc_rejection)
-      tprintf ("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
-               page_res_it.page_res->char_count,
-               page_res_it.page_res->rej_count);
-    #endif
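Stripped of the SECURE_NAMES and debug plumbing deleted here, doc_and_block_rejection applies the same ratio test at three scopes: the whole page first, then each block, then each row (a row is additionally required to have whole-word rejects below tessedit_whole_wd_rej_row_percent of its rejects). A standalone sketch of the shared test, with hypothetical counts and threshold rather than values taken from this patch:

    // One ratio test, reused at page, block and row scope with a different
    // threshold parameter each time.
    static bool ExceedsRejectPercent(int rej_count, int char_count,
                                     double threshold_percent) {
      return char_count > 0 &&
             rej_count * 100.0 / char_count > threshold_percent;
    }
    // e.g. ExceedsRejectPercent(700, 1000, 65.0) is true: 70% rejects would
    // reject the whole page before any block or row is even examined.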
/* Walk blocks testing for block rejection */ - page_res_it.restart_page (); - while (page_res_it.word () != NULL) { + page_res_it.restart_page(); + WERD_RES* word; + while ((word = page_res_it.word()) != NULL) { current_block = page_res_it.block(); block_no = current_block->block->index(); - if ((page_res_it.block ()->char_count > 0) && - ((page_res_it.block ()->rej_count * 100.0 / - page_res_it.block ()->char_count) > - tessedit_reject_block_percent)) { - #ifndef SECURE_NAMES - if (tessedit_debug_block_rejection) - tprintf ("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", - block_no, - page_res_it.block ()->char_count, - page_res_it.block ()->rej_count); - #endif + if (current_block->char_count > 0 && + (current_block->rej_count * 100.0 / current_block->char_count) > + tessedit_reject_block_percent) { + if (tessedit_debug_block_rejection) { + tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", + block_no, current_block->char_count, + current_block->rej_count); + } prev_word_rejected = FALSE; - while ((page_res_it.word () != NULL) && - (page_res_it.block () == current_block)) { + while ((word = page_res_it.word()) != NULL && + (page_res_it.block() == current_block)) { if (tessedit_preserve_blk_rej_perfect_wds) { - rej_word = - (page_res_it.word ()->reject_map.reject_count () > 0) - || (page_res_it.word ()->reject_map.length () < - tessedit_preserve_min_wd_len); - if (rej_word && tessedit_dont_blkrej_good_wds - && !(page_res_it.word ()->reject_map.length () < - tessedit_preserve_min_wd_len) - && - (acceptable_word_string - (page_res_it.word()->best_choice->unichar_string().string(), - page_res_it.word ()->best_choice->unichar_lengths().string()) != - AC_UNACCEPTABLE)) { - word_char_quality (page_res_it.word (), - page_res_it.row ()->row, - &char_quality, - &accepted_char_quality); - rej_word = char_quality != - page_res_it.word ()->reject_map.length (); + rej_word = word->reject_map.reject_count() > 0 || + word->reject_map.length () < tessedit_preserve_min_wd_len; + if (rej_word && tessedit_dont_blkrej_good_wds && + word->reject_map.length() >= tessedit_preserve_min_wd_len && + acceptable_word_string( + *word->uch_set, + word->best_choice->unichar_string().string(), + word->best_choice->unichar_lengths().string()) != + AC_UNACCEPTABLE) { + word_char_quality(word, page_res_it.row()->row, + &char_quality, + &accepted_char_quality); + rej_word = char_quality != word->reject_map.length(); } - } - else + } else { rej_word = TRUE; + } if (rej_word) { /* Reject spacing if both current and prev words are rejected. @@ -315,89 +306,70 @@ void Tesseract::doc_and_block_rejection( //reject big chunks generated more space errors. 
*/ if (tessedit_use_reject_spaces && - prev_word_rejected && - (page_res_it.prev_row () == page_res_it.row ()) && - (page_res_it.word ()->word->space () == 1)) - page_res_it.word ()->reject_spaces = TRUE; - page_res_it.word ()->reject_map.rej_word_block_rej (); + prev_word_rejected && + page_res_it.prev_row() == page_res_it.row() && + word->word->space() == 1) + word->reject_spaces = TRUE; + word->reject_map.rej_word_block_rej(); } prev_word_rejected = rej_word; - page_res_it.forward (); + page_res_it.forward(); + } + } else { + if (tessedit_debug_block_rejection) { + tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", + block_no, page_res_it.block()->char_count, + page_res_it.block()->rej_count); } - } - else { - #ifndef SECURE_NAMES - if (tessedit_debug_block_rejection) - tprintf - ("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", - block_no, page_res_it.block ()->char_count, - page_res_it.block ()->rej_count); - #endif /* Walk rows in block testing for row rejection */ row_no = 0; - while ((page_res_it.word () != NULL) && - (page_res_it.block () == current_block)) { - current_row = page_res_it.row (); + while ((word = page_res_it.word()) != NULL && + page_res_it.block() == current_block) { + current_row = page_res_it.row(); row_no++; /* Reject whole row if: fraction of chars on row which are rejected exceed a limit AND fraction rejects which occur in WHOLE WERD rejects is LESS THAN a limit */ - if ((page_res_it.row ()->char_count > 0) && - ((page_res_it.row ()->rej_count * 100.0 / - page_res_it.row ()->char_count) > - tessedit_reject_row_percent) && - ((page_res_it.row ()->whole_word_rej_count * 100.0 / - page_res_it.row ()->rej_count) < - tessedit_whole_wd_rej_row_percent)) { - #ifndef SECURE_NAMES - if (tessedit_debug_block_rejection) - tprintf - ("REJECTING ROW %d #chars: %d; #Rejects: %d\n", - row_no, page_res_it.row ()->char_count, - page_res_it.row ()->rej_count); - #endif + if (current_row->char_count > 0 && + (current_row->rej_count * 100.0 / current_row->char_count) > + tessedit_reject_row_percent && + (current_row->whole_word_rej_count * 100.0 / + current_row->rej_count) < + tessedit_whole_wd_rej_row_percent) { + if (tessedit_debug_block_rejection) { + tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", + row_no, current_row->char_count, + current_row->rej_count); + } prev_word_rejected = FALSE; - while ((page_res_it.word () != NULL) && - (page_res_it.row () == current_row)) { + while ((word = page_res_it.word()) != NULL && + page_res_it.row () == current_row) { /* Preserve words on good docs unless they are mostly rejected*/ if (!tessedit_row_rej_good_docs && good_quality_doc) { - rej_word = - page_res_it.word ()->reject_map. - reject_count () / - (float) page_res_it.word ()->reject_map. - length () > tessedit_good_doc_still_rowrej_wd; - } - - /* Preserve perfect words anyway */ - else if (tessedit_preserve_row_rej_perfect_wds) { - rej_word = - (page_res_it.word ()->reject_map. - reject_count () > 0) - || (page_res_it.word ()->reject_map. - length () < tessedit_preserve_min_wd_len); - if (rej_word && tessedit_dont_rowrej_good_wds - && !(page_res_it.word ()->reject_map. 
- length () <
-                    tessedit_preserve_min_wd_len)
-                  &&
-                  (acceptable_word_string
-                   (page_res_it.word ()->best_choice->
-                    unichar_string().string(),
-                    page_res_it.word ()->best_choice->
-                    unichar_lengths().string()) != AC_UNACCEPTABLE)) {
-                word_char_quality (page_res_it.word (),
-                                   page_res_it.row ()->row,
-                                   &char_quality,
-                                   &accepted_char_quality);
-                rej_word = char_quality !=
-                  page_res_it.word ()->reject_map.length ();
+                rej_word = word->reject_map.reject_count() /
+                    static_cast<float>(word->reject_map.length()) >
+                    tessedit_good_doc_still_rowrej_wd;
+              } else if (tessedit_preserve_row_rej_perfect_wds) {
+                /* Preserve perfect words anyway */
+                rej_word = word->reject_map.reject_count() > 0 ||
+                    word->reject_map.length () < tessedit_preserve_min_wd_len;
+                if (rej_word && tessedit_dont_rowrej_good_wds &&
+                    word->reject_map.length() >= tessedit_preserve_min_wd_len &&
+                    acceptable_word_string(*word->uch_set,
+                        word->best_choice->unichar_string().string(),
+                        word->best_choice->unichar_lengths().string()) !=
+                            AC_UNACCEPTABLE) {
+                  word_char_quality(word, page_res_it.row()->row,
+                                    &char_quality,
+                                    &accepted_char_quality);
+                  rej_word = char_quality != word->reject_map.length();
                 }
-              }
-              else
+              } else {
                 rej_word = TRUE;
+              }
               if (rej_word) {
                 /*
                   Reject spacing if both current and prev words are rejected.
@@ -405,36 +377,30 @@ void Tesseract::doc_and_block_rejection(  //reject big chunks
                   this generated more space errors.
                 */
                 if (tessedit_use_reject_spaces &&
-                  prev_word_rejected &&
-                  (page_res_it.prev_row () ==
-                  page_res_it.row ()) &&
-                  (page_res_it.word ()->word->space () ==
-                  1))
-                  page_res_it.word ()->reject_spaces = TRUE;
-                page_res_it.word ()->reject_map.
-                  rej_word_row_rej();
+                    prev_word_rejected &&
+                    page_res_it.prev_row() == page_res_it.row() &&
+                    word->word->space () == 1)
+                  word->reject_spaces = TRUE;
+                word->reject_map.rej_word_row_rej();
               }
               prev_word_rejected = rej_word;
-              page_res_it.forward ();
+              page_res_it.forward();
             }
-          }
-          else {
-            #ifndef SECURE_NAMES
-            if (tessedit_debug_block_rejection)
-              tprintf
-                ("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
-                 row_no, page_res_it.row ()->char_count,
-                 page_res_it.row ()->rej_count);
-            #endif
-            while ((page_res_it.word () != NULL) &&
-                   (page_res_it.row () == current_row))
-              page_res_it.forward ();
+          } else {
+            if (tessedit_debug_block_rejection) {
+              tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
+                      row_no, current_row->char_count, current_row->rej_count);
+            }
+            while (page_res_it.word() != NULL &&
+                   page_res_it.row() == current_row)
+              page_res_it.forward();
           }
         }
       }
     }
   }
 }
+
 }  // namespace tesseract

@@ -463,15 +429,20 @@ void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
   BOOL8 found_terrible_word = FALSE;
   BOOL8 ok_dict_word;

-  page_res_it.restart_page ();
-  while (page_res_it.word () != NULL) {
-    word = page_res_it.word ();
+  page_res_it.restart_page();
+  while (page_res_it.word() != NULL) {
+    POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
+    if (pb != NULL && !pb->IsText()) {
+      page_res_it.forward();
+      continue;
+    }
+    word = page_res_it.word();
     if (crunch_early_convert_bad_unlv_chs)
       convert_bad_unlv_chs(word);
     if (crunch_early_merge_tess_fails)
-      merge_tess_fails(word);
+      word->merge_tess_fails();
     if (word->reject_map.accept_count () != 0) {
       found_terrible_word = FALSE;
@@ -479,7 +450,7 @@ void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
       prev_potential_marked = FALSE;
     }
     else {
-      ok_dict_word = safe_dict_word(*(word->best_choice));
+      ok_dict_word = safe_dict_word(word);
       garbage_level = garbage_word (word,
ok_dict_word); if ((garbage_level != G_NEVER_CRUNCH) && @@ -584,47 +555,44 @@ BOOL8 Tesseract::potential_word_crunch(WERD_RES *word, BOOL8 word_crunchable; int poor_indicator_count = 0; - word_crunchable = - !crunch_leave_accept_strings || - (word->reject_map.length () < 3) || - ((acceptable_word_string (str, lengths) == AC_UNACCEPTABLE) && - !ok_dict_word); + word_crunchable = !crunch_leave_accept_strings || + word->reject_map.length() < 3 || + (acceptable_word_string(*word->uch_set, + str, lengths) == AC_UNACCEPTABLE && + !ok_dict_word); - adjusted_len = word->reject_map.length (); + adjusted_len = word->reject_map.length(); if (adjusted_len > 10) adjusted_len = 10; - rating_per_ch = word->best_choice->rating () / adjusted_len; + rating_per_ch = word->best_choice->rating() / adjusted_len; if (rating_per_ch > crunch_pot_poor_rate) { if (crunch_debug > 2) { - tprintf ("Potential poor rating on \"%s\"\n", - word->best_choice->unichar_string().string()); + tprintf("Potential poor rating on \"%s\"\n", + word->best_choice->unichar_string().string()); } poor_indicator_count++; } if (word_crunchable && - (word->best_choice->certainty () < crunch_pot_poor_cert)) { + word->best_choice->certainty() < crunch_pot_poor_cert) { if (crunch_debug > 2) { - tprintf ("Potential poor cert on \"%s\"\n", - word->best_choice->unichar_string().string()); + tprintf("Potential poor cert on \"%s\"\n", + word->best_choice->unichar_string().string()); } poor_indicator_count++; } if (garbage_level != G_OK) { if (crunch_debug > 2) { - tprintf ("Potential garbage on \"%s\"\n", - word->best_choice->unichar_string().string()); + tprintf("Potential garbage on \"%s\"\n", + word->best_choice->unichar_string().string()); } poor_indicator_count++; } - return (poor_indicator_count >= crunch_pot_indicators); + return poor_indicator_count >= crunch_pot_indicators; } -} // namespace tesseract - -namespace tesseract { void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { WERD_RES *word; PAGE_RES_IT copy_it; @@ -635,9 +603,9 @@ void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { inT16 x_debug_delete_mode; CRUNCH_MODE x_delete_mode; - page_res_it.restart_page (); - while (page_res_it.word () != NULL) { - word = page_res_it.word (); + page_res_it.restart_page(); + while (page_res_it.word() != NULL) { + word = page_res_it.word(); delete_mode = word_deletable (word, debug_delete_mode); if (delete_mode != CR_NONE) { @@ -649,10 +617,9 @@ void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { } word->unlv_crunch_mode = delete_mode; deleting_from_bol = TRUE; - } - else if (word->word->flag (W_EOL)) { + } else if (word->word->flag(W_EOL)) { if (marked_delete_point) { - while (copy_it.word () != word) { + while (copy_it.word() != word) { x_delete_mode = word_deletable (copy_it.word (), x_debug_delete_mode); if (crunch_debug > 0) { @@ -690,7 +657,7 @@ void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { determine if the word is deletable. 
*/ if (!crunch_early_merge_tess_fails) - merge_tess_fails(word); + word->merge_tess_fails(); page_res_it.forward (); } } @@ -698,10 +665,10 @@ void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) { int i; - UNICHAR_ID unichar_dash = unicharset.unichar_to_id("-"); - UNICHAR_ID unichar_space = unicharset.unichar_to_id(" "); - UNICHAR_ID unichar_tilde = unicharset.unichar_to_id("~"); - UNICHAR_ID unichar_pow = unicharset.unichar_to_id("^"); + UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-"); + UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" "); + UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~"); + UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^"); bool modified = false; for (i = 0; i < word_res->reject_map.length(); ++i) { if (word_res->best_choice->unichar_id(i) == unichar_tilde) { @@ -718,37 +685,7 @@ void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) { } } if (modified) { - word_res->best_choice->populate_unichars(unicharset); - } -} - -// Callback helper for merge_tess_fails returns a space if both -// arguments are space, otherwise INVALID_UNICHAR_ID. -UNICHAR_ID Tesseract::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) { - if (id1 == id2 && id1 == unicharset.unichar_to_id(" ")) - return id1; - else - return INVALID_UNICHAR_ID; -} - -// Change pairs of tess failures to a single one -void Tesseract::merge_tess_fails(WERD_RES *word_res) { - if (word_res->ConditionalBlobMerge( - unicharset, - NewPermanentTessCallback(this, &Tesseract::BothSpaces), NULL, - word_res->best_choice->blob_choices())) { - if (crunch_debug) { - tprintf("Post:bc len=%d, rejmap=%d, boxword=%d, chopword=%d," - " rebuild=%d\n", - word_res->best_choice->length(), - word_res->reject_map.length(), - word_res->box_word->length(), - word_res->chopped_word->NumBlobs(), - word_res->rebuild_word->NumBlobs()); - } - int len = word_res->best_choice->length(); - ASSERT_HOST(word_res->reject_map.length() == len); - ASSERT_HOST(word_res->box_word->length() == len); + word_res->best_choice->populate_unichars(); } } @@ -785,7 +722,7 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { for (; *str != '\0'; str += *(lengths++)) { len++; - if (unicharset.get_isupper (str, *lengths)) { + if (word->uch_set->get_isupper (str, *lengths)) { total_alpha_count++; switch (state) { case SUBSEQUENT_UPPER: @@ -794,14 +731,14 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { upper_string_count++; if (longest_upper_run_len < upper_string_count) longest_upper_run_len = upper_string_count; - if (last_char == unicharset.unichar_to_id(str, *lengths)) { + if (last_char == word->uch_set->unichar_to_id(str, *lengths)) { alpha_repetition_count++; if (longest_alpha_repetition_count < alpha_repetition_count) { longest_alpha_repetition_count = alpha_repetition_count; } } else { - last_char = unicharset.unichar_to_id(str, *lengths); + last_char = word->uch_set->unichar_to_id(str, *lengths); alpha_repetition_count = 1; } break; @@ -809,13 +746,13 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { isolated_digits++; default: state = FIRST_UPPER; - last_char = unicharset.unichar_to_id(str, *lengths); + last_char = word->uch_set->unichar_to_id(str, *lengths); alpha_repetition_count = 1; upper_string_count = 1; break; } } - else if (unicharset.get_islower (str, *lengths)) { + else if (word->uch_set->get_islower (str, *lengths)) { total_alpha_count++; switch (state) 
{ case SUBSEQUENT_LOWER: @@ -824,14 +761,14 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { lower_string_count++; if (longest_lower_run_len < lower_string_count) longest_lower_run_len = lower_string_count; - if (last_char == unicharset.unichar_to_id(str, *lengths)) { + if (last_char == word->uch_set->unichar_to_id(str, *lengths)) { alpha_repetition_count++; if (longest_alpha_repetition_count < alpha_repetition_count) { longest_alpha_repetition_count = alpha_repetition_count; } } else { - last_char = unicharset.unichar_to_id(str, *lengths); + last_char = word->uch_set->unichar_to_id(str, *lengths); alpha_repetition_count = 1; } break; @@ -839,13 +776,13 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { isolated_digits++; default: state = FIRST_LOWER; - last_char = unicharset.unichar_to_id(str, *lengths); + last_char = word->uch_set->unichar_to_id(str, *lengths); alpha_repetition_count = 1; lower_string_count = 1; break; } } - else if (unicharset.get_isdigit (str, *lengths)) { + else if (word->uch_set->get_isdigit (str, *lengths)) { total_digit_count++; switch (state) { case FIRST_NUM: @@ -894,56 +831,56 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { total_alpha_count += total_digit_count - isolated_digits; } - if (crunch_leave_ok_strings && - (len >= 4) && - (2 * (total_alpha_count - isolated_alphas) > len) && - (longest_alpha_repetition_count < crunch_long_repetitions)) { + if (crunch_leave_ok_strings && len >= 4 && + 2 * (total_alpha_count - isolated_alphas) > len && + longest_alpha_repetition_count < crunch_long_repetitions) { if ((crunch_accept_ok && - (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE)) || - (longest_lower_run_len > crunch_leave_lc_strings) || - (longest_upper_run_len > crunch_leave_uc_strings)) + acceptable_word_string(*word->uch_set, str, lengths) != + AC_UNACCEPTABLE) || + longest_lower_run_len > crunch_leave_lc_strings || + longest_upper_run_len > crunch_leave_uc_strings) return G_NEVER_CRUNCH; } - if ((word->reject_map.length () > 1) && - (strpbrk (str, " ") == NULL) && - ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) || - (word->best_choice->permuter () == FREQ_DAWG_PERM) || - (word->best_choice->permuter () == USER_DAWG_PERM) || - (word->best_choice->permuter () == NUMBER_PERM) || - (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE) || ok_dict_word)) + if (word->reject_map.length() > 1 && + strpbrk(str, " ") == NULL && + (word->best_choice->permuter() == SYSTEM_DAWG_PERM || + word->best_choice->permuter() == FREQ_DAWG_PERM || + word->best_choice->permuter() == USER_DAWG_PERM || + word->best_choice->permuter() == NUMBER_PERM || + acceptable_word_string(*word->uch_set, str, lengths) != + AC_UNACCEPTABLE || ok_dict_word)) return G_OK; ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs; if (crunch_debug > 3) { - tprintf ("garbage_word: \"%s\"\n", - word->best_choice->unichar_string().string()); - tprintf ("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", - len, - bad_char_count, isolated_digits, isolated_alphas, tess_rejs); + tprintf("garbage_word: \"%s\"\n", + word->best_choice->unichar_string().string()); + tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", + len, + bad_char_count, isolated_digits, isolated_alphas, tess_rejs); } - if ((bad_char_count == 0) && - (tess_rejs == 0) && - ((len > isolated_digits + isolated_alphas) || (len <= 2))) + if (bad_char_count == 0 && + tess_rejs == 0 && + (len > isolated_digits + 
isolated_alphas || len <= 2)) return G_OK; - if ((tess_rejs > ok_chars) || - ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len))) + if (tess_rejs > ok_chars || + (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) return G_TERRIBLE; if (len > 4) { - dodgy_chars = 2 * tess_rejs + bad_char_count + - isolated_digits + isolated_alphas; - if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5)) + dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + + isolated_alphas; + if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5) return G_DODGY; else return G_OK; - } - else { + } else { dodgy_chars = 2 * tess_rejs + bad_char_count; - if (((len == 4) && (dodgy_chars > 2)) || - ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len)) + if ((len == 4 && dodgy_chars > 2) || + (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) return G_DODGY; else return G_OK; @@ -982,15 +919,18 @@ CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, inT16 &delete_mode) { return CR_DELETE; } - box = word->rebuild_word->bounding_box(); - if (box.height () < crunch_del_min_ht * kBlnXHeight) { - delete_mode = 4; - return CR_DELETE; - } + if (word->rebuild_word != NULL) { + // Cube leaves rebuild_word NULL. + box = word->rebuild_word->bounding_box(); + if (box.height () < crunch_del_min_ht * kBlnXHeight) { + delete_mode = 4; + return CR_DELETE; + } - if (noise_outlines(word->rebuild_word)) { - delete_mode = 5; - return CR_DELETE; + if (noise_outlines(word->rebuild_word)) { + delete_mode = 5; + return CR_DELETE; + } } if ((failure_count (word) * 1.5) > word_len) { diff --git a/ccmain/fixspace.cpp b/ccmain/fixspace.cpp index 4c1adaf299..494e54d510 100644 --- a/ccmain/fixspace.cpp +++ b/ccmain/fixspace.cpp @@ -204,7 +204,8 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { word = word_it.data(); if ((!word->part_of_combo) && (word->box_word == NULL)) { - classify_word_pass2(word, block, row); + classify_word_and_language(&Tesseract::classify_word_pass2, + block, row, word); } prev_word_best_choice_ = word->best_choice; } @@ -347,7 +348,7 @@ BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) { for (i = 0, offset = 0; i < char_position; offset += word->best_choice->unichar_lengths()[i++]); return ( - unicharset.get_isdigit( + word->uch_set->get_isdigit( word->best_choice->unichar_string().string() + offset, word->best_choice->unichar_lengths()[i]) || (word->best_choice->permuter() == NUMBER_PERM && @@ -771,6 +772,9 @@ inT16 Tesseract::worst_noise_blob(WERD_RES *word_res, float small_limit = kBlnXHeight * fixsp_small_outlines_size; float non_noise_limit = kBlnXHeight * 0.8; + if (word_res->rebuild_word == NULL) + return -1; // Can't handle cube words. + TBLOB* blob = word_res->rebuild_word->blobs; // Normalised. int blob_count = word_res->box_word->length(); @@ -917,15 +921,17 @@ inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { word = word_it.data(); + if (word->rebuild_word == NULL) + continue; // Can't handle cube words. 
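     // Only words tesseract already trusts (done, tess_accepted, one of the
     // dawg permuters, or a safe dictionary word) get to vote in the
     // fixed-pitch spacing evaluation below.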
word_length = word->reject_map.length(); if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM || word->best_choice->permuter() == FREQ_DAWG_PERM || word->best_choice->permuter() == USER_DAWG_PERM || - safe_dict_word(*word->best_choice) > 0) { + safe_dict_word(word) > 0) { TBLOB* blob = word->rebuild_word->blobs; - UNICHAR_ID space = getDict().getUnicharset().unichar_to_id(" "); + UNICHAR_ID space = word->uch_set->unichar_to_id(" "); for (i = 0; i < word->best_choice->length() && blob != NULL; ++i, blob = blob->next) { if (word->best_choice->unichar_id(i) == space || diff --git a/ccmain/ltrresultiterator.cpp b/ccmain/ltrresultiterator.cpp new file mode 100644 index 0000000000..7659b7f3df --- /dev/null +++ b/ccmain/ltrresultiterator.cpp @@ -0,0 +1,369 @@ +/////////////////////////////////////////////////////////////////////// +// File: ltrresultiterator.cpp +// Description: Iterator for tesseract results in strict left-to-right +// order that avoids using tesseract internal data structures. +// Author: Ray Smith +// Created: Fri Feb 26 14:32:09 PST 2010 +// +// (C) Copyright 2010, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include "ltrresultiterator.h" + +#include "allheaders.h" +#include "pageres.h" +#include "strngs.h" +#include "tesseractclass.h" + +namespace tesseract { + +LTRResultIterator::LTRResultIterator(PAGE_RES* page_res, Tesseract* tesseract, + int scale, int scaled_yres, + int rect_left, int rect_top, + int rect_width, int rect_height) + : PageIterator(page_res, tesseract, scale, scaled_yres, + rect_left, rect_top, rect_width, rect_height), + line_separator_("\n"), + paragraph_separator_("\n") { +} + +LTRResultIterator::~LTRResultIterator() { +} + +// Returns the null terminated UTF-8 encoded text string for the current +// object at the given level. Use delete [] to free after use. +char* LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const { + if (it_->word() == NULL) return NULL; // Already at the end! + STRING text; + PAGE_RES_IT res_it(*it_); + WERD_CHOICE* best_choice = res_it.word()->best_choice; + ASSERT_HOST(best_choice != NULL); + if (level == RIL_SYMBOL) { + text = res_it.word()->BestUTF8(blob_index_, false); + } else if (level == RIL_WORD) { + text = best_choice->unichar_string(); + } else { + bool eol = false; // end of line? + bool eop = false; // end of paragraph? 
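+    // The nested loops below accumulate words into a line, lines into a
+    // paragraph, and paragraphs into the block; eol fires when forward()
+    // crosses a row boundary, eop when the block or paragraph changes.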
+ do { // for each paragraph in a block + do { // for each text line in a paragraph + do { // for each word in a text line + best_choice = res_it.word()->best_choice; + ASSERT_HOST(best_choice != NULL); + text += best_choice->unichar_string(); + text += " "; + res_it.forward(); + eol = res_it.row() != res_it.prev_row(); + } while (!eol); + text.truncate_at(text.length() - 1); + text += line_separator_; + eop = res_it.block() != res_it.prev_block() || + res_it.row()->row->para() != res_it.prev_row()->row->para(); + } while (level != RIL_TEXTLINE && !eop); + if (eop) text += paragraph_separator_; + } while (level == RIL_BLOCK && res_it.block() == res_it.prev_block()); + } + int length = text.length() + 1; + char* result = new char[length]; + strncpy(result, text.string(), length); + return result; +} + +// Set the string inserted at the end of each text line. "\n" by default. +void LTRResultIterator::SetLineSeparator(const char *new_line) { + line_separator_ = new_line; +} + +// Set the string inserted at the end of each paragraph. "\n" by default. +void LTRResultIterator::SetParagraphSeparator(const char *new_para) { + paragraph_separator_ = new_para; +} + +// Returns the mean confidence of the current object at the given level. +// The number should be interpreted as a percent probability. (0.0f-100.0f) +float LTRResultIterator::Confidence(PageIteratorLevel level) const { + if (it_->word() == NULL) return 0.0f; // Already at the end! + float mean_certainty = 0.0f; + int certainty_count = 0; + PAGE_RES_IT res_it(*it_); + WERD_CHOICE* best_choice = res_it.word()->best_choice; + ASSERT_HOST(best_choice != NULL); + switch (level) { + case RIL_BLOCK: + do { + best_choice = res_it.word()->best_choice; + ASSERT_HOST(best_choice != NULL); + mean_certainty += best_choice->certainty(); + ++certainty_count; + res_it.forward(); + } while (res_it.block() == res_it.prev_block()); + break; + case RIL_PARA: + do { + best_choice = res_it.word()->best_choice; + ASSERT_HOST(best_choice != NULL); + mean_certainty += best_choice->certainty(); + ++certainty_count; + res_it.forward(); + } while (res_it.block() == res_it.prev_block() && + res_it.row()->row->para() == res_it.prev_row()->row->para()); + break; + case RIL_TEXTLINE: + do { + best_choice = res_it.word()->best_choice; + ASSERT_HOST(best_choice != NULL); + mean_certainty += best_choice->certainty(); + ++certainty_count; + res_it.forward(); + } while (res_it.row() == res_it.prev_row()); + break; + case RIL_WORD: + mean_certainty += best_choice->certainty(); + ++certainty_count; + break; + case RIL_SYMBOL: + BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices(); + if (choices != NULL) { + BLOB_CHOICE_LIST_C_IT blob_choices_it(choices); + for (int blob = 0; blob < blob_index_; ++blob) + blob_choices_it.forward(); + BLOB_CHOICE_IT choice_it(blob_choices_it.data()); + for (choice_it.mark_cycle_pt(); + !choice_it.cycled_list(); + choice_it.forward()) { + if (choice_it.data()->unichar_id() == + best_choice->unichar_id(blob_index_)) + break; + } + mean_certainty += choice_it.data()->certainty(); + } else { + mean_certainty += best_choice->certainty(); + } + ++certainty_count; + } + if (certainty_count > 0) { + mean_certainty /= certainty_count; + float confidence = 100 + 5 * mean_certainty; + if (confidence < 0.0f) confidence = 0.0f; + if (confidence > 100.0f) confidence = 100.0f; + return confidence; + } + return 0.0f; +} + +// Returns the font attributes of the current word. 
If iterating at a higher
+// level object than words, eg textlines, then this will return the
+// attributes of the first word in that textline.
+// The actual return value is a string representing a font name. It points
+// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
+// the iterator itself, ie rendered invalid by various members of
+// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
+// Pointsize is returned in printers points (1/72 inch.)
+const char* LTRResultIterator::WordFontAttributes(bool* is_bold,
+                                                  bool* is_italic,
+                                                  bool* is_underlined,
+                                                  bool* is_monospace,
+                                                  bool* is_serif,
+                                                  bool* is_smallcaps,
+                                                  int* pointsize,
+                                                  int* font_id) const {
+  if (it_->word() == NULL) return NULL;  // Already at the end!
+  if (it_->word()->fontinfo == NULL) {
+    *font_id = -1;
+    return NULL;  // No font information.
+  }
+  const FontInfo& font_info = *it_->word()->fontinfo;
+  *font_id = font_info.universal_id;
+  *is_bold = font_info.is_bold();
+  *is_italic = font_info.is_italic();
+  *is_underlined = false;  // TODO(rays) fix this!
+  *is_monospace = font_info.is_fixed_pitch();
+  *is_serif = font_info.is_serif();
+  *is_smallcaps = it_->word()->small_caps;
+  float row_height = it_->row()->row->x_height() +
+      it_->row()->row->ascenders() - it_->row()->row->descenders();
+  // Convert from pixels to printers points.
+  *pointsize = scaled_yres_ > 0
+      ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5)
+      : 0;
+
+  return font_info.name;
+}
+
+// Returns the name of the language used to recognize this word.
+const char* LTRResultIterator::WordRecognitionLanguage() const {
+  if (it_->word() == NULL || it_->word()->tesseract == NULL) return NULL;
+  return it_->word()->tesseract->lang.string();
+}
+
+// Return the overall directionality of this word.
+StrongScriptDirection LTRResultIterator::WordDirection() const {
+  if (it_->word() == NULL) return DIR_NEUTRAL;
+  bool has_rtl = it_->word()->AnyRtlCharsInWord();
+  bool has_ltr = it_->word()->AnyLtrCharsInWord();
+  if (has_rtl && !has_ltr)
+    return DIR_RIGHT_TO_LEFT;
+  if (has_ltr && !has_rtl)
+    return DIR_LEFT_TO_RIGHT;
+  if (!has_ltr && !has_rtl)
+    return DIR_NEUTRAL;
+  return DIR_MIX;
+}
+
+// Returns true if the current word was found in a dictionary.
+bool LTRResultIterator::WordIsFromDictionary() const {
+  if (it_->word() == NULL) return false;  // Already at the end!
+  int permuter = it_->word()->best_choice->permuter();
+  return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
+      permuter == USER_DAWG_PERM;
+}
+
+// Returns true if the current word is numeric.
+bool LTRResultIterator::WordIsNumeric() const {
+  if (it_->word() == NULL) return false;  // Already at the end!
+  int permuter = it_->word()->best_choice->permuter();
+  return permuter == NUMBER_PERM;
+}
+
+// Returns true if the word contains blamer information.
+bool LTRResultIterator::HasBlamerInfo() const {
+  return (it_->word() != NULL && it_->word()->blamer_bundle != NULL &&
+          (it_->word()->blamer_bundle->debug.length() > 0 ||
+           it_->word()->blamer_bundle->misadaption_debug.length() > 0));
+}
+
+// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
+// of the current word.
+void *LTRResultIterator::GetParamsTrainingBundle() const {
+  return (it_->word() != NULL && it_->word()->blamer_bundle != NULL) ?
+      &(it_->word()->blamer_bundle->params_training_bundle) : NULL;
+}
+
+// Returns the pointer to the string with blamer information for this word.
+// Assumes that the word's blamer_bundle is not NULL.
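+// Callers should check HasBlamerInfo() before calling.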
+const char *LTRResultIterator::GetBlamerDebug() const {
+  return it_->word()->blamer_bundle->debug.string();
+}
+
+// Returns the pointer to the string with misadaption information for this
+// word. Assumes that the word's blamer_bundle is not NULL.
+const char *LTRResultIterator::GetBlamerMisadaptionDebug() const {
+  return it_->word()->blamer_bundle->misadaption_debug.string();
+}
+
+// Returns the null terminated UTF-8 encoded truth string for the current
+// word. Use delete [] to free after use.
+char* LTRResultIterator::WordTruthUTF8Text() const {
+  if (it_->word() == NULL) return NULL;  // Already at the end!
+  if (it_->word()->blamer_bundle == NULL ||
+      it_->word()->blamer_bundle->incorrect_result_reason == IRR_NO_TRUTH) {
+    return NULL;  // no truth information for this word
+  }
+  const GenericVector<STRING> &truth_vec =
+      it_->word()->blamer_bundle->truth_text;
+  STRING truth_text;
+  for (int i = 0; i < truth_vec.size(); ++i) truth_text += truth_vec[i];
+  int length = truth_text.length() + 1;
+  char* result = new char[length];
+  strncpy(result, truth_text.string(), length);
+  return result;
+}
+
+// Returns a pointer to serialized choice lattice.
+// Fills lattice_size with the number of bytes in lattice data.
+const char *LTRResultIterator::WordLattice(int *lattice_size) const {
+  if (it_->word() == NULL) return NULL;  // Already at the end!
+  if (it_->word()->blamer_bundle == NULL) return NULL;
+  *lattice_size = it_->word()->blamer_bundle->lattice_size;
+  return it_->word()->blamer_bundle->lattice_data;
+}
+
+// Returns true if the current symbol is a superscript.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+bool LTRResultIterator::SymbolIsSuperscript() const {
+  if (cblob_it_ == NULL && it_->word() != NULL)
+    return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUPERSCRIPT;
+  return false;
+}
+
+// Returns true if the current symbol is a subscript.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+bool LTRResultIterator::SymbolIsSubscript() const {
+  if (cblob_it_ == NULL && it_->word() != NULL)
+    return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUBSCRIPT;
+  return false;
+}
+
+// Returns true if the current symbol is a dropcap.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+bool LTRResultIterator::SymbolIsDropcap() const {
+  if (cblob_it_ == NULL && it_->word() != NULL)
+    return it_->word()->box_word->BlobPosition(blob_index_) == SP_DROPCAP;
+  return false;
+}
+
+ChoiceIterator::ChoiceIterator(const LTRResultIterator& result_it) {
+  ASSERT_HOST(result_it.it_->word() != NULL);
+  word_res_ = result_it.it_->word();
+  PAGE_RES_IT res_it(*result_it.it_);
+  WERD_CHOICE* best_choice = word_res_->best_choice;
+  BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
+  if (choices != NULL) {
+    BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
+    for (int blob = 0; blob < result_it.blob_index_; ++blob)
+      blob_choices_it.forward();
+    choice_it_ = new BLOB_CHOICE_IT(blob_choices_it.data());
+    choice_it_->mark_cycle_pt();
+  } else {
+    choice_it_ = NULL;
+  }
+}
+
+ChoiceIterator::~ChoiceIterator() {
+  delete choice_it_;
+}
+
+// Moves to the next choice for the symbol and returns false if there
+// are none left.
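+// Illustrative usage sketch (not part of this change): since construction
+// already points at the first choice, callers typically drain the list with
+// a do/while loop (symbol_level_result_it is a placeholder name for a
+// RIL_SYMBOL-positioned LTRResultIterator):
+//   ChoiceIterator choice_it(symbol_level_result_it);
+//   do {
+//     tprintf("%s\t%.2f\n", choice_it.GetUTF8Text(), choice_it.Confidence());
+//   } while (choice_it.Next());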
+bool ChoiceIterator::Next() {
+  if (choice_it_ == NULL)
+    return false;
+  choice_it_->forward();
+  return !choice_it_->cycled_list();
+}
+
+// Returns the null terminated UTF-8 encoded text string for the current
+// choice. The return points to an internal table and should NOT be deleted.
+const char* ChoiceIterator::GetUTF8Text() const {
+  if (choice_it_ == NULL)
+    return NULL;
+  UNICHAR_ID id = choice_it_->data()->unichar_id();
+  return word_res_->BestUTF8(id, false);
+}
+
+// Returns the confidence of the current choice.
+// The number should be interpreted as a percent probability. (0.0f-100.0f)
+float ChoiceIterator::Confidence() const {
+  if (choice_it_ == NULL)
+    return 0.0f;
+  float confidence = 100 + 5 * choice_it_->data()->certainty();
+  if (confidence < 0.0f) confidence = 0.0f;
+  if (confidence > 100.0f) confidence = 100.0f;
+  return confidence;
+}
+
+}  // namespace tesseract.
diff --git a/ccmain/ltrresultiterator.h b/ccmain/ltrresultiterator.h
new file mode 100644
index 0000000000..e563dd42ef
--- /dev/null
+++ b/ccmain/ltrresultiterator.h
@@ -0,0 +1,202 @@
+///////////////////////////////////////////////////////////////////////
+// File: ltrresultiterator.h
+// Description: Iterator for tesseract results in strict left-to-right
+// order that avoids using tesseract internal data structures.
+// Author: Ray Smith
+// Created: Fri Feb 26 11:01:06 PST 2010
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H__
+#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H__
+
+#include "pageiterator.h"
+#include "unicharset.h"
+
+class BLOB_CHOICE_IT;
+class WERD_RES;
+
+namespace tesseract {
+
+class Tesseract;
+
+// Class to iterate over tesseract results, providing access to all levels
+// of the page hierarchy, without including any tesseract headers or having
+// to handle any tesseract structures.
+// WARNING! This class points to data held within the TessBaseAPI class, and
+// therefore can only be used while the TessBaseAPI class still exists and
+// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
+// DetectOS, or anything else that changes the internal PAGE_RES.
+// See apitypes.h for the definition of PageIteratorLevel.
+// See also base class PageIterator, which contains the bulk of the interface.
+// LTRResultIterator adds text-specific methods for access to OCR output.
+
+class LTRResultIterator : public PageIterator {
+  friend class ChoiceIterator;
+ public:
+  // page_res and tesseract come directly from the BaseAPI.
+  // The rectangle parameters are copied indirectly from the Thresholder,
+  // via the BaseAPI. They represent the coordinates of some rectangle in an
+  // original image (in top-left-origin coordinates) and therefore the top-left
+  // needs to be added to any output boxes in order to specify coordinates
+  // in the original image. See TessBaseAPI::SetRectangle.
+ // The scale and scaled_yres are in case the Thresholder scaled the image + // rectangle prior to thresholding. Any coordinates in tesseract's image + // must be divided by scale before adding (rect_left, rect_top). + // The scaled_yres indicates the effective resolution of the binary image + // that tesseract has been given by the Thresholder. + // After the constructor, Begin has already been called. + LTRResultIterator(PAGE_RES* page_res, Tesseract* tesseract, + int scale, int scaled_yres, + int rect_left, int rect_top, + int rect_width, int rect_height); + virtual ~LTRResultIterator(); + + // LTRResultIterators may be copied! This makes it possible to iterate over + // all the objects at a lower level, while maintaining an iterator to + // objects at a higher level. These constructors DO NOT CALL Begin, so + // iterations will continue from the location of src. + // TODO: For now the copy constructor and operator= only need the base class + // versions, but if new data members are added, don't forget to add them! + + // ============= Moving around within the page ============. + + // See PageIterator. + + // ============= Accessing data ==============. + + // Returns the null terminated UTF-8 encoded text string for the current + // object at the given level. Use delete [] to free after use. + char* GetUTF8Text(PageIteratorLevel level) const; + + // Set the string inserted at the end of each text line. "\n" by default. + void SetLineSeparator(const char *new_line); + + // Set the string inserted at the end of each paragraph. "\n" by default. + void SetParagraphSeparator(const char *new_para); + + // Returns the mean confidence of the current object at the given level. + // The number should be interpreted as a percent probability. (0.0f-100.0f) + float Confidence(PageIteratorLevel level) const; + + // ============= Functions that refer to words only ============. + + // Returns the font attributes of the current word. If iterating at a higher + // level object than words, eg textlines, then this will return the + // attributes of the first word in that textline. + // The actual return value is a string representing a font name. It points + // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as + // the iterator itself, ie rendered invalid by various members of + // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI. + // Pointsize is returned in printers points (1/72 inch.) + const char* WordFontAttributes(bool* is_bold, + bool* is_italic, + bool* is_underlined, + bool* is_monospace, + bool* is_serif, + bool* is_smallcaps, + int* pointsize, + int* font_id) const; + + // Return the name of the language used to recognize this word. + // On error, NULL. Do not delete this pointer. + const char* WordRecognitionLanguage() const; + + // Return the overall directionality of this word. + StrongScriptDirection WordDirection() const; + + // Returns true if the current word was found in a dictionary. + bool WordIsFromDictionary() const; + + // Returns true if the current word is numeric. + bool WordIsNumeric() const; + + // Returns true if the word contains blamer information. + bool HasBlamerInfo() const; + + // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle + // of the current word. + void *GetParamsTrainingBundle() const; + + // Returns a pointer to the string with blamer information for this word. + // Assumes that the word's blamer_bundle is not NULL. 
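+  // Check HasBlamerInfo() before calling.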
+  const char *GetBlamerDebug() const;
+
+  // Returns a pointer to the string with misadaption information for this
+  // word. Assumes that the word's blamer_bundle is not NULL.
+  const char *GetBlamerMisadaptionDebug() const;
+
+  // Returns a null terminated UTF-8 encoded truth string for the current word.
+  // Use delete [] to free after use.
+  char* WordTruthUTF8Text() const;
+
+  // Returns a pointer to serialized choice lattice.
+  // Fills lattice_size with the number of bytes in lattice data.
+  const char *WordLattice(int *lattice_size) const;
+
+  // ============= Functions that refer to symbols only ============.
+
+  // Returns true if the current symbol is a superscript.
+  // If iterating at a higher level object than symbols, eg words, then
+  // this will return the attributes of the first symbol in that word.
+  bool SymbolIsSuperscript() const;
+  // Returns true if the current symbol is a subscript.
+  // If iterating at a higher level object than symbols, eg words, then
+  // this will return the attributes of the first symbol in that word.
+  bool SymbolIsSubscript() const;
+  // Returns true if the current symbol is a dropcap.
+  // If iterating at a higher level object than symbols, eg words, then
+  // this will return the attributes of the first symbol in that word.
+  bool SymbolIsDropcap() const;
+
+ protected:
+  const char *line_separator_;
+  const char *paragraph_separator_;
+};
+
+// Class to iterate over the classifier choices for a single RIL_SYMBOL.
+class ChoiceIterator {
+ public:
+  // Construction is from an LTRResultIterator that points to the symbol of
+  // interest. The ChoiceIterator allows a one-shot iteration over the
+  // choices for this symbol and after that it is useless.
+  explicit ChoiceIterator(const LTRResultIterator& result_it);
+  ~ChoiceIterator();
+
+  // Moves to the next choice for the symbol and returns false if there
+  // are none left.
+  bool Next();
+
+  // ============= Accessing data ==============.
+
+  // Returns the null terminated UTF-8 encoded text string for the current
+  // choice.
+  // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an
+  // internal structure and should NOT be freed with delete [] after use.
+  const char* GetUTF8Text() const;
+
+  // Returns the confidence of the current choice.
+  // The number should be interpreted as a percent probability. (0.0f-100.0f)
+  float Confidence() const;
+
+ private:
+  // Pointer to the WERD_RES object owned by the API.
+  WERD_RES* word_res_;
+  // Iterator over the blob choices.
+  BLOB_CHOICE_IT* choice_it_;
+};
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H__
diff --git a/ccmain/mutableiterator.h b/ccmain/mutableiterator.h
new file mode 100644
index 0000000000..f097f47e2b
--- /dev/null
+++ b/ccmain/mutableiterator.h
@@ -0,0 +1,64 @@
+///////////////////////////////////////////////////////////////////////
+// File: mutableiterator.h
+// Description: Iterator for tesseract results providing access to
+// both high-level API and Tesseract internal data structures.
+// Author: David Eger
+// Created: Thu Feb 24 19:01:06 PST 2011
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_CCMAIN_MUTABLEITERATOR_H__ +#define TESSERACT_CCMAIN_MUTABLEITERATOR_H__ + +#include "resultiterator.h" + +class BLOB_CHOICE_IT; + +namespace tesseract { + +class Tesseract; + +// Class to iterate over tesseract results, providing access to all levels +// of the page hierarchy, without including any tesseract headers or having +// to handle any tesseract structures. +// WARNING! This class points to data held within the TessBaseAPI class, and +// therefore can only be used while the TessBaseAPI class still exists and +// has not been subjected to a call of Init, SetImage, Recognize, Clear, End +// DetectOS, or anything else that changes the internal PAGE_RES. +// See apitypes.h for the definition of PageIteratorLevel. +// See also base class PageIterator, which contains the bulk of the interface. +// ResultIterator adds text-specific methods for access to OCR output. +// MutableIterator adds access to internal data structures. + +class MutableIterator : public ResultIterator { + public: + // See argument descriptions in ResultIterator() + MutableIterator(PAGE_RES* page_res, Tesseract* tesseract, + int scale, int scaled_yres, + int rect_left, int rect_top, + int rect_width, int rect_height) + : ResultIterator( + LTRResultIterator(page_res, tesseract, scale, scaled_yres, rect_left, + rect_top, rect_width, rect_height)) {} + virtual ~MutableIterator() {} + + // See PageIterator and ResultIterator for most calls. + + // Return access to Tesseract internals. + const PAGE_RES_IT *PageResIt() const { return it_; } +}; + +} // namespace tesseract. + +#endif // TESSERACT_CCMAIN_MUTABLEITERATOR_H__ diff --git a/ccmain/osdetect.cpp b/ccmain/osdetect.cpp index 0052468faf..c31c9b1b68 100644 --- a/ccmain/osdetect.cpp +++ b/ccmain/osdetect.cpp @@ -22,6 +22,7 @@ #include "blobbox.h" #include "blread.h" #include "colfind.h" +#include "fontinfo.h" #include "imagefind.h" #include "linefind.h" #include "oldlist.h" @@ -113,6 +114,49 @@ void OSResults::update_best_script(int orientation) { (first / second - 1.0) / (kScriptAcceptRatio - 1.0); } +int OSResults::get_best_script(int orientation_id) const { + int max_id = -1; + for (int j = 0; j < kMaxNumberOfScripts; ++j) { + const char *script = unicharset->get_script_from_script_id(j); + if (strcmp(script, "Common") && strcmp(script, "NULL")) { + if (max_id == -1 || + scripts_na[orientation_id][j] > scripts_na[orientation_id][max_id]) + max_id = j; + } + } + return max_id; +} + +// Print the script scores for all possible orientations. +void OSResults::print_scores(void) const { + for (int i = 0; i < 4; ++i) { + printf("Orientation id #%d", i); + print_scores(i); + } +} + +// Print the script scores for the given candidate orientation. 
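+// Only scripts with a nonzero score are listed, one per line.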
+void OSResults::print_scores(int orientation_id) const { + for (int j = 0; j < kMaxNumberOfScripts; ++j) { + if (scripts_na[orientation_id][j]) { + printf("%12s\t: %f\n", unicharset->get_script_from_script_id(j), + scripts_na[orientation_id][j]); + } + } +} + +// Accumulate scores with given OSResults instance and update the best script. +void OSResults::accumulate(const OSResults& osr) { + for (int i = 0; i < 4; ++i) { + orientations[i] += osr.orientations[i]; + for (int j = 0; j < kMaxNumberOfScripts; ++j) + scripts_na[i][j] += osr.scripts_na[i][j]; + } + unicharset = osr.unicharset; + update_best_orientation(); + update_best_script(best_result.orientation_id); +} + // Detect and erase horizontal/vertical lines and picture regions from the // image, so that non-text blobs are removed from consideration. void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks, @@ -123,18 +167,18 @@ void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks, int vertical_y = 1; tesseract::TabVector_LIST v_lines; tesseract::TabVector_LIST h_lines; - Boxa* boxa = NULL; - Pixa* pixa = NULL; const int kMinCredibleResolution = 70; int resolution = (kMinCredibleResolution > pixGetXRes(pix)) ? kMinCredibleResolution : pixGetXRes(pix); - tesseract::LineFinder::FindVerticalLines(resolution, pix, &vertical_x, - &vertical_y, &v_lines); - tesseract::LineFinder::FindHorizontalLines(resolution, pix, &h_lines); - tesseract::ImageFinder::FindImages(pix, &boxa, &pixa); - pixaDestroy(&pixa); - boxaDestroy(&boxa); + tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix, + &vertical_x, &vertical_y, + NULL, &v_lines, &h_lines); + Pix* im_pix = tesseract::ImageFind::FindImages(pix); + if (im_pix != NULL) { + pixSubtract(pix, pix, im_pix); + pixDestroy(&im_pix); + } tess->mutable_textord()->find_components(tess->pix_binary(), blocks, to_blocks); } @@ -309,8 +353,7 @@ bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o, 0.0f, static_cast(kBlnBaselineOffset)); TBLOB* rotated_blob = new TBLOB(*tblob); rotated_blob->Normalize(denorm); - tess->set_denorm(&denorm); - tess->AdaptiveClassifier(rotated_blob, ratings + i, NULL); + tess->AdaptiveClassifier(rotated_blob, denorm, ratings + i, NULL); delete rotated_blob; current_rotation.rotate(rotation90); } @@ -452,7 +495,7 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) { // Workaround for Fraktur if (prev_id == latin_id_) { if (prev_fontinfo_id >= 0) { - const FontInfo &fi = + const tesseract::FontInfo &fi = tess_->get_fontinfo_table().get(prev_fontinfo_id); //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name, // fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(), diff --git a/ccmain/osdetect.h b/ccmain/osdetect.h index f649b8a6a1..97bf599be7 100644 --- a/ccmain/osdetect.h +++ b/ccmain/osdetect.h @@ -45,7 +45,7 @@ struct OSBestResult { }; struct OSResults { - OSResults() { + OSResults() : unicharset(NULL) { for (int i = 0; i < 4; ++i) { for (int j = 0; j < kMaxNumberOfScripts; ++j) scripts_na[i][j] = 0; @@ -53,8 +53,19 @@ struct OSResults { } } void update_best_orientation(); + // Set the estimate of the orientation to the given id. void set_best_orientation(int orientation_id); + // Update/Compute the best estimate of the script assuming the given + // orientation id. void update_best_script(int orientation_id); + // Return the index of the script with the highest score for this orientation. 
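+  // The Common and NULL scripts are ignored; returns -1 if nothing scored.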
+ int get_best_script(int orientation_id) const; + // Accumulate scores with given OSResults instance and update the best script. + void accumulate(const OSResults& osr); + + // Print statistics. + void print_scores(void) const; + void print_scores(int orientation_id) const; // Array holding scores for each orientation id [0,3]. // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the diff --git a/ccmain/output.cpp b/ccmain/output.cpp index dce3d478cd..95ed7214e0 100644 --- a/ccmain/output.cpp +++ b/ccmain/output.cpp @@ -139,6 +139,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, char newline_type, // type of newline BOOL8 force_eol) { // override tilde crunch? WERD_RES *word = page_res_it.word(); + const UNICHARSET &uchset = *word->uch_set; STRING repetition_code; const STRING *wordstr; STRING wordstr_lengths; @@ -150,7 +151,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, char map_chs[32]; //Only for unlv_tilde_crunch int txt_index = 0; BOOL8 need_reject = FALSE; - UNICHAR_ID space = unicharset.unichar_to_id(" "); + UNICHAR_ID space = uchset.unichar_to_id(" "); if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->length() == 0) && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { @@ -219,7 +220,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, txt_chs[txt_index] = '\0'; map_chs[txt_index] = '\0'; ep_chars[ep_chars_index] = '\0'; // terminate string - word->ep_choice = new WERD_CHOICE(ep_chars, unicharset); + word->ep_choice = new WERD_CHOICE(ep_chars, uchset); if (force_eol) stats_.write_results_empty_block = true; @@ -247,10 +248,9 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices()); if (!blob_choices_it.empty()) delete blob_choices_it.extract(); } - word->best_choice->populate_unichars(getDict().getUnicharset()); + word->best_choice->populate_unichars(); word->reject_map.remove_pos (0); - delete word->box_word; - word->box_word = new BoxWord; + word->box_word->DeleteBox(0); } if (newline_type || (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) @@ -273,14 +273,14 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, check_debug_pt (word, 120); if (tessedit_rejection_debug) { tprintf ("Dict word: \"%s\": %d\n", - word->best_choice->debug_string(unicharset).string(), + word->best_choice->debug_string().string(), dict_word(*(word->best_choice))); } if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { repetition_code = "|^~R"; wordstr_lengths = "\001\001\001\001"; - repetition_code += unicharset.id_to_unichar(get_rep_char (word)); - wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word))); + repetition_code += uchset.id_to_unichar(get_rep_char(word)); + wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word))); wordstr = &repetition_code; } else { if (tessedit_zero_rejection) { @@ -355,7 +355,7 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? if (i < word->reject_map.length()) { return word->best_choice->unichar_id(i); } else { - return unicharset.unichar_to_id(unrecognised_char.string()); + return word->uch_set->unichar_to_id(unrecognised_char.string()); } } @@ -372,6 +372,7 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? 
void Tesseract::set_unlv_suspects(WERD_RES *word_res) { int len = word_res->reject_map.length(); const WERD_CHOICE &word = *(word_res->best_choice); + const UNICHARSET &uchset = *word.unicharset(); int i; float rating_per_ch; @@ -388,12 +389,12 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) { /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ - if (safe_dict_word(word) && + if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) { /* Unreject alphas in dictionary words */ for (i = 0; i < len; ++i) { if (word_res->reject_map[i].rejected() && - unicharset.get_isalpha(word.unichar_id(i))) + uchset.get_isalpha(word.unichar_id(i))) word_res->reject_map[i].setrej_minimal_rej_accept(); } } @@ -407,7 +408,7 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) { /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ for (i = 0; i < len; ++i) { if (word_res->reject_map[i].rejected() && - (!unicharset.eq(word.unichar_id(i), " "))) + (!uchset.eq(word.unichar_id(i), " "))) word_res->reject_map[i].setrej_minimal_rej_accept(); } } @@ -441,9 +442,10 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) { } } - if ((acceptable_word_string(word.unichar_string().string(), - word.unichar_lengths().string()) != - AC_UNACCEPTABLE) || + if (acceptable_word_string(*word_res->uch_set, + word.unichar_string().string(), + word.unichar_lengths().string()) != + AC_UNACCEPTABLE || acceptable_number_string(word.unichar_string().string(), word.unichar_lengths().string())) { if (word_res->reject_map.length() > suspect_short_words) { @@ -463,7 +465,7 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) { inT16 Tesseract::count_alphas(const WERD_CHOICE &word) { int count = 0; for (int i = 0; i < word.length(); ++i) { - if (unicharset.get_isalpha(word.unichar_id(i))) + if (word.unicharset()->get_isalpha(word.unichar_id(i))) count++; } return count; @@ -473,8 +475,8 @@ inT16 Tesseract::count_alphas(const WERD_CHOICE &word) { inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) { int count = 0; for (int i = 0; i < word.length(); ++i) { - if (unicharset.get_isalpha(word.unichar_id(i)) || - unicharset.get_isdigit(word.unichar_id(i))) + if (word.unicharset()->get_isalpha(word.unichar_id(i)) || + word.unicharset()->get_isdigit(word.unichar_id(i))) count++; } return count; @@ -493,7 +495,7 @@ BOOL8 Tesseract::acceptable_number_string(const char *s, s++; for (; *s != '\0'; s += *(lengths++)) { - if (unicharset.get_isdigit (s, *lengths)) + if (unicharset.get_isdigit(s, *lengths)) prev_digit = TRUE; else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) diff --git a/ccmain/pagesegmain.cpp b/ccmain/pagesegmain.cpp index eae958cafc..10b98924c9 100644 --- a/ccmain/pagesegmain.cpp +++ b/ccmain/pagesegmain.cpp @@ -28,28 +28,27 @@ #pragma warning(disable:4244) // Conversion warnings #endif -#include - // Include automatically generated configuration file if running autoconf. 
#ifdef HAVE_CONFIG_H #include "config_auto.h" #endif #include "allheaders.h" -#include "tesseractclass.h" -#include "img.h" #include "blobbox.h" -#include "linefind.h" -#include "imagefind.h" -#include "colfind.h" -#include "tabvector.h" #include "blread.h" -#include "wordseg.h" +#include "colfind.h" +#include "equationdetect.h" +#include "imagefind.h" +#include "img.h" +#include "linefind.h" #include "makerow.h" #include "osdetect.h" +#include "tabvector.h" +#include "tesseractclass.h" +#include "tessvars.h" #include "textord.h" #include "tordmain.h" -#include "tessvars.h" +#include "wordseg.h" namespace tesseract { @@ -110,10 +109,6 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, ASSERT_HOST(pix_binary_ != NULL); int width = pixGetWidth(pix_binary_); int height = pixGetHeight(pix_binary_); - int resolution = pixGetXRes(pix_binary_); - // Zero resolution messes up the algorithms, so make sure it is credible. - if (resolution < kMinCredibleResolution) - resolution = kDefaultResolution; // Get page segmentation mode. PageSegMode pageseg_mode = static_cast( static_cast(tessedit_pageseg_mode)); @@ -145,7 +140,7 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, TO_BLOCK_LIST to_blocks; if (osd_enabled || PSM_BLOCK_FIND_ENABLED(pageseg_mode)) { auto_page_seg_ret_val = - AutoPageSeg(resolution, single_column, osd_enabled, osd_only, + AutoPageSeg(single_column, osd_enabled, osd_only, blocks, &to_blocks, osd_tess, osr); if (osd_only) return auto_page_seg_ret_val; @@ -175,29 +170,29 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, textord_.TextordPage(pageseg_mode, width, height, pix_binary_, blocks, &to_blocks); - SetupWordScripts(blocks); return auto_page_seg_ret_val; } -// TODO(rays) This is a hack to set all the words with a default script. -// In the future this will be set by a preliminary pass over the document. -void Tesseract::SetupWordScripts(BLOCK_LIST* blocks) { - int script = unicharset.default_sid(); - bool has_x_height = unicharset.script_has_xheight(); - bool is_latin = script == unicharset.latin_sid(); - BLOCK_IT b_it(blocks); - for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { - ROW_IT r_it(b_it.data()->row_list()); - for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { - WERD_IT w_it(r_it.data()->word_list()); - for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { - WERD* word = w_it.data(); - word->set_script_id(script); - word->set_flag(W_SCRIPT_HAS_XHEIGHT, has_x_height); - word->set_flag(W_SCRIPT_IS_LATIN, is_latin); - } - } +// Helper writes a grey image to a file for use by scrollviewer. +// Normally for speed we don't display the image in the layout debug windows. +// If textord_debug_images is true, we draw the image as a background to some +// of the debug windows. printable determines whether these +// images are optimized for printing instead of screen display. +static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { + Pix* grey_pix = pixCreate(pixGetWidth(pix_binary), + pixGetHeight(pix_binary), 8); + // Printable images are light grey on white, but for screen display + // they are black on dark grey so the other colors show up well. 
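+  // Either way the binary text mask is painted onto a uniform background
+  // with pixSetMasked before the image is written out.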
+ if (printable) { + pixSetAll(grey_pix); + pixSetMasked(grey_pix, pix_binary, 192); + } else { + pixSetAllArbitrary(grey_pix, 64); + pixSetMasked(grey_pix, pix_binary, 0); } + AlignedBlob::IncrementDebugPix(); + pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG); + pixDestroy(&grey_pix); } @@ -214,119 +209,50 @@ void Tesseract::SetupWordScripts(BLOCK_LIST* blocks) { * into columns, but multiple blocks are still made if the text is of * non-uniform linespacing. * - * If osd is true, then orientation and script detection is performed as well. - * If only_osd is true, then only orientation and script detection is - * performed. If osr is desired, the osr_tess must be another Tesseract - * that was initialized especially for osd, and the results will be output - * into osr. + * If osd (orientation and script detection) is true then that is performed + * as well. If only_osd is true, then only orientation and script detection is + * performed. If osd is desired, (osd or only_osd) then osr_tess must be + * another Tesseract that was initialized especially for osd, and the results + * will be output into osr (orientation and script result). */ -int Tesseract::AutoPageSeg(int resolution, bool single_column, - bool osd, bool only_osd, +int Tesseract::AutoPageSeg(bool single_column, bool osd, bool only_osd, BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, Tesseract* osd_tess, OSResults* osr) { - int vertical_x = 0; - int vertical_y = 1; - TabVector_LIST v_lines; - TabVector_LIST h_lines; - ICOORD bleft(0, 0); - Boxa* boxa = NULL; - Pixa* pixa = NULL; + if (textord_debug_images) { + WriteDebugBackgroundImage(textord_debug_printable, pix_binary_); + } + Pix* photomask_pix = NULL; + Pix* musicmask_pix = NULL; // The blocks made by the ColumnFinder. Moved to blocks before return. BLOCK_LIST found_blocks; + TO_BLOCK_LIST temp_blocks; - if (pix_binary_ != NULL) { - if (textord_debug_images) { - Pix* grey_pix = pixCreate(pixGetWidth(pix_binary_), - pixGetHeight(pix_binary_), 8); - // Printable images are light grey on white, but for screen display - // they are black on dark grey so the other colors show up well. - if (textord_debug_printable) { - pixSetAll(grey_pix); - pixSetMasked(grey_pix, pix_binary_, 192); - } else { - pixSetAllArbitrary(grey_pix, 64); - pixSetMasked(grey_pix, pix_binary_, 0); - } - AlignedBlob::IncrementDebugPix(); - pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG); - pixDestroy(&grey_pix); + ColumnFinder* finder = SetupPageSegAndDetectOrientation( + single_column, osd, only_osd, blocks, osd_tess, osr, + &temp_blocks, &photomask_pix, &musicmask_pix); + if (finder != NULL) { + TO_BLOCK_IT to_block_it(&temp_blocks); + TO_BLOCK* to_block = to_block_it.data(); + if (musicmask_pix != NULL) { + // TODO(rays) pass the musicmask_pix into FindBlocks and mark music + // blocks separately. For now combine with photomask_pix. + pixOr(photomask_pix, photomask_pix, musicmask_pix); } - if (tessedit_dump_pageseg_images) { - pixWrite("tessinput.png", pix_binary_, IFF_PNG); + if (equ_detect_) { + finder->SetEquationDetect(equ_detect_); } - // Leptonica is used to find the lines and image regions in the input. 
- LineFinder::FindVerticalLines(resolution, pix_binary_, - &vertical_x, &vertical_y, &v_lines); - LineFinder::FindHorizontalLines(resolution, pix_binary_, &h_lines); - if (tessedit_dump_pageseg_images) - pixWrite("tessnolines.png", pix_binary_, IFF_PNG); - ImageFinder::FindImages(pix_binary_, &boxa, &pixa); - if (tessedit_dump_pageseg_images) - pixWrite("tessnoimages.png", pix_binary_, IFF_PNG); - if (single_column) - v_lines.clear(); - } - - TO_BLOCK_LIST port_blocks; - // The rest of the algorithm uses the usual connected components. - textord_.find_components(pix_binary_, blocks, &port_blocks); - - TO_BLOCK_IT to_block_it(&port_blocks); - ASSERT_HOST(!to_block_it.empty()); - for (to_block_it.mark_cycle_pt(); !to_block_it.cycled_list(); - to_block_it.forward()) { - TO_BLOCK* to_block = to_block_it.data(); - TBOX blkbox = to_block->block->bounding_box(); - if (to_block->line_size >= 2) { - // Note: if there are multiple blocks, then v_lines, boxa, and pixa - // are empty on the next iteration, but in this case, we assume - // that there aren't any interesting line separators or images, since - // it means that we have a pre-defined unlv zone file. - ColumnFinder finder(static_cast(to_block->line_size), - blkbox.botleft(), blkbox.topright(), resolution, - &v_lines, &h_lines, vertical_x, vertical_y); - BLOBNBOX_CLIST osd_blobs; - int osd_orientation = 0; - bool vertical_text = finder.IsVerticallyAlignedText(to_block, &osd_blobs); - if (osd && osd_tess != NULL && osr != NULL) { - os_detect_blobs(&osd_blobs, osr, osd_tess); - if (only_osd) continue; - osd_orientation = osr->best_result.orientation_id; - double osd_score = osr->orientations[osd_orientation]; - double osd_margin = min_orientation_margin * 2; - // tprintf("Orientation scores:"); - for (int i = 0; i < 4; ++i) { - if (i != osd_orientation && - osd_score - osr->orientations[i] < osd_margin) { - osd_margin = osd_score - osr->orientations[i]; - } - // tprintf(" %d:%f", i, osr->orientations[i]); - } - // tprintf("\n"); - if (osd_margin < min_orientation_margin) { - // Margin insufficient - dream up a suitable default. - if (vertical_text && (osd_orientation & 1)) - osd_orientation = 3; - else - osd_orientation = 0; - tprintf("Score margin insufficient:%.2f, using %d as a default\n", - osd_margin, osd_orientation); - } - } - osd_blobs.shallow_clear(); - finder.CorrectOrientation(to_block, vertical_text, osd_orientation); - if (finder.FindBlocks(single_column, pixGetHeight(pix_binary_), - to_block, boxa, pixa, &found_blocks, to_blocks) < 0) - return -1; - finder.GetDeskewVectors(&deskew_, &reskew_); - boxa = NULL; - pixa = NULL; + if (finder->FindBlocks(single_column, scaled_color_, scaled_factor_, + to_block, photomask_pix, + &found_blocks, to_blocks) < 0) { + pixDestroy(&photomask_pix); + pixDestroy(&musicmask_pix); + return -1; } + finder->GetDeskewVectors(&deskew_, &reskew_); + delete finder; } - boxaDestroy(&boxa); - pixaDestroy(&pixa); - if (only_osd) return 0; - + pixDestroy(&photomask_pix); + pixDestroy(&musicmask_pix); blocks->clear(); BLOCK_IT block_it(blocks); // Move the found blocks to the input/output blocks. @@ -339,4 +265,116 @@ int Tesseract::AutoPageSeg(int resolution, bool single_column, return 0; } +/** + * Sets up auto page segmentation, determines the orientation, and corrects it. + * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to + * facilitate testing. 
+ * photo_mask_pix is a pointer to a NULL pointer that will be filled on return + * with the leptonica photo mask, which must be pixDestroyed by the caller. + * to_blocks is an empty list that will be filled with (usually a single) + * block that is used during layout analysis. This ugly API is required + * because of the possibility of a unlv zone file. + * TODO(rays) clean this up. + * See AutoPageSeg for other arguments. + * The returned ColumnFinder must be deleted after use. + */ +ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation( + bool single_column, bool osd, bool only_osd, + BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr, + TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, Pix** music_mask_pix) { + int vertical_x = 0; + int vertical_y = 1; + TabVector_LIST v_lines; + TabVector_LIST h_lines; + ICOORD bleft(0, 0); + + ASSERT_HOST(pix_binary_ != NULL); + if (tessedit_dump_pageseg_images) { + pixWrite("tessinput.png", pix_binary_, IFF_PNG); + } + // Leptonica is used to find the rule/separator lines in the input. + LineFinder::FindAndRemoveLines(source_resolution_, + textord_tabfind_show_vlines, pix_binary_, + &vertical_x, &vertical_y, music_mask_pix, + &v_lines, &h_lines); + if (tessedit_dump_pageseg_images) + pixWrite("tessnolines.png", pix_binary_, IFF_PNG); + // Leptonica is used to find a mask of the photo regions in the input. + *photo_mask_pix = ImageFind::FindImages(pix_binary_); + if (tessedit_dump_pageseg_images) + pixWrite("tessnoimages.png", pix_binary_, IFF_PNG); + if (single_column) + v_lines.clear(); + + // The rest of the algorithm uses the usual connected components. + textord_.find_components(pix_binary_, blocks, to_blocks); + + TO_BLOCK_IT to_block_it(to_blocks); + // There must be exactly one input block. + // TODO(rays) handle new textline finding with a UNLV zone file. + ASSERT_HOST(to_blocks->singleton()); + TO_BLOCK* to_block = to_block_it.data(); + TBOX blkbox = to_block->block->bounding_box(); + ColumnFinder* finder = NULL; + + if (to_block->line_size >= 2) { + finder = new ColumnFinder(static_cast(to_block->line_size), + blkbox.botleft(), blkbox.topright(), + source_resolution_, + &v_lines, &h_lines, vertical_x, vertical_y); + + finder->SetupAndFilterNoise(*photo_mask_pix, to_block); + + if (equ_detect_) { + equ_detect_->LabelSpecialText(to_block); + } + + BLOBNBOX_CLIST osd_blobs; + // osd_orientation is the number of 90 degree rotations to make the + // characters upright. (See osdetect.h for precise definition.) + // We want the text lines horizontal, (vertical text indicates vertical + // textlines) which may conflict (eg vertically written CJK). + int osd_orientation = 0; + bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs); + if (osd && osd_tess != NULL && osr != NULL) { + os_detect_blobs(&osd_blobs, osr, osd_tess); + if (only_osd) { + delete finder; + return NULL; + } + osd_orientation = osr->best_result.orientation_id; + double osd_score = osr->orientations[osd_orientation]; + double osd_margin = min_orientation_margin * 2; + for (int i = 0; i < 4; ++i) { + if (i != osd_orientation && + osd_score - osr->orientations[i] < osd_margin) { + osd_margin = osd_score - osr->orientations[i]; + } + } + if (osd_margin < min_orientation_margin) { + // The margin is weak. 
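+        // Decide whether to trust the OSD result anyway: vertical text and
+        // CJK scripts make unusual orientations plausible, but a weak vote
+        // for upside-down horizontal non-CJK text is almost certainly noise.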
+        int best_script_id = osr->best_result.script_id;
+        bool cjk = (best_script_id == osd_tess->unicharset.han_sid()) ||
+            (best_script_id == osd_tess->unicharset.hiragana_sid()) ||
+            (best_script_id == osd_tess->unicharset.katakana_sid());
+
+        if (!cjk && !vertical_text && osd_orientation == 2) {
+          // Upside-down Latin text is improbable with such a weak margin.
+          tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
+                  "Don't rotate.\n", osd_margin);
+          osd_orientation = 0;
+        } else {
+          tprintf("OSD: Weak margin (%.2f) for %d blob text block, "
+                  "but using orientation anyway: %d\n",
+                  osd_margin, osd_blobs.length(), osd_orientation);
+        }
+      }
+    }
+    osd_blobs.shallow_clear();
+    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
+  }
+
+  return finder;
+}
+
 }  // namespace tesseract.
diff --git a/ccmain/paragraphs.cpp b/ccmain/paragraphs.cpp
new file mode 100644
index 0000000000..856513087b
--- /dev/null
+++ b/ccmain/paragraphs.cpp
@@ -0,0 +1,2444 @@
+/**********************************************************************
+ * File: paragraphs.cpp
+ * Description: Paragraph detection for tesseract.
+ * Author: David Eger
+ * Created: 25 February 2011
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <ctype.h>
+
+#include "genericvector.h"
+#include "helpers.h"
+#include "mutableiterator.h"
+#include "ocrpara.h"
+#include "pageres.h"
+#include "paragraphs.h"
+#include "paragraphs_internal.h"
+#include "publictypes.h"
+#include "ratngs.h"
+#include "rect.h"
+#include "statistc.h"
+#include "strngs.h"
+#include "tprintf.h"
+#include "unicharset.h"
+#include "unicodes.h"
+
+namespace tesseract {
+
+// The tab vectors for a given line should be ignored if both its tab vectors
+// are infrequent, specifically, if both tab vectors appear at most once per
+// kStrayLinePer lines in a block.
+const int kStrayLinePer = 6;
+
+// Special "weak" ParagraphModels.
+const ParagraphModel *kCrownLeft
+    = reinterpret_cast<const ParagraphModel *>(0xDEAD111F);
+const ParagraphModel *kCrownRight
+    = reinterpret_cast<const ParagraphModel *>(0xDEAD888F);
+
+// Given the width of a typical space between words, what is the threshold
+// by which we think left and right alignments for paragraphs can vary and
+// still be aligned.
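+// For example, a typical interword space of 10 pixels gives Epsilon = 8,
+// so indents differing by up to about 8 pixels are treated as equal.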
+static int Epsilon(int space_pix) {
+  return space_pix * 4 / 5;
+}
+
+template <typename T>
+void SimpleSwap(T &a, T &b) {
+  T c = a;
+  a = b;
+  b = c;
+}
+
+static bool AcceptableRowArgs(
+    int debug_level, int min_num_rows, const char *function_name,
+    const GenericVector<RowScratchRegisters> *rows,
+    int row_start, int row_end) {
+  if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
+    tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n",
+            row_start, row_end, rows->size());
+    return false;
+  }
+  if (row_end - row_start < min_num_rows) {
+    if (debug_level > 1) {
+      tprintf("# Too few rows[%d, %d) for %s.\n",
+              row_start, row_end, function_name);
+    }
+    return false;
+  }
+  return true;
+}
+
+// =============================== Debug Code ================================
+
+// Convert an integer to a decimal string.
+static STRING StrOf(int num) {
+  char buffer[30];
+  snprintf(buffer, sizeof(buffer), "%d", num);
+  return STRING(buffer);
+}
+
+// Given a row-major matrix of unicode text and a column separator, print
+// a formatted table. For ASCII, we get good column alignment.
+static void PrintTable(const GenericVector<GenericVector<STRING> > &rows,
+                       const STRING &colsep) {
+  GenericVector<int> max_col_widths;
+  for (int r = 0; r < rows.size(); r++) {
+    int num_columns = rows[r].size();
+    for (int c = 0; c < num_columns; c++) {
+      int num_unicodes = 0;
+      for (int i = 0; i < rows[r][c].size(); i++) {
+        if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
+      }
+      if (c >= max_col_widths.size()) {
+        max_col_widths.push_back(num_unicodes);
+      } else {
+        if (num_unicodes > max_col_widths[c])
+          max_col_widths[c] = num_unicodes;
+      }
+    }
+  }
+
+  GenericVector<STRING> col_width_patterns;
+  for (int c = 0; c < max_col_widths.size(); c++) {
+    col_width_patterns.push_back(
+        STRING("%-") + StrOf(max_col_widths[c]) + "s");
+  }
+
+  for (int r = 0; r < rows.size(); r++) {
+    for (int c = 0; c < rows[r].size(); c++) {
+      if (c > 0)
+        tprintf("%s", colsep.string());
+      tprintf(col_width_patterns[c].string(), rows[r][c].string());
+    }
+    tprintf("\n");
+  }
+}
+
+STRING RtlEmbed(const STRING &word, bool rtlify) {
+  if (rtlify)
+    return STRING(kRLE) + word + STRING(kPDF);
+  return word;
+}
+
+// Print the current thoughts of the paragraph detector.
+static void PrintDetectorState(const ParagraphTheory &theory,
+                               const GenericVector<RowScratchRegisters> &rows) {
+  GenericVector<GenericVector<STRING> > output;
+  output.push_back(GenericVector<STRING>());
+  output.back().push_back("#row");
+  output.back().push_back("space");
+  output.back().push_back("..");
+  output.back().push_back("lword[widthSEL]");
+  output.back().push_back("rword[widthSEL]");
+  RowScratchRegisters::AppendDebugHeaderFields(&output.back());
+  output.back().push_back("text");
+
+  for (int i = 0; i < rows.size(); i++) {
+    output.push_back(GenericVector<STRING>());
+    GenericVector<STRING> &row = output.back();
+    const RowInfo& ri = *rows[i].ri_;
+    row.push_back(StrOf(i));
+    row.push_back(StrOf(ri.average_interword_space));
+    row.push_back(ri.has_leaders ? ".." : " ");
+    row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
+                  "[" + StrOf(ri.lword_box.width()) +
+                  (ri.lword_likely_starts_idea ? "S" : "s") +
+                  (ri.lword_likely_ends_idea ? "E" : "e") +
+                  (ri.lword_indicates_list_item ? "L" : "l") +
+                  "]");
+    row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
+                  "[" + StrOf(ri.rword_box.width()) +
+                  (ri.rword_likely_starts_idea ? "S" : "s") +
+                  (ri.rword_likely_ends_idea ? "E" : "e") +
+                  (ri.rword_indicates_list_item ? "L" : "l") +
+                  "]");
+    rows[i].AppendDebugInfo(theory, &row);
+    row.push_back(RtlEmbed(ri.text, !ri.ltr));
+  }
+  PrintTable(output, " ");
+
+  tprintf("Active Paragraph Models:\n");
+  for (int m = 0; m < theory.models().size(); m++) {
+    tprintf(" %d: %s\n", m + 1, theory.models()[m]->ToString().string());
+  }
+}
+
+static void DebugDump(
+    bool should_print,
+    const STRING &phase,
+    const ParagraphTheory &theory,
+    const GenericVector<RowScratchRegisters> &rows) {
+  if (!should_print)
+    return;
+  tprintf("# %s\n", phase.string());
+  PrintDetectorState(theory, rows);
+}
+
+// Print out the text for rows[row_start, row_end)
+static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows,
+                          int row_start, int row_end) {
+  tprintf("======================================\n");
+  for (int row = row_start; row < row_end; row++) {
+    tprintf("%s\n", rows[row].ri_->text.string());
+  }
+  tprintf("======================================\n");
+}
+
+// ============= Brain Dead Language Model (ASCII Version) ===================
+
+bool IsLatinLetter(int ch) {
+  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
+}
+
+bool IsDigitLike(int ch) {
+  return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
+}
+
+bool IsOpeningPunct(int ch) {
+  return strchr("'\"({[", ch) != NULL;
+}
+
+bool IsTerminalPunct(int ch) {
+  return strchr(":'\".?!]})", ch) != NULL;
+}
+
+// Return a pointer past any leading characters from the given set (or
+// matching the given predicate).
+const char *SkipChars(const char *str, const char *toskip) {
+  while (*str != '\0' && strchr(toskip, *str)) { str++; }
+  return str;
+}
+
+const char *SkipChars(const char *str, bool (*skip)(int)) {
+  while (*str != '\0' && skip(*str)) { str++; }
+  return str;
+}
+
+const char *SkipOne(const char *str, const char *toskip) {
+  if (*str != '\0' && strchr(toskip, *str)) return str + 1;
+  return str;
+}
+
+// Return whether it is very likely that this is a numeral marker that could
+// start a list item. Some examples include:
+//   A  I  iii.  VI  (2)  3.5.  [C-4]
bool LikelyListNumeral(const STRING &word) {
+  const char *kRomans = "ivxlmdIVXLMD";
+  const char *kDigits = "012345789";
+  const char *kOpen = "[{(";
+  const char *kSep = ":;-.,";
+  const char *kClose = "]})";
+
+  int num_segments = 0;
+  const char *pos = word.string();
+  while (*pos != '\0' && num_segments < 3) {
+    // skip up to two open parens.
+    const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
+    const char *numeral_end = SkipChars(numeral_start, kRomans);
+    if (numeral_end != numeral_start) {
+      // Got Roman Numeral. Great.
+    } else {
+      numeral_end = SkipChars(numeral_start, kDigits);
+      if (numeral_end == numeral_start) {
+        // If there's a single latin letter, we can use that.
+        numeral_end = SkipChars(numeral_start, IsLatinLetter);
+        if (numeral_end - numeral_start != 1)
+          break;
+      }
+    }
+    // We got some sort of numeral.
+    num_segments++;
+    // Skip any trailing parens or punctuation.
+    pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
+    if (pos == numeral_end)
+      break;
+  }
+  return *pos == '\0';
+}
+
+bool LikelyListMark(const STRING &word) {
+  const char *kListMarks = "0Oo*.,+.";
+  return word.size() == 1 && strchr(kListMarks, word[0]) != NULL;
+}
+
+bool AsciiLikelyListItem(const STRING &word) {
+  return LikelyListMark(word) || LikelyListNumeral(word);
+}
+
+// ========== Brain Dead Language Model (Tesseract Version) ================
+
+// Return the first Unicode Codepoint from werd[pos].
+int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) { + if (!u || !werd || pos > werd->length()) + return 0; + return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni(); +} + +// A useful helper class for finding the first j >= i so that word[j] +// does not have given character type. +class UnicodeSpanSkipper { + public: + UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word) + : u_(unicharset), word_(word) { wordlen_ = word->length(); } + + // Given an input position, return the first position >= pos not punc. + int SkipPunc(int pos); + // Given an input position, return the first position >= pos not digit. + int SkipDigits(int pos); + // Given an input position, return the first position >= pos not roman. + int SkipRomans(int pos); + // Given an input position, return the first position >= pos not alpha. + int SkipAlpha(int pos); + + private: + const UNICHARSET *u_; + const WERD_CHOICE *word_; + int wordlen_; +}; + +int UnicodeSpanSkipper::SkipPunc(int pos) { + while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++; + return pos; +} + +int UnicodeSpanSkipper::SkipDigits(int pos) { + while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) || + IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++; + return pos; +} + +int UnicodeSpanSkipper::SkipRomans(int pos) { + const char *kRomans = "ivxlmdIVXLMD"; + while (pos < wordlen_) { + int ch = UnicodeFor(u_, word_, pos); + if (ch >= 0xF0 || strchr(kRomans, ch) == 0) break; + pos++; + } + return pos; +} + +int UnicodeSpanSkipper::SkipAlpha(int pos) { + while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++; + return pos; +} + +bool LikelyListMarkUnicode(int ch) { + if (ch < 0x80) { + STRING single_ch; + single_ch += ch; + return LikelyListMark(single_ch); + } + switch (ch) { + // TODO(eger) expand this list of unicodes as needed. + case 0x00B0: // degree sign + case 0x2022: // bullet + case 0x25E6: // white bullet + case 0x00B7: // middle dot + case 0x25A1: // white square + case 0x25A0: // black square + case 0x25AA: // black small square + case 0x2B1D: // black very small square + case 0x25BA: // black right-pointing pointer + case 0x25CF: // black circle + case 0x25CB: // white circle + return true; + default: + break; // fall through + } + return false; +} + +// Return whether it is very likely that this is a numeral marker that could +// start a list item. Some examples include: +// A I iii. VI (2) 3.5. [C-4] +bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) { + if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) + return true; + + UnicodeSpanSkipper m(u, werd); + int num_segments = 0; + int pos = 0; + while (pos < werd->length() && num_segments < 3) { + int numeral_start = m.SkipPunc(pos); + if (numeral_start > pos + 1) break; + int numeral_end = m.SkipRomans(numeral_start); + if (numeral_end == numeral_start) { + numeral_end = m.SkipDigits(numeral_start); + if (numeral_end == numeral_start) { + // If there's a single latin letter, we can use that. + numeral_end = m.SkipAlpha(numeral_start); + if (numeral_end - numeral_start != 1) + break; + } + } + // We got some sort of numeral. + num_segments++; + // Skip any trailing punctuation. 
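+    // If no punctuation follows, the numeral is not terminated; stop scanning.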
+ pos = m.SkipPunc(numeral_end); + if (pos == numeral_end) + break; + } + return pos == werd->length(); +} + +// ========= Brain Dead Language Model (combined entry points) ================ + +// Given the leftmost word of a line either as a Tesseract unicharset + werd +// or a utf8 string, set the following attributes for it: +// is_list - this word might be a list number or bullet. +// starts_idea - this word is likely to start a sentence. +// ends_idea - this word is likely to end a sentence. +void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, + const STRING &utf8, + bool *is_list, bool *starts_idea, bool *ends_idea) { + *is_list = false; + *starts_idea = false; + *ends_idea = false; + if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty + *ends_idea = true; + return; + } + + if (unicharset && werd) { // We have a proper werd and unicharset so use it. + if (UniLikelyListItem(unicharset, werd)) { + *is_list = true; + *starts_idea = true; + *ends_idea = true; + } + if (unicharset->get_isupper(werd->unichar_id(0))) { + *starts_idea = true; + } + if (unicharset->get_ispunctuation(werd->unichar_id(0))) { + *starts_idea = true; + *ends_idea = true; + } + } else { // Assume utf8 is mostly ASCII + if (AsciiLikelyListItem(utf8)) { + *is_list = true; + *starts_idea = true; + } + int start_letter = utf8[0]; + if (IsOpeningPunct(start_letter)) { + *starts_idea = true; + } + if (IsTerminalPunct(start_letter)) { + *ends_idea = true; + } + if (start_letter >= 'A' && start_letter <= 'Z') { + *starts_idea = true; + } + } +} + +// Given the rightmost word of a line either as a Tesseract unicharset + werd +// or a utf8 string, set the following attributes for it: +// is_list - this word might be a list number or bullet. +// starts_idea - this word is likely to start a sentence. +// ends_idea - this word is likely to end a sentence. +void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, + const STRING &utf8, + bool *is_list, bool *starts_idea, bool *ends_idea) { + *is_list = false; + *starts_idea = false; + *ends_idea = false; + if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty + *ends_idea = true; + return; + } + + if (unicharset && werd) { // We have a proper werd and unicharset so use it. 
+    if (UniLikelyListItem(unicharset, werd)) {
+      *is_list = true;
+      *starts_idea = true;
+    }
+    UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
+    if (unicharset->get_ispunctuation(last_letter)) {
+      *ends_idea = true;
+    }
+  } else {  // Assume utf8 is mostly ASCII
+    if (AsciiLikelyListItem(utf8)) {
+      *is_list = true;
+      *starts_idea = true;
+    }
+    int last_letter = utf8[utf8.size() - 1];
+    if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
+      *ends_idea = true;
+    }
+  }
+}
+
+// =============== Implementation of RowScratchRegisters =====================
+/* static */
+void RowScratchRegisters::AppendDebugHeaderFields(
+    GenericVector<STRING> *header) {
+  header->push_back("[lmarg,lind;rind,rmarg]");
+  header->push_back("model");
+}
+
+void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
+                                          GenericVector<STRING> *dbg) const {
+  char s[30];
+  snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]",
+           lmargin_, lindent_, rindent_, rmargin_);
+  dbg->push_back(s);
+  STRING model_string;
+  model_string += static_cast<char>(GetLineType());
+  model_string += ":";
+
+  int model_numbers = 0;
+  for (int h = 0; h < hypotheses_.size(); h++) {
+    if (hypotheses_[h].model == NULL)
+      continue;
+    if (model_numbers > 0)
+      model_string += ",";
+    if (StrongModel(hypotheses_[h].model)) {
+      model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
+    } else if (hypotheses_[h].model == kCrownLeft) {
+      model_string += "CrL";
+    } else if (hypotheses_[h].model == kCrownRight) {
+      model_string += "CrR";
+    }
+    model_numbers++;
+  }
+  if (model_numbers == 0)
+    model_string += "0";
+
+  dbg->push_back(model_string);
+}
+
+void RowScratchRegisters::Init(const RowInfo &row) {
+  ri_ = &row;
+  lmargin_ = 0;
+  lindent_ = row.pix_ldistance;
+  rmargin_ = 0;
+  rindent_ = row.pix_rdistance;
+}
+
+LineType RowScratchRegisters::GetLineType() const {
+  if (hypotheses_.empty())
+    return LT_UNKNOWN;
+  bool has_start = false;
+  bool has_body = false;
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    switch (hypotheses_[i].ty) {
+      case LT_START: has_start = true; break;
+      case LT_BODY: has_body = true; break;
+      default:
+        tprintf("Encountered bad value in hypothesis list: %c\n",
+                hypotheses_[i].ty);
+        break;
+    }
+  }
+  if (has_start && has_body)
+    return LT_MULTIPLE;
+  return has_start ? LT_START : LT_BODY;
+}
+
+LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {
+  if (hypotheses_.empty())
+    return LT_UNKNOWN;
+  bool has_start = false;
+  bool has_body = false;
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    if (hypotheses_[i].model != model)
+      continue;
+    switch (hypotheses_[i].ty) {
+      case LT_START: has_start = true; break;
+      case LT_BODY: has_body = true; break;
+      default:
+        tprintf("Encountered bad value in hypothesis list: %c\n",
+                hypotheses_[i].ty);
+        break;
+    }
+  }
+  if (has_start && has_body)
+    return LT_MULTIPLE;
+  return has_start ?
LT_START : LT_BODY; +} + +void RowScratchRegisters::SetStartLine() { + LineType current_lt = GetLineType(); + if (current_lt != LT_UNKNOWN && current_lt != LT_START) { + tprintf("Trying to set a line to be START when it's already BODY.\n"); + } + if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) { + hypotheses_.push_back_new(LineHypothesis(LT_START, NULL)); + } +} + +void RowScratchRegisters::SetBodyLine() { + LineType current_lt = GetLineType(); + if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) { + tprintf("Trying to set a line to be BODY when it's already START.\n"); + } + if (current_lt == LT_UNKNOWN || current_lt == LT_START) { + hypotheses_.push_back_new(LineHypothesis(LT_BODY, NULL)); + } +} + +void RowScratchRegisters::AddStartLine(const ParagraphModel *model) { + hypotheses_.push_back_new(LineHypothesis(LT_START, model)); + int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, NULL)); + if (old_idx >= 0) + hypotheses_.remove(old_idx); +} + +void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) { + hypotheses_.push_back_new(LineHypothesis(LT_BODY, model)); + int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, NULL)); + if (old_idx >= 0) + hypotheses_.remove(old_idx); +} + +void RowScratchRegisters::StartHypotheses(SetOfModels *models) const { + for (int h = 0; h < hypotheses_.size(); h++) { + if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model)) + models->push_back_new(hypotheses_[h].model); + } +} + +void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const { + for (int h = 0; h < hypotheses_.size(); h++) { + if (StrongModel(hypotheses_[h].model)) + models->push_back_new(hypotheses_[h].model); + } +} + +void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const { + for (int h = 0; h < hypotheses_.size(); h++) { + if (hypotheses_[h].model != NULL) + models->push_back_new(hypotheses_[h].model); + } +} + +const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const { + if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) + return NULL; + return hypotheses_[0].model; +} + +const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const { + if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) + return NULL; + return hypotheses_[0].model; +} + +// Discard any hypotheses whose model is not in the given list. +void RowScratchRegisters::DiscardNonMatchingHypotheses( + const SetOfModels &models) { + if (models.empty()) + return; + for (int h = hypotheses_.size() - 1; h >= 0; h--) { + if (!models.contains(hypotheses_[h].model)) { + hypotheses_.remove(h); + } + } +} + +// ============ Geometry based Paragraph Detection Algorithm ================= + +struct Cluster { + Cluster() : center(0), count(0) {} + Cluster(int cen, int num) : center(cen), count(num) {} + + int center; // The center of the cluster. + int count; // The number of entries within the cluster. +}; + +class SimpleClusterer { + public: + explicit SimpleClusterer(int max_cluster_width) + : max_cluster_width_(max_cluster_width) {} + void Add(int value) { values_.push_back(value); } + int size() const { return values_.size(); } + void GetClusters(GenericVector *clusters); + + private: + int max_cluster_width_; + GenericVectorEqEq values_; +}; + +// Return the index of the cluster closest to value. 
+int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
+  int best_index = 0;
+  for (int i = 0; i < clusters.size(); i++) {
+    if (abs(value - clusters[i].center) <
+        abs(value - clusters[best_index].center))
+      best_index = i;
+  }
+  return best_index;
+}
+
+void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
+  clusters->clear();
+  values_.sort();
+  for (int i = 0; i < values_.size();) {
+    int orig_i = i;
+    int lo = values_[i];
+    int hi = lo;
+    while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
+      hi = values_[i];
+    }
+    clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
+  }
+}
+
+// Calculate left- and right-indent tab stop values seen in
+// rows[row_start, row_end) given a tolerance of tolerance.
+void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
+                       int row_start, int row_end,
+                       int tolerance,
+                       GenericVector<Cluster> *left_tabs,
+                       GenericVector<Cluster> *right_tabs) {
+  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
+    return;
+  // First pass: toss all left and right indents into clusterers.
+  SimpleClusterer initial_lefts(tolerance);
+  SimpleClusterer initial_rights(tolerance);
+  GenericVector<Cluster> initial_left_tabs;
+  GenericVector<Cluster> initial_right_tabs;
+  for (int i = row_start; i < row_end; i++) {
+    initial_lefts.Add((*rows)[i].lindent_);
+    initial_rights.Add((*rows)[i].rindent_);
+  }
+  initial_lefts.GetClusters(&initial_left_tabs);
+  initial_rights.GetClusters(&initial_right_tabs);
+
+  // Second pass: cluster only lines that are not "stray"
+  //   An example of a stray line is a page number -- a line whose start
+  //   and end tab-stops are far outside the typical start and end tab-stops
+  //   for the block.
+  //   Put another way, we only cluster data from lines whose start or end
+  //   tab stop is frequent.
+  SimpleClusterer lefts(tolerance);
+  SimpleClusterer rights(tolerance);
+  int infrequent_enough_to_ignore = (row_end - row_start) / kStrayLinePer;
+  for (int i = row_start; i < row_end; i++) {
+    int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
+    int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
+    if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
+        initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
+      lefts.Add((*rows)[i].lindent_);
+      rights.Add((*rows)[i].rindent_);
+    }
+  }
+  lefts.GetClusters(left_tabs);
+  rights.GetClusters(right_tabs);
+}
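+
// Illustrative-only standalone sketch (not part of the patch) of the greedy
// clustering pass in SimpleClusterer::GetClusters() above, as used by
// CalculateTabStops(): sort the values, sweep left to right, and close a
// cluster as soon as the next value lies more than max_cluster_width past
// the cluster's lowest member. All numbers here are made up.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> values = {50, 0, 1, 49, 0, 2, 1};  // e.g. left indents, px
  const int max_cluster_width = 4;
  std::sort(values.begin(), values.end());
  for (size_t i = 0; i < values.size();) {
    size_t orig_i = i;
    int lo = values[i];
    int hi = lo;
    while (++i < values.size() && values[i] <= lo + max_cluster_width) {
      hi = values[i];
    }
    // Prints "center 1 count 5" then "center 49 count 2": two tab stops.
    printf("center %d count %d\n", (hi + lo) / 2,
           static_cast<int>(i - orig_i));
  }
  return 0;
}

+// Given a paragraph model, mark rows[row_start, row_end) as start or body
+// lines of said model.
+//
+// Case 1: model->first_indent_ != model->body_indent_
+//   Differentiating the paragraph start lines from the paragraph body lines
+//   in this case is easy: we just see how far each line is indented.
+//
+// Case 2: model->first_indent_ == model->body_indent_
+//   Here, we find end-of-paragraph lines by looking for "short lines."
+//   What constitutes a "short line" changes depending on whether the text
+//   is ragged-right[left] or fully justified (aligned left and right).
+//
+// Case 2a: Ragged Right (or Left) text.  (eop_threshold == 0)
+//   We have a new paragraph if the first word of this line would have fit
+//   at the end of the previous line.
+//
+// Case 2b: Fully Justified.  (eop_threshold > 0)
+//   We mark a line as short (end of paragraph) if the offside indent
+//   is greater than eop_threshold.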
+void MarkRowsWithModel(GenericVector *rows, + int row_start, int row_end, + const ParagraphModel *model, + bool ltr, + int eop_threshold) { + if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) + return; + for (int row = row_start; row < row_end; row++) { + bool valid_first = ValidFirstLine(rows, row, model); + bool valid_body = ValidBodyLine(rows, row, model); + if (valid_first && !valid_body) { + (*rows)[row].AddStartLine(model); + } else if (valid_body && !valid_first) { + (*rows)[row].AddBodyLine(model); + } else if (valid_body && valid_first) { + bool after_eop = (row == row_start); + if (row > row_start) { + if (eop_threshold > 0) { + if (model->justification() == JUSTIFICATION_LEFT) { + after_eop = (*rows)[row - 1].rindent_ > eop_threshold; + } else { + after_eop = (*rows)[row - 1].lindent_ > eop_threshold; + } + } else { + after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], + model->justification()); + } + } + if (after_eop) { + (*rows)[row].AddStartLine(model); + } else { + (*rows)[row].AddBodyLine(model); + } + } else { + // Do nothing. Stray row. + } + } +} + +// GeometricClassifierState holds all of the information we'll use while +// trying to determine a paragraph model for the text lines in a block of +// text: +// + the rows under consideration [row_start, row_end) +// + the common left- and right-indent tab stops +// + does the block start out left-to-right or right-to-left +// Further, this struct holds the data we amass for the (single) ParagraphModel +// we'll assign to the text lines (assuming we get that far). +struct GeometricClassifierState { + GeometricClassifierState(int dbg_level, + GenericVector *r, + int r_start, int r_end) + : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end), + margin(0) { + tolerance = InterwordSpace(*r, r_start, r_end); + CalculateTabStops(r, r_start, r_end, tolerance, + &left_tabs, &right_tabs); + ltr = (*r)[r_start].ri_->ltr; + } + + void AssumeLeftJustification() { + just = tesseract::JUSTIFICATION_LEFT; + margin = (*rows)[row_start].lmargin_; + } + + void AssumeRightJustification() { + just = tesseract::JUSTIFICATION_RIGHT; + margin = (*rows)[row_start].rmargin_; + } + + // Align tabs are the tab stops the text is aligned to. + const GenericVector &AlignTabs() const { + if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs; + return left_tabs; + } + + // Offside tabs are the tab stops opposite the tabs used to align the text. + // + // Note that for a left-to-right text which is aligned to the right such as + // this function comment, the offside tabs are the horizontal tab stops + // marking the beginning of ("Note", "this" and "marking"). + const GenericVector &OffsideTabs() const { + if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs; + return right_tabs; + } + + // Return whether the i'th row extends from the leftmost left tab stop + // to the right most right tab stop. + bool IsFullRow(int i) const { + return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 && + ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0; + } + + int AlignsideTabIndex(int row_idx) const { + return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just)); + } + + // Given what we know about the paragraph justification (just), would the + // first word of row_b have fit at the end of row_a? 
+ bool FirstWordWouldHaveFit(int row_a, int row_b) { + return ::tesseract::FirstWordWouldHaveFit( + (*rows)[row_a], (*rows)[row_b], just); + } + + void PrintRows() const { PrintRowRange(*rows, row_start, row_end); } + + void Fail(int min_debug_level, const char *why) const { + if (debug_level < min_debug_level) return; + tprintf("# %s\n", why); + PrintRows(); + } + + ParagraphModel Model() const { + return ParagraphModel(just, margin, first_indent, body_indent, tolerance); + } + + // We print out messages with a debug level at least as great as debug_level. + int debug_level; + + // The Geometric Classifier was asked to find a single paragraph model + // to fit the text rows (*rows)[row_start, row_end) + GenericVector *rows; + int row_start; + int row_end; + + // The amount by which we expect the text edge can vary and still be aligned. + int tolerance; + + // Is the script in this text block left-to-right? + // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve + bool ltr; + + // These left and right tab stops were determined to be the common tab + // stops for the given text. + GenericVector left_tabs; + GenericVector right_tabs; + + // These are parameters we must determine to create a ParagraphModel. + tesseract::ParagraphJustification just; + int margin; + int first_indent; + int body_indent; + + // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel() + int eop_threshold; +}; + +// Given a section of text where strong textual clues did not help identifying +// paragraph breaks, and for which the left and right indents have exactly +// three tab stops between them, attempt to find the paragraph breaks based +// solely on the outline of the text and whether the script is left-to-right. +// +// Algorithm Detail: +// The selected rows are in the form of a rectangle except +// for some number of "short lines" of the same length: +// +// (A1) xxxxxxxxxxxxx (B1) xxxxxxxxxxxx +// xxxxxxxxxxx xxxxxxxxxx # A "short" line. +// xxxxxxxxxxxxx xxxxxxxxxxxx +// xxxxxxxxxxxxx xxxxxxxxxxxx +// +// We have a slightly different situation if the only short +// line is at the end of the excerpt. +// +// (A2) xxxxxxxxxxxxx (B2) xxxxxxxxxxxx +// xxxxxxxxxxxxx xxxxxxxxxxxx +// xxxxxxxxxxxxx xxxxxxxxxxxx +// xxxxxxxxxxx xxxxxxxxxx # A "short" line. +// +// We'll interpret these as follows based on the reasoning in the comment for +// GeometricClassify(): +// [script direction: first indent, body indent] +// (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0 +// (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0 +void GeometricClassifyThreeTabStopTextBlock( + int debug_level, + GeometricClassifierState &s, + ParagraphTheory *theory) { + int num_rows = s.row_end - s.row_start; + int num_full_rows = 0; + int last_row_full = 0; + for (int i = s.row_start; i < s.row_end; i++) { + if (s.IsFullRow(i)) { + num_full_rows++; + if (i == s.row_end - 1) last_row_full++; + } + } + + if (num_full_rows < 0.7 * num_rows) { + s.Fail(1, "Not enough full lines to know which lines start paras."); + return; + } + + // eop_threshold gets set if we're fully justified; see MarkRowsWithModel() + s.eop_threshold = 0; + + if (s.ltr) { + s.AssumeLeftJustification(); + } else { + s.AssumeRightJustification(); + } + + if (debug_level > 0) { + tprintf("# Not enough variety for clear outline classification. " + "Guessing these are %s aligned based on script.\n", + s.ltr ? 
"left" : "right"); + s.PrintRows(); + } + + if (s.AlignTabs().size() == 2) { // case A1 or A2 + s.first_indent = s.AlignTabs()[1].center; + s.body_indent = s.AlignTabs()[0].center; + } else { // case B1 or B2 + if (num_rows - 1 == num_full_rows - last_row_full) { + // case B2 + const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight; + (*s.rows)[s.row_start].AddStartLine(model); + for (int i = s.row_start + 1; i < s.row_end; i++) { + (*s.rows)[i].AddBodyLine(model); + } + return; + } else { + // case B1 + s.first_indent = s.body_indent = s.AlignTabs()[0].center; + s.eop_threshold = (s.OffsideTabs()[0].center + + s.OffsideTabs()[1].center) / 2; + } + } + const ParagraphModel *model = theory->AddModel(s.Model()); + MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, + s.ltr, s.eop_threshold); + return; +} + +// This function is called if strong textual clues were not available, but +// the caller hopes that the paragraph breaks will be super obvious just +// by the outline of the text. +// +// The particularly difficult case is figuring out what's going on if you +// don't have enough short paragraph end lines to tell us what's going on. +// +// For instance, let's say you have the following outline: +// +// (A1) xxxxxxxxxxxxxxxxxxxxxx +// xxxxxxxxxxxxxxxxxxxx +// xxxxxxxxxxxxxxxxxxxxxx +// xxxxxxxxxxxxxxxxxxxxxx +// +// Even if we know that the text is left-to-right and so will probably be +// left-aligned, both of the following are possible texts: +// +// (A1a) 1. Here our list item +// with two full lines. +// 2. Here a second item. +// 3. Here our third one. +// +// (A1b) so ends paragraph one. +// Here starts another +// paragraph we want to +// read. This continues +// +// These examples are obvious from the text and should have been caught +// by the StrongEvidenceClassify pass. However, for languages where we don't +// have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese), +// it's worth guessing that (A1b) is the correct interpretation if there are +// far more "full" lines than "short" lines. +void GeometricClassify(int debug_level, + GenericVector *rows, + int row_start, int row_end, + ParagraphTheory *theory) { + if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) + return; + if (debug_level > 1) { + tprintf("###############################################\n"); + tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n", + row_start, row_end); + tprintf("###############################################\n"); + } + RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10); + + GeometricClassifierState s(debug_level, rows, row_start, row_end); + if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) { + s.Fail(2, "Too much variety for simple outline classification."); + return; + } + if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) { + s.Fail(1, "Not enough variety for simple outline classification."); + return; + } + if (s.left_tabs.size() + s.right_tabs.size() == 3) { + GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory); + return; + } + + // At this point, we know that one side has at least two tab stops, and the + // other side has one or two tab stops. + // Left to determine: + // (1) Which is the body indent and which is the first line indent? + // (2) Is the text fully justified? + + // If one side happens to have three or more tab stops, assume that side + // is opposite of the aligned side. 
+ if (s.right_tabs.size() > 2) { + s.AssumeLeftJustification(); + } else if (s.left_tabs.size() > 2) { + s.AssumeRightJustification(); + } else if (s.ltr) { // guess based on script direction + s.AssumeLeftJustification(); + } else { + s.AssumeRightJustification(); + } + + if (s.AlignTabs().size() == 2) { + // For each tab stop on the aligned side, how many of them appear + // to be paragraph start lines? [first lines] + int firsts[2] = {0, 0}; + // Count the first line as a likely paragraph start line. + firsts[s.AlignsideTabIndex(s.row_start)]++; + // For each line, if the first word would have fit on the previous + // line count it as a likely paragraph start line. + for (int i = s.row_start + 1; i < s.row_end; i++) { + if (s.FirstWordWouldHaveFit(i - 1, i)) { + firsts[s.AlignsideTabIndex(i)]++; + } + } + // Make an extra accounting for the last line of the paragraph just + // in case it's the only short line in the block. That is, take its + // first word as typical and see if this looks like the *last* line + // of a paragraph. If so, mark the *other* indent as probably a first. + if (s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) { + firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++; + } + + int percent0firsts, percent1firsts; + percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count; + percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count; + + // TODO(eger): Tune these constants if necessary. + if ((percent0firsts < 20 && 30 < percent1firsts) || + percent0firsts + 30 < percent1firsts) { + s.first_indent = s.AlignTabs()[1].center; + s.body_indent = s.AlignTabs()[0].center; + } else if ((percent1firsts < 20 && 30 < percent0firsts) || + percent1firsts + 30 < percent0firsts) { + s.first_indent = s.AlignTabs()[0].center; + s.body_indent = s.AlignTabs()[1].center; + } else { + // Ambiguous! Probably lineated (poetry) + if (debug_level > 1) { + tprintf("# Cannot determine %s indent likely to start paragraphs.\n", + s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right"); + tprintf("# Indent of %d looks like a first line %d%% of the time.\n", + s.AlignTabs()[0].center, percent0firsts); + tprintf("# Indent of %d looks like a first line %d%% of the time.\n", + s.AlignTabs()[1].center, percent1firsts); + s.PrintRows(); + } + return; + } + } else { + // There's only one tab stop for the "aligned to" side. + s.first_indent = s.body_indent = s.AlignTabs()[0].center; + } + + // At this point, we have our model. + const ParagraphModel *model = theory->AddModel(s.Model()); + + // Now all we have to do is figure out if the text is fully justified or not. + // eop_threshold: default to fully justified unless we see evidence below. + // See description on MarkRowsWithModel() + s.eop_threshold = + (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2; + // If the text is not fully justified, re-set the eop_threshold to 0. + if (s.AlignTabs().size() == 2) { + // Paragraphs with a paragraph-start indent. + for (int i = s.row_start; i < s.row_end - 1; i++) { + if (ValidFirstLine(s.rows, i + 1, model) && + !NearlyEqual(s.OffsideTabs()[0].center, + (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) { + // We found a non-end-of-paragraph short line: not fully justified. + s.eop_threshold = 0; + break; + } + } + } else { + // Paragraphs with no paragraph-start indent. 
+ for (int i = s.row_start; i < s.row_end - 1; i++) { + if (!s.FirstWordWouldHaveFit(i, i + 1) && + !NearlyEqual(s.OffsideTabs()[0].center, + (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) { + // We found a non-end-of-paragraph short line: not fully justified. + s.eop_threshold = 0; + break; + } + } + } + MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold); +} + +// =============== Implementation of ParagraphTheory ===================== + +const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) { + for (int i = 0; i < models_->size(); i++) { + if ((*models_)[i]->Comparable(model)) + return (*models_)[i]; + } + ParagraphModel *m = new ParagraphModel(model); + models_->push_back(m); + models_we_added_.push_back_new(m); + return m; +} + +void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) { + for (int i = models_->size() - 1; i >= 0; i--) { + ParagraphModel *m = (*models_)[i]; + if (!used_models.contains(m) && models_we_added_.contains(m)) { + delete m; + models_->remove(i); + models_we_added_.remove(models_we_added_.get_index(m)); + } + } +} + +// Examine rows[start, end) and try to determine if an existing non-centered +// paragraph model would fit them perfectly. If so, return a pointer to it. +// If not, return NULL. +const ParagraphModel *ParagraphTheory::Fits( + const GenericVector *rows, int start, int end) const { + for (int m = 0; m < models_->size(); m++) { + const ParagraphModel *model = (*models_)[m]; + if (model->justification() != JUSTIFICATION_CENTER && + RowsFitModel(rows, start, end, model)) + return model; + } + return NULL; +} + +void ParagraphTheory::NonCenteredModels(SetOfModels *models) { + for (int m = 0; m < models_->size(); m++) { + const ParagraphModel *model = (*models_)[m]; + if (model->justification() != JUSTIFICATION_CENTER) + models->push_back_new(model); + } +} + +int ParagraphTheory::IndexOf(const ParagraphModel *model) const { + for (int i = 0; i < models_->size(); i++) { + if ((*models_)[i] == model) + return i; + } + return -1; +} + +bool ValidFirstLine(const GenericVector *rows, + int row, const ParagraphModel *model) { + if (!StrongModel(model)) { + tprintf("ValidFirstLine() should only be called with strong models!\n"); + } + return StrongModel(model) && + model->ValidFirstLine( + (*rows)[row].lmargin_, (*rows)[row].lindent_, + (*rows)[row].rindent_, (*rows)[row].rmargin_); +} + +bool ValidBodyLine(const GenericVector *rows, + int row, const ParagraphModel *model) { + if (!StrongModel(model)) { + tprintf("ValidBodyLine() should only be called with strong models!\n"); + } + return StrongModel(model) && + model->ValidBodyLine( + (*rows)[row].lmargin_, (*rows)[row].lindent_, + (*rows)[row].rindent_, (*rows)[row].rmargin_); +} + +bool CrownCompatible(const GenericVector *rows, + int a, int b, const ParagraphModel *model) { + if (model != kCrownRight && model != kCrownLeft) { + tprintf("CrownCompatible() should only be called with crown models!\n"); + return false; + } + RowScratchRegisters &row_a = (*rows)[a]; + RowScratchRegisters &row_b = (*rows)[b]; + if (model == kCrownRight) { + return NearlyEqual(row_a.rindent_ + row_a.rmargin_, + row_b.rindent_ + row_b.rmargin_, + Epsilon(row_a.ri_->average_interword_space)); + } + return NearlyEqual(row_a.lindent_ + row_a.lmargin_, + row_b.lindent_ + row_b.lmargin_, + Epsilon(row_a.ri_->average_interword_space)); +} + + +// =============== Implementation of ParagraphModelSmearer ==================== + +ParagraphModelSmearer::ParagraphModelSmearer( + 
GenericVector<RowScratchRegisters> *rows,
+    int row_start, int row_end, ParagraphTheory *theory)
+        : theory_(theory), rows_(rows), row_start_(row_start),
+          row_end_(row_end) {
+  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
+    row_start_ = 0;
+    row_end_ = 0;
+    return;
+  }
+  SetOfModels no_models;
+  for (int row = row_start - 1; row <= row_end; row++) {
+    open_models_.push_back(no_models);
+  }
+}
+
+// see paragraphs_internal.h
+void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
+  SetOfModels no_models;
+  if (row_start < row_start_) row_start = row_start_;
+  if (row_end > row_end_) row_end = row_end_;
+
+  for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
+       row++) {
+    if ((*rows_)[row].ri_->num_words == 0) {
+      OpenModels(row + 1) = no_models;
+    } else {
+      SetOfModels &opened = OpenModels(row);
+      (*rows_)[row].StartHypotheses(&opened);
+
+      // Which models survive the transition from row to row + 1?
+      SetOfModels still_open;
+      for (int m = 0; m < opened.size(); m++) {
+        if (ValidFirstLine(rows_, row, opened[m]) ||
+            ValidBodyLine(rows_, row, opened[m])) {
+          // This is basic filtering; we check likely paragraph starty-ness
+          // down below in Smear() -- you know, whether the first word would
+          // have fit and such.
+          still_open.push_back_new(opened[m]);
+        }
+      }
+      OpenModels(row + 1) = still_open;
+    }
+  }
+}
+
+// see paragraphs_internal.h
+void ParagraphModelSmearer::Smear() {
+  CalculateOpenModels(row_start_, row_end_);
+
+  // For each row which we're unsure about (that is, it is LT_UNKNOWN or
+  // we have multiple LT_START hypotheses), see if there's a model that
+  // was recently used (an "open" model) which might model it well.
+  for (int i = row_start_; i < row_end_; i++) {
+    RowScratchRegisters &row = (*rows_)[i];
+    if (row.ri_->num_words == 0)
+      continue;
+
+    // Step One:
+    //   Figure out if there are "open" models which are left-aligned or
+    //   right-aligned.  This is important for determining whether the
+    //   "first" word in a row would fit at the "end" of the previous row.
+    bool left_align_open = false;
+    bool right_align_open = false;
+    for (int m = 0; m < OpenModels(i).size(); m++) {
+      switch (OpenModels(i)[m]->justification()) {
+        case JUSTIFICATION_LEFT: left_align_open = true; break;
+        case JUSTIFICATION_RIGHT: right_align_open = true; break;
+        default: left_align_open = right_align_open = true;
+      }
+    }
+    // Step Two:
+    //   Use that knowledge to figure out if this row is likely to
+    //   start a paragraph.
+    bool likely_start;
+    if (i == 0) {
+      likely_start = true;
+    } else {
+      if ((left_align_open && right_align_open) ||
+          (!left_align_open && !right_align_open)) {
+        likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
+                                            JUSTIFICATION_LEFT) ||
+                       LikelyParagraphStart((*rows_)[i - 1], row,
+                                            JUSTIFICATION_RIGHT);
+      } else if (left_align_open) {
+        likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
+                                            JUSTIFICATION_LEFT);
+      } else {
+        likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
+                                            JUSTIFICATION_RIGHT);
+      }
+    }
+
+    // Step Three:
+    //   If this text line seems like an obvious first line of an
+    //   open model, or an obvious continuation of an existing
+    //   modelled paragraph, mark it up.
+    if (likely_start) {
+      // Add Start Hypotheses for all Open models that fit.
+      for (int m = 0; m < OpenModels(i).size(); m++) {
+        if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
+          row.AddStartLine(OpenModels(i)[m]);
+        }
+      }
+    } else {
+      // Add relevant body line hypotheses.
+ SetOfModels last_line_models; + if (i > 0) { + (*rows_)[i - 1].StrongHypotheses(&last_line_models); + } else { + theory_->NonCenteredModels(&last_line_models); + } + for (int m = 0; m < last_line_models.size(); m++) { + const ParagraphModel *model = last_line_models[m]; + if (ValidBodyLine(rows_, i, model)) + row.AddBodyLine(model); + } + } + + // Step Four: + // If we're still quite unsure about this line, go through all + // models in our theory and see if this row could be the start + // of any of our models. + if (row.GetLineType() == LT_UNKNOWN || + (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) { + SetOfModels all_models; + theory_->NonCenteredModels(&all_models); + for (int m = 0; m < all_models.size(); m++) { + if (ValidFirstLine(rows_, i, all_models[m])) { + row.AddStartLine(all_models[m]); + } + } + } + // Step Five: + // Since we may have updated the hypotheses about this row, we need + // to recalculate the Open models for the rest of rows[i + 1, row_end) + if (row.GetLineType() != LT_UNKNOWN) { + CalculateOpenModels(i + 1, row_end_); + } + } +} + +// ================ Main Paragraph Detection Algorithm ======================= + +// Find out what ParagraphModels are actually used, and discard any +// that are not. +void DiscardUnusedModels(const GenericVector &rows, + ParagraphTheory *theory) { + SetOfModels used_models; + for (int i = 0; i < rows.size(); i++) { + rows[i].StrongHypotheses(&used_models); + } + theory->DiscardUnusedModels(used_models); +} + +// DowngradeWeakestToCrowns: +// Forget any flush-{left, right} models unless we see two or more +// of them in sequence. +// +// In pass 3, we start to classify even flush-left paragraphs (paragraphs +// where the first line and body indent are the same) as having proper Models. +// This is generally dangerous, since if you start imagining that flush-left +// is a typical paragraph model when it is not, it will lead you to chop normal +// indented paragraphs in the middle whenever a sentence happens to start on a +// new line (see "This" above). What to do? +// What we do is to take any paragraph which is flush left and is not +// preceded by another paragraph of the same model and convert it to a "Crown" +// paragraph. This is a weak pseudo-ParagraphModel which is a placeholder +// for later. It means that the paragraph is flush, but it would be desirable +// to mark it as the same model as following text if it fits. This downgrade +// FlushLeft -> CrownLeft -> Model of following paragraph. Means that we +// avoid making flush left Paragraph Models whenever we see a top-of-the-page +// half-of-a-paragraph. and instead we mark it the same as normal body text. +// +// Implementation: +// +// Comb backwards through the row scratch registers, and turn any +// sequences of body lines of equivalent type abutted against the beginning +// or a body or start line of a different type into a crown paragraph. +void DowngradeWeakestToCrowns(int debug_level, + ParagraphTheory *theory, + GenericVector *rows) { + int start; + for (int end = rows->size(); end > 0; end = start) { + // Search back for a body line of a unique type. + const ParagraphModel *model = NULL; + while (end > 0 && + (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) { + end--; + } + if (end == 0) break; + start = end - 1; + while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) { + start--; // walk back to the first line that is not the same body type. 
+ } + if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && + StrongModel(model) && + NearlyEqual(model->first_indent(), model->body_indent(), + model->tolerance())) { + start--; + } + start++; + // Now rows[start, end) is a sequence of unique body hypotheses of model. + if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) + continue; + if (!StrongModel(model)) { + while (start > 0 && + CrownCompatible(rows, start - 1, start, model)) + start--; + } + if (start == 0 || + (!StrongModel(model)) || + (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) { + // crownify rows[start, end) + const ParagraphModel *crown_model = model; + if (StrongModel(model)) { + if (model->justification() == JUSTIFICATION_LEFT) + crown_model = kCrownLeft; + else + crown_model = kCrownRight; + } + (*rows)[start].SetUnknown(); + (*rows)[start].AddStartLine(crown_model); + for (int row = start + 1; row < end; row++) { + (*rows)[row].SetUnknown(); + (*rows)[row].AddBodyLine(crown_model); + } + } + } + DiscardUnusedModels(*rows, theory); +} + + +// Clear all hypotheses about lines [start, end) and reset margins. +// +// The empty space between the left of a row and the block boundary (and +// similarly for the right) is split into two pieces: margin and indent. +// In initial processing, we assume the block is tight and the margin for +// all lines is set to zero. However, if our first pass does not yield +// models for everything, it may be due to an inset paragraph like a +// block-quote. In that case, we make a second pass over that unmarked +// section of the page and reset the "margin" portion of the empty space +// to the common amount of space at the ends of the lines under consid- +// eration. This would be equivalent to percentile set to 0. However, +// sometimes we have a single character sticking out in the right margin +// of a text block (like the 'r' in 'for' on line 3 above), and we can +// really just ignore it as an outlier. To express this, we allow the +// user to specify the percentile (0..100) of indent values to use as +// the common margin for each row in the run of rows[start, end). +void RecomputeMarginsAndClearHypotheses( + GenericVector *rows, int start, int end, + int percentile) { + if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) + return; + + int lmin, lmax, rmin, rmax; + lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_; + rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_; + for (int i = start; i < end; i++) { + RowScratchRegisters &sr = (*rows)[i]; + sr.SetUnknown(); + if (sr.ri_->num_words == 0) + continue; + UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax); + UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax); + } + STATS lefts(lmin, lmax + 1); + STATS rights(rmin, rmax + 1); + for (int i = start; i < end; i++) { + RowScratchRegisters &sr = (*rows)[i]; + if (sr.ri_->num_words == 0) + continue; + lefts.add(sr.lmargin_ + sr.lindent_, 1); + rights.add(sr.rmargin_ + sr.rindent_, 1); + } + int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0); + int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0); + for (int i = start; i < end; i++) { + RowScratchRegisters &sr = (*rows)[i]; + int ldelta = ignorable_left - sr.lmargin_; + sr.lmargin_ += ldelta; + sr.lindent_ -= ldelta; + int rdelta = ignorable_right - sr.rmargin_; + sr.rmargin_ += rdelta; + sr.rindent_ -= rdelta; + } +} + +// Return the minimum inter-word space in rows[row_start, row_end). 
+int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
+                   int row_start, int row_end) {
+  if (row_end < row_start + 1) return 1;
+  bool legit = false;
+  int natural_space = rows[row_start].ri_->average_interword_space;
+  for (int i = row_start; i < row_end; i++) {
+    if (rows[i].ri_->num_words > 1) {
+      if (!legit) {
+        natural_space = rows[i].ri_->average_interword_space;
+        legit = true;
+      } else {
+        if (rows[i].ri_->average_interword_space < natural_space)
+          natural_space = rows[i].ri_->average_interword_space;
+      }
+    }
+  }
+  return natural_space;
+}
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (knowing which way the text is aligned and read).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+                           const RowScratchRegisters &after,
+                           tesseract::ParagraphJustification justification) {
+  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
+    return true;
+
+  if (justification == JUSTIFICATION_UNKNOWN) {
+    tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
+  }
+  int available_space;
+  if (justification == JUSTIFICATION_CENTER) {
+    available_space = before.lindent_ + before.rindent_;
+  } else {
+    available_space = before.OffsideIndent(justification);
+  }
+  available_space -= before.ri_->average_interword_space;
+
+  if (before.ri_->ltr)
+    return after.ri_->lword_box.width() < available_space;
+  return after.ri_->rword_box.width() < available_space;
+}
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (not knowing which way the text goes) in a left
+// or right alignment.
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+                           const RowScratchRegisters &after) {
+  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
+    return true;
+
+  int available_space = before.lindent_;
+  if (before.rindent_ > available_space)
+    available_space = before.rindent_;
+  available_space -= before.ri_->average_interword_space;
+
+  if (before.ri_->ltr)
+    return after.ri_->lword_box.width() < available_space;
+  return after.ri_->rword_box.width() < available_space;
+}
+
+bool TextSupportsBreak(const RowScratchRegisters &before,
+                       const RowScratchRegisters &after) {
+  if (before.ri_->ltr) {
+    return before.ri_->rword_likely_ends_idea &&
+           after.ri_->lword_likely_starts_idea;
+  } else {
+    return before.ri_->lword_likely_ends_idea &&
+           after.ri_->rword_likely_starts_idea;
+  }
+}
+
+bool LikelyParagraphStart(const RowScratchRegisters &before,
+                          const RowScratchRegisters &after) {
+  return before.ri_->num_words == 0 ||
+      (FirstWordWouldHaveFit(before, after) &&
+       TextSupportsBreak(before, after));
+}
+
+bool LikelyParagraphStart(const RowScratchRegisters &before,
+                          const RowScratchRegisters &after,
+                          tesseract::ParagraphJustification j) {
+  return before.ri_->num_words == 0 ||
+      (FirstWordWouldHaveFit(before, after, j) &&
+       TextSupportsBreak(before, after));
+}
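+
// Illustrative-only numeric sketch (not part of the patch) of the fit test in
// FirstWordWouldHaveFit() above for left-justified LTR text: the next line's
// first word fits if it is narrower than the previous line's offside (right)
// indent minus one average inter-word space. The struct and all numbers below
// are made up for the example.
#include <cstdio>

struct LineSketch {
  int rindent;           // empty space at the line's right edge, px
  int interword_space;   // average inter-word space on the line, px
  int first_word_width;  // width of the line's first word, px
};

static bool FirstWordFits(const LineSketch &before, const LineSketch &after) {
  int available = before.rindent - before.interword_space;
  return after.first_word_width < available;
}

int main() {
  LineSketch before = {120, 12, 0};  // line ends 120px short of the edge
  LineSketch after = {0, 12, 90};    // next line starts with a 90px word
  // 90 < 120 - 12, so the word would have fit: evidence of a paragraph break.
  printf("fits: %s\n", FirstWordFits(before, after) ? "yes" : "no");
  return 0;
}

+// Examine rows[start, end) and try to determine what sort of ParagraphModel
+// would fit them as a single paragraph.
+// If we can't produce a unique model, justification_ = JUSTIFICATION_UNKNOWN.
+// If the rows given could be a consistent start to a paragraph, set
+// *consistent true.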
+ParagraphModel InternalParagraphModelByOutline( + const GenericVector *rows, + int start, int end, int tolerance, bool *consistent) { + int ltr_line_count = 0; + for (int i = start; i < end; i++) { + ltr_line_count += static_cast((*rows)[i].ri_->ltr); + } + bool ltr = (ltr_line_count >= (end - start) / 2); + + *consistent = true; + if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) + return ParagraphModel(); + + // Ensure the caller only passed us a region with a common rmargin and + // lmargin. + int lmargin = (*rows)[start].lmargin_; + int rmargin = (*rows)[start].rmargin_; + int lmin, lmax, rmin, rmax, cmin, cmax; + lmin = lmax = (*rows)[start + 1].lindent_; + rmin = rmax = (*rows)[start + 1].rindent_; + cmin = cmax = 0; + for (int i = start + 1; i < end; i++) { + if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) { + tprintf("Margins don't match! Software error.\n"); + *consistent = false; + return ParagraphModel(); + } + UpdateRange((*rows)[i].lindent_, &lmin, &lmax); + UpdateRange((*rows)[i].rindent_, &rmin, &rmax); + UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax); + } + int ldiff = lmax - lmin; + int rdiff = rmax - rmin; + int cdiff = cmax - cmin; + if (rdiff > tolerance && ldiff > tolerance) { + if (cdiff < tolerance * 2) { + if (end - start < 3) + return ParagraphModel(); + return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance); + } + *consistent = false; + return ParagraphModel(); + } + if (end - start < 3) // Don't return a model for two line paras. + return ParagraphModel(); + + // These booleans keep us from saying something is aligned left when the body + // left variance is too large. + bool body_admits_left_alignment = ldiff < tolerance; + bool body_admits_right_alignment = rdiff < tolerance; + + ParagraphModel left_model = + ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_, + (lmin + lmax) / 2, tolerance); + ParagraphModel right_model = + ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_, + (rmin + rmax) / 2, tolerance); + + // These booleans keep us from having an indent on the "wrong side" for the + // first line. + bool text_admits_left_alignment = ltr || left_model.is_flush(); + bool text_admits_right_alignment = !ltr || right_model.is_flush(); + + // At least one of the edges is less than tolerance in variance. + // If the other is obviously ragged, it can't be the one aligned to. + // [Note the last line is included in this raggedness.] + if (tolerance < rdiff) { + if (body_admits_left_alignment && text_admits_left_alignment) + return left_model; + *consistent = false; + return ParagraphModel(); + } + if (tolerance < ldiff) { + if (body_admits_right_alignment && text_admits_right_alignment) + return right_model; + *consistent = false; + return ParagraphModel(); + } + + // At this point, we know the body text doesn't vary much on either side. + + // If the first line juts out oddly in one direction or the other, + // that likely indicates the side aligned to. + int first_left = (*rows)[start].lindent_; + int first_right = (*rows)[start].rindent_; + + if (ltr && body_admits_left_alignment && + (first_left < lmin || first_left > lmax)) + return left_model; + if (!ltr && body_admits_right_alignment && + (first_right < rmin || first_right > rmax)) + return right_model; + + *consistent = false; + return ParagraphModel(); +} + +// Examine rows[start, end) and try to determine what sort of ParagraphModel +// would fit them as a single paragraph. 
If nothing fits, +// justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug +// output if we're debugging. +ParagraphModel ParagraphModelByOutline( + int debug_level, + const GenericVector *rows, + int start, int end, int tolerance) { + bool unused_consistent; + ParagraphModel retval = InternalParagraphModelByOutline( + rows, start, end, tolerance, &unused_consistent); + if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) { + tprintf("Could not determine a model for this paragraph:\n"); + PrintRowRange(*rows, start, end); + } + return retval; +} + +// Do rows[start, end) form a single instance of the given paragraph model? +bool RowsFitModel(const GenericVector *rows, + int start, int end, const ParagraphModel *model) { + if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) + return false; + if (!ValidFirstLine(rows, start, model)) return false; + for (int i = start + 1 ; i < end; i++) { + if (!ValidBodyLine(rows, i, model)) return false; + } + return true; +} + +// Examine rows[row_start, row_end) as an independent section of text, +// and mark rows that are exceptionally clear as start-of-paragraph +// and paragraph-body lines. +// +// We presume that any lines surrounding rows[row_start, row_end) may +// have wildly different paragraph models, so we don't key any data off +// of those lines. +// +// We only take the very strongest signals, as we don't want to get +// confused and marking up centered text, poetry, or source code as +// clearly part of a typical paragraph. +void MarkStrongEvidence(GenericVector *rows, + int row_start, int row_end) { + // Record patently obvious body text. + for (int i = row_start + 1; i < row_end; i++) { + const RowScratchRegisters &prev = (*rows)[i - 1]; + RowScratchRegisters &curr = (*rows)[i]; + tesseract::ParagraphJustification typical_justification = + prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; + if (!curr.ri_->rword_likely_starts_idea && + !curr.ri_->lword_likely_starts_idea && + !FirstWordWouldHaveFit(prev, curr, typical_justification)) { + curr.SetBodyLine(); + } + } + + // Record patently obvious start paragraph lines. + // + // It's an extremely good signal of the start of a paragraph that + // the first word would have fit on the end of the previous line. + // However, applying just that signal would have us mark random + // start lines of lineated text (poetry and source code) and some + // centered headings as paragraph start lines. Therefore, we use + // a second qualification for a paragraph start: Not only should + // the first word of this line have fit on the previous line, + // but also, this line should go full to the right of the block, + // disallowing a subsequent word from having fit on this line. + + // First row: + { + RowScratchRegisters &curr = (*rows)[row_start]; + RowScratchRegisters &next = (*rows)[row_start + 1]; + tesseract::ParagraphJustification j = + curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; + if (curr.GetLineType() == LT_UNKNOWN && + !FirstWordWouldHaveFit(curr, next, j) && + (curr.ri_->lword_likely_starts_idea || + curr.ri_->rword_likely_starts_idea)) { + curr.SetStartLine(); + } + } + // Middle rows + for (int i = row_start + 1; i < row_end - 1; i++) { + RowScratchRegisters &prev = (*rows)[i - 1]; + RowScratchRegisters &curr = (*rows)[i]; + RowScratchRegisters &next = (*rows)[i + 1]; + tesseract::ParagraphJustification j = + curr.ri_->ltr ? 
JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; + if (curr.GetLineType() == LT_UNKNOWN && + !FirstWordWouldHaveFit(curr, next, j) && + LikelyParagraphStart(prev, curr, j)) { + curr.SetStartLine(); + } + } + // Last row + { // the short circuit at the top means we have at least two lines. + RowScratchRegisters &prev = (*rows)[row_end - 2]; + RowScratchRegisters &curr = (*rows)[row_end - 1]; + tesseract::ParagraphJustification j = + curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; + if (curr.GetLineType() == LT_UNKNOWN && + !FirstWordWouldHaveFit(curr, curr, j) && + LikelyParagraphStart(prev, curr, j)) { + curr.SetStartLine(); + } + } +} + +// Look for sequences of a start line followed by some body lines in +// rows[row_start, row_end) and create ParagraphModels for them if +// they seem coherent. +void ModelStrongEvidence(int debug_level, + GenericVector *rows, + int row_start, int row_end, + bool allow_flush_models, + ParagraphTheory *theory) { + if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) + return; + + int start = row_start; + while (start < row_end) { + while (start < row_end && (*rows)[start].GetLineType() != LT_START) + start++; + if (start >= row_end - 1) + break; + + int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space); + int end = start; + ParagraphModel last_model; + bool next_consistent; + do { + ++end; + // rows[row, end) was consistent. + // If rows[row, end + 1) is not consistent, + // just model rows[row, end) + if (end < row_end - 1) { + RowScratchRegisters &next = (*rows)[end]; + LineType lt = next.GetLineType(); + next_consistent = lt == LT_BODY || + (lt == LT_UNKNOWN && + !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end])); + } else { + next_consistent = false; + } + if (next_consistent) { + ParagraphModel next_model = InternalParagraphModelByOutline( + rows, start, end + 1, tolerance, &next_consistent); + if (((*rows)[start].ri_->ltr && + last_model.justification() == JUSTIFICATION_LEFT && + next_model.justification() != JUSTIFICATION_LEFT) || + (!(*rows)[start].ri_->ltr && + last_model.justification() == JUSTIFICATION_RIGHT && + next_model.justification() != JUSTIFICATION_RIGHT)) { + next_consistent = false; + } + last_model = next_model; + } else { + next_consistent = false; + } + } while (next_consistent && end < row_end); + // At this point, rows[start, end) looked like it could have been a + // single paragraph. If we can make a good ParagraphModel for it, + // do so and mark this sequence with that model. + if (end > start + 1) { + // emit a new paragraph if we have more than one line. + const ParagraphModel *model = NULL; + ParagraphModel new_model = ParagraphModelByOutline( + debug_level, rows, start, end, + Epsilon(InterwordSpace(*rows, start, end))); + if (new_model.justification() == JUSTIFICATION_UNKNOWN) { + // couldn't create a good model, oh well. + } else if (new_model.is_flush()) { + if (end == start + 2) { + // It's very likely we just got two paragraph starts in a row. + end = start + 1; + } else if (start == row_start) { + // Mark this as a Crown. 
+          if (new_model.justification() == JUSTIFICATION_LEFT) {
+            model = kCrownLeft;
+          } else {
+            model = kCrownRight;
+          }
+        } else if (allow_flush_models) {
+          model = theory->AddModel(new_model);
+        }
+      } else {
+        model = theory->AddModel(new_model);
+      }
+      if (model) {
+        (*rows)[start].AddStartLine(model);
+        for (int i = start + 1; i < end; i++) {
+          (*rows)[i].AddBodyLine(model);
+        }
+      }
+    }
+    start = end;
+  }
+}
+
+// We examine rows[row_start, row_end) and do the following:
+//   (1) Clear all existing hypotheses for the rows being considered.
+//   (2) Mark up any rows as exceptionally likely to be paragraph starts
+//       or paragraph body lines as such using both geometric and textual
+//       clues.
+//   (3) Form models for any sequence of start + continuation lines.
+//   (4) Smear the paragraph models to cover surrounding text.
+void StrongEvidenceClassify(int debug_level,
+                            GenericVector<RowScratchRegisters> *rows,
+                            int row_start, int row_end,
+                            ParagraphTheory *theory) {
+  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
+    return;
+
+  if (debug_level > 1) {
+    tprintf("#############################################\n");
+    tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
+    tprintf("#############################################\n");
+  }
+
+  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
+  MarkStrongEvidence(rows, row_start, row_end);
+
+  DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
+
+  // Create paragraph models.
+  ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
+
+  DebugDump(debug_level > 2, "Unsmeared hypotheses.", *theory, *rows);
+
+  // At this point, some rows are marked up as paragraphs with model numbers,
+  // and some rows are marked up as either LT_START or LT_BODY.  Now let's
+  // smear any good paragraph hypotheses forward and backward.
+  ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
+  smearer.Smear();
+}
+
+void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
+                               int row_start, int row_end,
+                               ParagraphTheory *theory) {
+  for (int i = row_start + 1; i < row_end - 1; i++) {
+    if ((*rows)[i - 1].ri_->has_leaders &&
+        (*rows)[i].ri_->has_leaders &&
+        (*rows)[i + 1].ri_->has_leaders) {
+      const ParagraphModel *model = theory->AddModel(
+          ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
+      (*rows)[i].AddStartLine(model);
+    }
+  }
+}
+
+// Collect sequences of unique hypotheses in row registers and create proper
+// paragraphs for them, referencing the paragraphs in row_owners.
+void ConvertHypothesizedModelRunsToParagraphs(
+    int debug_level,
+    const GenericVector<RowScratchRegisters> &rows,
+    GenericVector<PARA *> *row_owners,
+    ParagraphTheory *theory) {
+  int end = rows.size();
+  int start;
+  for (; end > 0; end = start) {
+    start = end - 1;
+    const ParagraphModel *model = NULL;
+    // TODO(eger): Be smarter about dealing with multiple hypotheses.
+    bool single_line_paragraph = false;
+    SetOfModels models;
+    rows[start].NonNullHypotheses(&models);
+    if (models.size() > 0) {
+      model = models[0];
+      if (rows[start].GetLineType(model) != LT_BODY)
+        single_line_paragraph = true;
+    }
+    if (model && !single_line_paragraph) {
+      // walk back looking for more body lines and then a start line.
+      while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
+        // do nothing
+      }
+      if (start < 0 || rows[start].GetLineType(model) != LT_START) {
+        model = NULL;
+      }
+    }
+    if (model == NULL) {
+      continue;
+    }
+    // rows[start, end) should be a paragraph.
+    PARA *p = new PARA();
+    if (model == kCrownLeft || model == kCrownRight) {
+      p->is_very_first_or_continuation = true;
+      // Crown paragraph.
+      //   If we can find an existing ParagraphModel that fits, use it,
+      //   else create a new one.
+      for (int row = end; row < rows.size(); row++) {
+        if ((*row_owners)[row] &&
+            (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
+             (start == 0 ||
+              ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
+          model = (*row_owners)[row]->model;
+          break;
+        }
+      }
+      if (model == kCrownLeft) {
+        // No subsequent model fits, so cons one up.
+        model = theory->AddModel(ParagraphModel(
+            JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
+            0, 0, Epsilon(rows[start].ri_->average_interword_space)));
+      } else if (model == kCrownRight) {
+        // No subsequent model fits, so cons one up.
+        model = theory->AddModel(ParagraphModel(
+            JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rindent_,
+            0, 0, Epsilon(rows[start].ri_->average_interword_space)));
+      }
+    }
+    rows[start].SetUnknown();
+    rows[start].AddStartLine(model);
+    for (int i = start + 1; i < end; i++) {
+      rows[i].SetUnknown();
+      rows[i].AddBodyLine(model);
+    }
+    p->model = model;
+    p->has_drop_cap = rows[start].ri_->has_drop_cap;
+    p->is_list_item =
+        model->justification() == JUSTIFICATION_RIGHT
+            ? rows[start].ri_->rword_indicates_list_item
+            : rows[start].ri_->lword_indicates_list_item;
+    for (int row = start; row < end; row++) {
+      if ((*row_owners)[row] != NULL) {
+        tprintf("Memory leak! ConvertHypothesizedModelRunsToParagraphs() "
+                "called more than once!\n");
+      }
+      (*row_owners)[row] = p;
+    }
+  }
+}
+
+struct Interval {
+  Interval() : begin(0), end(0) {}
+  Interval(int b, int e) : begin(b), end(e) {}
+
+  int begin;
+  int end;
+};
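+
// Illustrative-only standalone sketch (not part of the patch) of the run test
// in RowIsStranded() below, with a single hypothesized model encoded as a
// string of line types: 'S' start, 'B' body, 'U' unknown. A row survives only
// if its run of consecutive S/B rows is longer than 2, or is longer than 1
// and contains a body line. All names here are hypothetical.
#include <cstdio>

static bool IsStranded(const char *types, int row) {
  bool all_starts = types[row] == 'S';
  int run_length = 1;
  for (int i = row - 1; i >= 0 && types[i] != 'U'; i--) {
    run_length++;
    if (types[i] != 'S') all_starts = false;
  }
  for (int i = row + 1; types[i] != '\0' && types[i] != 'U'; i++) {
    run_length++;
    if (types[i] != 'S') all_starts = false;
  }
  return !(run_length > 2 || (!all_starts && run_length > 1));
}

int main() {
  printf("%d\n", IsStranded("USU", 1));   // 1: isolated start line is weak
  printf("%d\n", IsStranded("USSU", 1));  // 1: two starts, never continued
  printf("%d\n", IsStranded("USBU", 1));  // 0: start followed by body is fine
  return 0;
}

+// Return whether rows[row] appears to be stranded, meaning that the evidence
+// for this row is very weak due to context.  For instance, two lines of source
+// code may happen to be indented at the same tab vector as body text starts,
+// leading us to think they are two start-of-paragraph lines.  This is not
+// optimal.  However, we also don't want to mark a sequence of short dialog
+// as "weak," so our heuristic is:
+//   (1) If a line is surrounded by lines of unknown type, it's weak.
+//   (2) If two lines in a row are start lines for a given paragraph type, but
+//       after that the same paragraph type does not continue, they're weak.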
+bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int row) {
+  SetOfModels row_models;
+  rows[row].StrongHypotheses(&row_models);
+
+  for (int m = 0; m < row_models.size(); m++) {
+    bool all_starts = rows[row].GetLineType(row_models[m]) == LT_START;
+    int run_length = 1;
+    bool continues = true;
+    for (int i = row - 1; i >= 0 && continues; i--) {
+      switch (rows[i].GetLineType(row_models[m])) {
+        case LT_START: run_length++; break;
+        case LT_MULTIPLE:  // explicit fall-through
+        case LT_BODY: run_length++; all_starts = false; break;
+        case LT_UNKNOWN:  // explicit fall-through
+        default: continues = false;
+      }
+    }
+    continues = true;
+    for (int i = row + 1; i < rows.size() && continues; i++) {
+      switch (rows[i].GetLineType(row_models[m])) {
+        case LT_START: run_length++; break;
+        case LT_MULTIPLE:  // explicit fall-through
+        case LT_BODY: run_length++; all_starts = false; break;
+        case LT_UNKNOWN:  // explicit fall-through
+        default: continues = false;
+      }
+    }
+    if (run_length > 2 || (!all_starts && run_length > 1)) return false;
+  }
+  return true;
+}
+
+// Go through rows[row_start, row_end) and gather up sequences that need better
+// classification.
+//   + Sequences of non-empty rows without hypotheses.
+//   + Crown paragraphs not immediately followed by a strongly modeled line.
+//   + Single line paragraphs surrounded by text that doesn't match the
+//     model.
+void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
+                      GenericVector<Interval> *to_fix,
+                      int row_start, int row_end) {
+  to_fix->clear();
+  for (int i = row_start; i < row_end; i++) {
+    bool needs_fixing = false;
+
+    SetOfModels models;
+    SetOfModels models_w_crowns;
+    rows[i].StrongHypotheses(&models);
+    rows[i].NonNullHypotheses(&models_w_crowns);
+    if (models.empty() && models_w_crowns.size() > 0) {
+      // Crown paragraph.  Is it followed by a modeled line?
+      for (int end = i + 1; end < rows.size(); end++) {
+        SetOfModels end_models;
+        SetOfModels strong_end_models;
+        rows[end].NonNullHypotheses(&end_models);
+        rows[end].StrongHypotheses(&strong_end_models);
+        if (end_models.size() == 0) {
+          needs_fixing = true;
+          break;
+        } else if (strong_end_models.size() > 0) {
+          needs_fixing = false;
+          break;
+        }
+      }
+    } else if (models.empty() && rows[i].ri_->num_words > 0) {
+      // No models at all.
+      needs_fixing = true;
+    }
+
+    if (!needs_fixing && !models.empty()) {
+      needs_fixing = RowIsStranded(rows, i);
+    }
+
+    if (needs_fixing) {
+      if (!to_fix->empty() && to_fix->back().end == i - 1)
+        to_fix->back().end = i;
+      else
+        to_fix->push_back(Interval(i, i));
+    }
+  }
+  // Convert inclusive intervals to half-open intervals.
+  for (int i = 0; i < to_fix->size(); i++) {
+    (*to_fix)[i].end = (*to_fix)[i].end + 1;
+  }
+}
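The interval bookkeeping at the end of LeftoverSegments() is the classic run-merging pattern; a self-contained sketch: flagged row indices accumulate into inclusive [begin, end] runs, which are then widened into half-open [begin, end) segments.

// Illustrative only; not part of this change.
#include <cstdio>
#include <vector>

struct Interval {
  Interval(int b, int e) : begin(b), end(e) {}
  int begin;
  int end;
};

int main() {
  bool needs_fixing[5] = {false, true, true, false, true};
  std::vector<Interval> to_fix;
  for (int i = 0; i < 5; i++) {
    if (!needs_fixing[i]) continue;
    if (!to_fix.empty() && to_fix.back().end == i - 1)
      to_fix.back().end = i;             // extend the current run
    else
      to_fix.push_back(Interval(i, i));  // start a new run
  }
  for (size_t j = 0; j < to_fix.size(); j++)
    to_fix[j].end++;                     // inclusive -> half-open
  for (size_t j = 0; j < to_fix.size(); j++)
    std::printf("[%d, %d)\n", to_fix[j].begin, to_fix[j].end);  // [1,3) [4,5)
  return 0;
}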
+
+// Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
+// normalize each row_owner to point to an actual PARA, and output the
+// paragraphs in order onto paragraphs.
+void CanonicalizeDetectionResults(
+    GenericVector<PARA *> *row_owners,
+    PARA_LIST *paragraphs) {
+  GenericVector<PARA *> &rows = *row_owners;
+  paragraphs->clear();
+  PARA_IT out(paragraphs);
+  PARA *formerly_null = NULL;
+  for (int i = 0; i < rows.size(); i++) {
+    if (rows[i] == NULL) {
+      if (i == 0 || rows[i - 1] != formerly_null) {
+        rows[i] = formerly_null = new PARA();
+      } else {
+        rows[i] = formerly_null;
+        continue;
+      }
+    } else if (i > 0 && rows[i - 1] == rows[i]) {
+      continue;
+    }
+    out.add_after_then_move(rows[i]);
+  }
+}
+
+// Main entry point for Paragraph Detection Algorithm.
+//
+// Given a set of equally spaced textlines (described by row_infos),
+// split them into paragraphs.
+//
+// Output:
+//   row_owners - one pointer for each row, to the paragraph it belongs to.
+//   paragraphs - this is the actual list of PARA objects.
+//   models - the list of paragraph models referenced by the PARA objects.
+//            caller is responsible for deleting the models.
+void DetectParagraphs(int debug_level,
+                      GenericVector<RowInfo> *row_infos,
+                      GenericVector<PARA *> *row_owners,
+                      PARA_LIST *paragraphs,
+                      GenericVector<ParagraphModel *> *models) {
+  GenericVector<RowScratchRegisters> rows;
+  ParagraphTheory theory(models);
+
+  // Initialize row_owners to be a bunch of NULL pointers.
+  row_owners->init_to_size(row_infos->size(), NULL);
+
+  // Set up row scratch registers for the main algorithm.
+  rows.init_to_size(row_infos->size(), RowScratchRegisters());
+  for (int i = 0; i < row_infos->size(); i++) {
+    rows[i].Init((*row_infos)[i]);
+  }
+
+  // Pass 1:
+  //   Detect sequences of lines that all contain leader dots (.....)
+  //   These are likely Tables of Contents.  If there are three text lines in
+  //   a row with leader dots, it's pretty safe to say the middle one should
+  //   be a paragraph of its own.
+  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
+
+  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
+
+  GenericVector<Interval> leftovers;
+  LeftoverSegments(rows, &leftovers, 0, rows.size());
+  for (int i = 0; i < leftovers.size(); i++) {
+    // Pass 2a:
+    //   Find any strongly evidenced start-of-paragraph lines.  If they're
+    //   followed by two lines that look like body lines, make a paragraph
+    //   model for that and see if that model applies throughout the text
+    //   (that is, "smear" it).
+    StrongEvidenceClassify(debug_level, &rows,
+                           leftovers[i].begin, leftovers[i].end, &theory);
+
+    // Pass 2b:
+    //   If we had any luck in pass 2a, we got part of the page and didn't
+    //   know how to classify a few runs of rows.  Take the segments that
+    //   didn't find a model and reprocess them individually.
+    GenericVector<Interval> leftovers2;
+    LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
+    bool pass2a_was_useful = leftovers2.size() > 1 ||
+        (leftovers2.size() == 1 &&
+         (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
+    if (pass2a_was_useful) {
+      for (int j = 0; j < leftovers2.size(); j++) {
+        StrongEvidenceClassify(debug_level, &rows,
+                               leftovers2[j].begin, leftovers2[j].end,
+                               &theory);
+      }
+    }
+  }
+
+  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
+
+  // Pass 3:
+  //   These are the dregs for which we didn't have enough strong textual
+  //   and geometric clues to form matching models for.  Let's see if
+  //   the geometric clues are simple enough that we could just use those.
+  LeftoverSegments(rows, &leftovers, 0, rows.size());
+  for (int i = 0; i < leftovers.size(); i++) {
+    GeometricClassify(debug_level, &rows,
+                      leftovers[i].begin, leftovers[i].end, &theory);
+  }
+  // Undo any flush models for which there's little evidence.
+  DowngradeWeakestToCrowns(debug_level, &theory, &rows);
+
+  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
+
+  // Pass 4:
+  //   Take everything that's still not marked up well and clear all markings.
+  LeftoverSegments(rows, &leftovers, 0, rows.size());
+  for (int i = 0; i < leftovers.size(); i++) {
+    for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
+      rows[j].SetUnknown();
+    }
+  }
+
+  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
+
+  // Convert all of the unique hypothesis runs to PARAs.
+  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
+                                           &theory);
+
+  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
+
+  // Finally, clean up any dangling NULL row paragraph parents.
+  CanonicalizeDetectionResults(row_owners, paragraphs);
+}
+
+// ============ Code interfacing with the rest of Tesseract ==================
+
+// Given a Tesseract Iterator pointing to a text line, fill in the paragraph
+// detector RowInfo with all relevant information from the row.
+void InitializeRowInfo(const MutableIterator &it, RowInfo *info) {
+  if (it.PageResIt()->row() != NULL) {
+    ROW *row = it.PageResIt()->row()->row;
+    info->pix_ldistance = row->lmargin();
+    info->pix_rdistance = row->rmargin();
+    info->average_interword_space =
+        row->space() > 0 ? row->space() : MAX(row->x_height(), 1);
+    info->pix_xheight = row->x_height();
+    info->has_leaders = false;
+    info->has_drop_cap = row->has_drop_cap();
+    info->ltr = true;  // set below depending on word scripts
+  } else {
+    info->pix_ldistance = info->pix_rdistance = 0;
+    info->average_interword_space = 1;
+    info->pix_xheight = 1.0;
+    info->has_leaders = false;
+    info->has_drop_cap = false;
+    info->ltr = true;
+  }
+
+  info->text = "";
+  char *text = it.GetUTF8Text(RIL_TEXTLINE);
+  int num_nonws_chars = strlen(text);
+  // Strip trailing whitespace.
+  while (num_nonws_chars > 0 &&
+         isspace(static_cast<unsigned char>(text[num_nonws_chars - 1])))
+    num_nonws_chars--;
+  if (num_nonws_chars > 0) {
+    int lspaces = info->pix_ldistance / info->average_interword_space;
+    for (int i = 0; i < lspaces; i++)
+      info->text += ' ';
+    for (int i = 0; i < num_nonws_chars; i++)
+      info->text += text[i];
+  }
+  delete []text;
+
+  info->num_words = 0;
+  info->lword_indicates_list_item = false;
+  info->lword_likely_starts_idea = false;
+  info->lword_likely_ends_idea = false;
+  info->rword_indicates_list_item = false;
+  info->rword_likely_starts_idea = false;
+  info->rword_likely_ends_idea = false;
+
+  if (info->text.size() == 0) {
+    return;
+  }
+
+  int ltr = 0;
+  int rtl = 0;
+
+  PAGE_RES_IT page_res_it = *it.PageResIt();
+  GenericVector<WERD_RES *> werds;
+  WERD_RES *word_res = page_res_it.restart_row();
+  ROW_RES *this_row = page_res_it.row();
+  int num_leaders = 0;
+  do {
+    if (word_res && word_res->best_choice->unichar_string().length() > 0) {
+      werds.push_back(word_res);
+      ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
+      rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
+      if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
+    }
+    word_res = page_res_it.forward();
+  } while (page_res_it.row() == this_row);
+
+  info->has_leaders = num_leaders > 3;
+  info->num_words = werds.size();
+  if (werds.size() > 0) {
+    WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
+    info->lword_text = lword->best_choice->unichar_string().string();
+    info->rword_text = rword->best_choice->unichar_string().string();
+    info->lword_box = lword->word->bounding_box();
+    info->rword_box = rword->word->bounding_box();
+    LeftWordAttributes(lword->uch_set, lword->best_choice,
+                       info->lword_text,
+                       &info->lword_indicates_list_item,
+                       &info->lword_likely_starts_idea,
+                       &info->lword_likely_ends_idea);
+    RightWordAttributes(rword->uch_set, rword->best_choice,
+                        info->rword_text,
+                        &info->rword_indicates_list_item,
+                        &info->rword_likely_starts_idea,
+                        &info->rword_likely_ends_idea);
+  }
+  info->ltr = ltr >= rtl;
+}
+
+// This is called after rows have been identified and words are recognized.
+// Much of this could be implemented before word recognition, but text helps
+// to identify bulleted lists and gives good signals for sentence boundaries.
+void DetectParagraphs(int debug_level,
+                      const MutableIterator *block_start,
+                      GenericVector<ParagraphModel *> *models) {
+  // Clear out any preconceived notions.
+  if (block_start->Empty(RIL_TEXTLINE)) {
+    return;
+  }
+  BLOCK *block = block_start->PageResIt()->block()->block;
+  block->para_list()->clear();
+  bool is_image_block = block->poly_block() && !block->poly_block()->IsText();
+
+  // Convert the Tesseract structures to RowInfos
+  // for the paragraph detection algorithm.
+  MutableIterator row(*block_start);
+  if (row.Empty(RIL_TEXTLINE))
+    return;  // end of input already.
+
+  GenericVector<RowInfo> row_infos;
+  do {
+    if (!row.PageResIt()->row())
+      continue;  // empty row.
+    row.PageResIt()->row()->row->set_para(NULL);
+    row_infos.push_back(RowInfo());
+    RowInfo &ri = row_infos.back();
+    InitializeRowInfo(row, &ri);
+  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
+           row.Next(RIL_TEXTLINE));
+
+  // Run the paragraph detection algorithm.
+  GenericVector<PARA *> row_owners;
+  if (!is_image_block) {
+    DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
+                     models);
+  } else {
+    row_owners.init_to_size(row_infos.size(), NULL);
+    CanonicalizeDetectionResults(&row_owners, block->para_list());
+  }
+
+  // Now stitch in the row_owners into the rows.
+  row = *block_start;
+  for (int i = 0; i < row_owners.size(); i++) {
+    while (!row.PageResIt()->row())
+      row.Next(RIL_TEXTLINE);
+    row.PageResIt()->row()->row->set_para(row_owners[i]);
+    row.Next(RIL_TEXTLINE);
+  }
+}
+
+}  // namespace
diff --git a/ccmain/paragraphs.h b/ccmain/paragraphs.h
new file mode 100644
index 0000000000..9c11d95c1e
--- /dev/null
+++ b/ccmain/paragraphs.h
@@ -0,0 +1,107 @@
+/**********************************************************************
+ * File: paragraphs.h
+ * Description: Paragraph Detection data structures.
+ * Author: David Eger
+ * Created: 25 February 2011
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
+#define TESSERACT_CCMAIN_PARAGRAPHS_H_
+
+#include "rect.h"
+#include "ocrpara.h"
+#include "genericvector.h"
+#include "strngs.h"
+
+
+class WERD;
+class UNICHARSET;
+
+namespace tesseract {
+
+class MutableIterator;
+
+// This structure captures all information needed about a text line for the
+// purposes of paragraph detection.  It is meant to be exceedingly light-weight
+// so that we can easily test paragraph detection independent of the rest of
+// Tesseract.
+class RowInfo {
+ public:
+  // Constant data derived from Tesseract output.
+  STRING text;        // the full UTF-8 text of the line.
+  bool ltr;           // whether the majority of the text is left-to-right
+                      // TODO(eger) make this more fine-grained.
+
+  bool has_leaders;   // does the line contain leader dots (.....)?
+  bool has_drop_cap;  // does the line have a drop cap?
+  int pix_ldistance;  // distance to the left pblock boundary in pixels
+  int pix_rdistance;  // distance to the right pblock boundary in pixels
+  float pix_xheight;  // guessed xheight for the line
+  int average_interword_space;  // average space between words in pixels.
+
+  int num_words;
+  TBOX lword_box;     // in normalized (horiz text rows) space
+  TBOX rword_box;     // in normalized (horiz text rows) space
+
+  STRING lword_text;  // the UTF-8 text of the leftmost werd
+  STRING rword_text;  // the UTF-8 text of the rightmost werd
+
+  // The text of a paragraph typically starts with the start of an idea and
+  // ends with the end of an idea.  Here we define paragraph as something that
+  // may have a first line indent and a body indent which may be different.
+  // Typical words that start an idea are:
+  //   1. Words in western scripts that start with
+  //      a capital letter, for example "The"
+  //   2. Bulleted or numbered list items, for
+  //      example "2."
+  // Typical words which end an idea are words ending in punctuation marks. In
+  // this vocabulary, each list item is represented as a paragraph.
+  bool lword_indicates_list_item;
+  bool lword_likely_starts_idea;
+  bool lword_likely_ends_idea;
+
+  bool rword_indicates_list_item;
+  bool rword_likely_starts_idea;
+  bool rword_likely_ends_idea;
+};
+
+// Main entry point for Paragraph Detection Algorithm.
+//
+// Given a set of equally spaced textlines (described by row_infos),
+// split them into paragraphs.  See http://goto/paragraphstalk
+//
+// Output:
+//   row_owners - one pointer for each row, to the paragraph it belongs to.
+//   paragraphs - this is the actual list of PARA objects.
+//   models - the list of paragraph models referenced by the PARA objects.
+//            caller is responsible for deleting the models.
+void DetectParagraphs(int debug_level,
+                      GenericVector<RowInfo> *row_infos,
+                      GenericVector<PARA *> *row_owners,
+                      PARA_LIST *paragraphs,
+                      GenericVector<ParagraphModel *> *models);
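Because RowInfo carries nothing from the recognizer beyond STRING and TBOX, this entry point can be driven from synthetic rows. A hypothetical harness (field values are made up; a real caller must initialize every RowInfo member, including the lword_*/rword_* attributes omitted here for brevity):

// Hypothetical usage sketch, assuming tesseract's headers and namespace.
GenericVector<RowInfo> rows;
for (int i = 0; i < 3; i++) {
  RowInfo ri;
  ri.text = (i == 0) ? "  An indented first line." : "A body line.";
  ri.ltr = true;
  ri.has_leaders = false;
  ri.has_drop_cap = false;
  ri.pix_ldistance = (i == 0) ? 40 : 0;  // first-line indent in pixels
  ri.pix_rdistance = 10;
  ri.pix_xheight = 20.0f;
  ri.average_interword_space = 10;
  ri.num_words = 4;
  rows.push_back(ri);
}
GenericVector<PARA *> row_owners;
PARA_LIST paragraphs;
GenericVector<ParagraphModel *> models;
DetectParagraphs(0 /* debug_level */, &rows, &row_owners, &paragraphs, &models);
// Expectation: all three row_owners entries point at the same PARA.
models.delete_data_pointers();  // caller owns the models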
+
+// Given a MutableIterator to the start of a block, run DetectParagraphs on
+// that block and commit the results to the underlying ROW and BLOCK structs,
+// saving the ParagraphModels in models.  Caller owns the models.
+// We use unicharset during the function to answer questions such as "is the
+// first letter of this word upper case?"
+void DetectParagraphs(int debug_level,
+                      const MutableIterator *block_start,
+                      GenericVector<ParagraphModel *> *models);
+
+}  // namespace
+
+#endif  // TESSERACT_CCMAIN_PARAGRAPHS_H_
diff --git a/ccmain/paragraphs_internal.h b/ccmain/paragraphs_internal.h
new file mode 100644
index 0000000000..c622290f9a
--- /dev/null
+++ b/ccmain/paragraphs_internal.h
@@ -0,0 +1,308 @@
+/**********************************************************************
+ * File: paragraphs_internal.h
+ * Description: Paragraph Detection internal data structures.
+ * Author: David Eger
+ * Created: 11 March 2011
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
+#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
+
+#include "paragraphs.h"
+#include "strngs.h"
+
+// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
+// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
+
+class WERD_CHOICE;
+
+namespace tesseract {
+
+// Return whether the given word is likely to be a list item start word.
+bool AsciiLikelyListItem(const STRING &word);
+
+// Return the first Unicode Codepoint from werd[pos].
+int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
+
+// Set right word attributes given either a unicharset and werd or a utf8
+// string.
+void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
+                         const STRING &utf8,
+                         bool *is_list, bool *starts_idea, bool *ends_idea);
+
+// Set left word attributes given either a unicharset and werd or a utf8
+// string.
+void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
+                        const STRING &utf8,
+                        bool *is_list, bool *starts_idea, bool *ends_idea);
+
+enum LineType {
+  LT_START = 'S',     // First line of a paragraph.
+  LT_BODY = 'C',      // Continuation line of a paragraph.
+  LT_UNKNOWN = 'U',   // No clues.
+  LT_MULTIPLE = 'M',  // Matches for both LT_START and LT_BODY.
+};
+
+// The first paragraph in a page of body text is often un-indented.
+// This is a typographic convention which is common to indicate either that:
+//   (1) The paragraph is the continuation of a previous paragraph, or
+//   (2) The paragraph is the first paragraph in a chapter.
+//
+// I refer to such paragraphs as "crown"s, and the output of the paragraph
+// detection algorithm attempts to give them the same paragraph model as
+// the rest of the body text.
+//
+// Nonetheless, while building hypotheses, it is useful to mark the lines
+// of crown paragraphs temporarily as crowns, either aligned left or right.
+extern const ParagraphModel *kCrownLeft;
+extern const ParagraphModel *kCrownRight;
+
+inline bool StrongModel(const ParagraphModel *model) {
+  return model != NULL && model != kCrownLeft && model != kCrownRight;
+}
+
+struct LineHypothesis {
+  LineHypothesis() : ty(LT_UNKNOWN), model(NULL) {}
+  LineHypothesis(LineType line_type, const ParagraphModel *m)
+      : ty(line_type), model(m) {}
+  LineHypothesis(const LineHypothesis &other)
+      : ty(other.ty), model(other.model) {}
+
+  bool operator==(const LineHypothesis &other) const {
+    return ty == other.ty && model == other.model;
+  }
+
+  LineType ty;
+  const ParagraphModel *model;
+};
+
+class ParagraphTheory;  // Forward Declaration
+
+typedef GenericVectorEqEq<const ParagraphModel *> SetOfModels;
+
+// Row Scratch Registers are data generated by the paragraph detection
+// algorithm based on a RowInfo input.
+class RowScratchRegisters {
+ public:
+  // We presume row will outlive us.
+  void Init(const RowInfo &row);
+
+  LineType GetLineType() const;
+
+  LineType GetLineType(const ParagraphModel *model) const;
+
+  // Mark this as a start line type, sans model.  This is useful for the
+  // initial marking of probable body lines or paragraph start lines.
+  void SetStartLine();
+
+  // Mark this as a body line type, sans model.  This is useful for the
+  // initial marking of probable body lines or paragraph start lines.
+  void SetBodyLine();
+
+  // Record that this row fits as a paragraph start line in the given model.
+  void AddStartLine(const ParagraphModel *model);
+  // Record that this row fits as a paragraph body line in the given model.
+  void AddBodyLine(const ParagraphModel *model);
+
+  // Clear all hypotheses about this line.
+  void SetUnknown() { hypotheses_.truncate(0); }
+
+  // Append all hypotheses of strong models that match this row as a start.
+  void StartHypotheses(SetOfModels *models) const;
+
+  // Append all hypotheses of strong models matching this row.
+  void StrongHypotheses(SetOfModels *models) const;
+
+  // Append all hypotheses for this row.
+  void NonNullHypotheses(SetOfModels *models) const;
+
+  // Discard any hypotheses whose model is not in the given list.
+  void DiscardNonMatchingHypotheses(const SetOfModels &models);
+
+  // If we have only one hypothesis and that is that this line is a paragraph
+  // start line of a certain model, return that model.  Else return NULL.
+  const ParagraphModel *UniqueStartHypothesis() const;
+
+  // If we have only one hypothesis and that is that this line is a paragraph
+  // body line of a certain model, return that model.  Else return NULL.
+  const ParagraphModel *UniqueBodyHypothesis() const;
+
+  // Return the indentation for the side opposite of the aligned side.
+  int OffsideIndent(tesseract::ParagraphJustification just) const {
+    switch (just) {
+      case tesseract::JUSTIFICATION_RIGHT: return lindent_;
+      case tesseract::JUSTIFICATION_LEFT: return rindent_;
+      default: return lindent_ > rindent_ ? lindent_ : rindent_;
+    }
+  }
+
+  // Return the indentation for the side the text is aligned to.
+  int AlignsideIndent(tesseract::ParagraphJustification just) const {
+    switch (just) {
+      case tesseract::JUSTIFICATION_RIGHT: return rindent_;
+      case tesseract::JUSTIFICATION_LEFT: return lindent_;
+      default: return lindent_ > rindent_ ? lindent_ : rindent_;
+    }
+  }
+
+  // Append header fields to a vector of row headings.
+  static void AppendDebugHeaderFields(GenericVector<STRING> *header);
+
+  // Append data for this row to a vector of debug strings.
+  void AppendDebugInfo(const ParagraphTheory &theory,
+                       GenericVector<STRING> *dbg) const;
+
+  const RowInfo *ri_;
+
+  // These four constants form a horizontal box model for the white space
+  // on the edges of each line.  At each point in the algorithm, the following
+  // shall hold:
+  //   ri_->pix_ldistance = lmargin_ + lindent_
+  //   ri_->pix_rdistance = rindent_ + rmargin_
+  // (A sketch of how the split is chosen follows the declarations below.)
+  int lmargin_;
+  int lindent_;
+  int rindent_;
+  int rmargin_;
+
+ private:
+  // Hypotheses of either LT_START or LT_BODY
+  GenericVectorEqEq<LineHypothesis> hypotheses_;
+};
+
+// A collection of convenience functions for wrapping the set of
+// Paragraph Models we believe correctly model the paragraphs in the image.
+class ParagraphTheory {
+ public:
+  // We presume models will outlive us, and that models will take ownership
+  // of any ParagraphModel *'s we add.
+  explicit ParagraphTheory(GenericVector<ParagraphModel *> *models)
+      : models_(models) {}
+  GenericVector<ParagraphModel *> &models() { return *models_; }
+  const GenericVector<ParagraphModel *> &models() const { return *models_; }
+
+  // Return an existing model if one that is Comparable() can be found.
+  // Else, allocate a new copy of model to save and return a pointer to it.
+  const ParagraphModel *AddModel(const ParagraphModel &model);
+
+  // Discard any models we've made that are not in the list of used models.
+  void DiscardUnusedModels(const SetOfModels &used_models);
+
+  // Return the set of all non-centered models.
+  void NonCenteredModels(SetOfModels *models);
+
+  // If any of the non-centered paragraph models we know about fit
+  // rows[start, end), return it.  Else NULL.
+  const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows,
+                             int start, int end) const;
+
+  int IndexOf(const ParagraphModel *model) const;
+
+ private:
+  GenericVector<ParagraphModel *> *models_;
+  GenericVectorEqEq<ParagraphModel *> models_we_added_;
+};
+
+bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
+                    int row, const ParagraphModel *model);
+bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
+                   int row, const ParagraphModel *model);
+bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
+                     int a, int b, const ParagraphModel *model);
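The margin/indent split referenced above deserves a worked example. RecomputeMarginsAndClearHypotheses() takes a low percentile of the raw edge distances over a run of rows as the shared margin, and the per-row remainder becomes the indent; the sketch below ignores the STATS plumbing and outlier handling of the real routine.

// Illustrative only; a simplified version of the left-side split.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Raw left-edge distances (pixels) for a five-row run; row 0 is an
  // indented first line. Using a percentile rather than the minimum keeps
  // one noisy row from dragging the shared margin around.
  int ldist[5] = {40, 2, 0, 1, 3};
  std::vector<int> sorted(ldist, ldist + 5);
  std::sort(sorted.begin(), sorted.end());
  int lmargin = sorted[sorted.size() * 10 / 100];  // 10th percentile -> 0
  for (int i = 0; i < 5; i++) {
    int lindent = ldist[i] - lmargin;  // invariant: ldist = lmargin + lindent
    std::printf("row %d: lmargin=%d lindent=%d\n", i, lmargin, lindent);
  }
  return 0;
}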
+
+// A class for smearing Paragraph Model hypotheses to surrounding rows.
+// The idea here is that StrongEvidenceClassify first marks only exceedingly
+// obvious start and body rows and constructs models of them.  Thereafter,
+// we may have left over unmarked lines (mostly end-of-paragraph lines) which
+// were too short to have much confidence about, but which fit the models we've
+// constructed perfectly and which we ought to mark.  This class is used to
+// "smear" our models over the text.
+class ParagraphModelSmearer {
+ public:
+  ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows,
+                        int row_start, int row_end,
+                        ParagraphTheory *theory);
+
+  // Smear forward paragraph models from existing row markings to subsequent
+  // text lines if they fit, and mark any thereafter still unmodeled rows
+  // with any model in the theory that fits them.
+  void Smear();
+
+ private:
+  // Record in open_models_ for rows [start_row, end_row) the list of models
+  // currently open at each row.
+  // A model is still open in a row if some previous row has said model as a
+  // start hypothesis, and all rows since (including this row) would fit as
+  // either a body or start line in that model.
+  void CalculateOpenModels(int row_start, int row_end);
+
+  SetOfModels &OpenModels(int row) {
+    return open_models_[row - row_start_ + 1];
+  }
+
+  ParagraphTheory *theory_;
+  GenericVector<RowScratchRegisters> *rows_;
+  int row_start_;
+  int row_end_;
+
+  // open_models_ corresponds to rows[start_row_ - 1, end_row_]
+  //
+  // open_models_:  Contains models which there was an active (open) paragraph
+  //                as of the previous line and for which the left and right
+  //                indents admit the possibility that this text line continues
+  //                to fit the same model.
+  // TODO(eger): Think about whether we can get rid of "Open" models and just
+  //             use the current hypotheses on RowScratchRegisters.
+  GenericVector<SetOfModels> open_models_;
+};
+
+// Clear all hypotheses about lines [start, end) and reset the margins to the
+// percentile (0..100) value of the left and right row edges for this run of
+// rows.
+void RecomputeMarginsAndClearHypotheses(
+    GenericVector<RowScratchRegisters> *rows, int start, int end,
+    int percentile);
+
+// Return the minimum inter-word space in rows[row_start, row_end).
+int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
+                   int row_start, int row_end);
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (knowing which way the text is aligned and read).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+                           const RowScratchRegisters &after,
+                           tesseract::ParagraphJustification justification);
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (not knowing the text alignment).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+                           const RowScratchRegisters &after);
+
+// Do rows[start, end) form a single instance of the given paragraph model?
+bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
+                  int start, int end, const ParagraphModel *model);
+
+// Do the text and geometry of two rows support a paragraph break between them?
+bool LikelyParagraphStart(const RowScratchRegisters &before,
+                          const RowScratchRegisters &after,
+                          tesseract::ParagraphJustification j);
+
+// Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
+// normalize each row_owner to point to an actual PARA, and output the
+// paragraphs in order onto paragraphs.
+void CanonicalizeDetectionResults(
+    GenericVector<PARA *> *row_owners,
+    PARA_LIST *paragraphs);
+
+}  // namespace
+#endif  // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
diff --git a/ccmain/pgedit.cpp b/ccmain/pgedit.cpp
index a31c57f7f6..8209366133 100755
--- a/ccmain/pgedit.cpp
+++ b/ccmain/pgedit.cpp
@@ -31,20 +31,17 @@
 #include
 #include
-#include "tordmain.h"
-#include "statistc.h"
-#include "svshowim.h"
-#include "paramsd.h"
-#include "string.h"
-
-#include "scrollview.h"
-#include "svmnode.h"
-
-#include "control.h"
+#include "blread.h"
+#include "control.h"
+#include "svshowim.h"
+#include "paramsd.h"
+#include "pageres.h"
+#include "tordmain.h"
+#include "scrollview.h"
+#include "svmnode.h"
+#include "statistc.h"
 #include "tesseractclass.h"
-#include "blread.h"
-
 #ifndef GRAPHICS_DISABLED
 #define ASC_HEIGHT (2 * kBlnBaselineOffset + kBlnXHeight)
 #define X_HEIGHT (kBlnBaselineOffset + kBlnXHeight)
@@ -62,6 +59,7 @@ enum CMD_EVENTS
   SHOW_POINT_CMD_EVENT,
   SHOW_BLN_WERD_CMD_EVENT,
   DEBUG_WERD_CMD_EVENT,
+  BLAMER_CMD_EVENT,
   BOUNDING_BOX_CMD_EVENT,
   CORRECT_TEXT_CMD_EVENT,
   POLYGONAL_CMD_EVENT,
@@ -116,6 +114,8 @@ ScrollView* bln_word_window = NULL;  // baseline norm words
 CMD_EVENTS mode = CHANGE_DISP_CMD_EVENT;  // selected words op
+bool recog_done = false;  // recog_all_words was called
+
 // These variables should remain global, since they are only used for the
 // debug mode (in which only a single Tesseract thread/instance will exist).
 BITS16 word_display_mode;
@@ -195,8 +195,8 @@ void build_image_window(int width, int height) {
                               editor_image_xpos, editor_image_ypos,
                               width + 1, height + editor_image_menuheight + 1,
-                              width + 1,
-                              height + 1,
+                              width,
+                              height,
                               true);
 }
@@ -269,6 +269,7 @@ SVMenuNode *Tesseract::build_menu_new() {
   parent_menu = root_menu_item->AddChild("DISPLAY");
+  parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, FALSE);
   parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, FALSE);
   parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, FALSE);
   parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, FALSE);
@@ -333,11 +334,11 @@ void Tesseract::do_re_display(
  */
 void Tesseract::pgeditor_main(int width, int height, PAGE_RES *page_res) {
-
   current_page_res = page_res;
   if (current_page_res->block_res_list.empty())
     return;
+  recog_done = false;
   stillRunning = true;
   build_image_window(width, height);
@@ -400,6 +401,28 @@ BOOL8 Tesseract::process_cmd_win_event(  // UI command semantics
   BOOL8 exit = FALSE;
   color_mode = CM_RAINBOW;
+
+  // Run recognition on the full page if needed.
+ switch (cmd_event) { + case BLAMER_CMD_EVENT: + case SHOW_SUBSCRIPT_CMD_EVENT: + case SHOW_SUPERSCRIPT_CMD_EVENT: + case SHOW_ITALIC_CMD_EVENT: + case SHOW_BOLD_CMD_EVENT: + case SHOW_UNDERLINE_CMD_EVENT: + case SHOW_FIXEDPITCH_CMD_EVENT: + case SHOW_SERIF_CMD_EVENT: + case SHOW_SMALLCAPS_CMD_EVENT: + case SHOW_DROPCAPS_CMD_EVENT: + if (!recog_done) { + recog_all_words(current_page_res, NULL, NULL, NULL, 0); + recog_done = true; + } + break; + default: + break; + } + switch (cmd_event) { case NULL_CMD_EVENT: break; @@ -423,6 +446,14 @@ BOOL8 Tesseract::process_cmd_win_event( // UI command semantics word_display_mode.turn_off_bit(DF_BOX); mode = CHANGE_DISP_CMD_EVENT; break; + case BLAMER_CMD_EVENT: + if (new_value[0] == 'T') + word_display_mode.turn_on_bit(DF_BLAMER); + else + word_display_mode.turn_off_bit(DF_BLAMER); + do_re_display(&tesseract::Tesseract::word_display); + mode = CHANGE_DISP_CMD_EVENT; + break; case CORRECT_TEXT_CMD_EVENT: if (new_value[0] == 'T') word_display_mode.turn_on_bit(DF_TEXT); @@ -691,7 +722,9 @@ BOOL8 Tesseract:: word_blank_and_set_display(BLOCK* block, ROW* row, BOOL8 Tesseract::word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res) { TWERD *bln_word = word_res->chopped_word; if (bln_word == NULL) { - word_res->SetupForRecognition(unicharset, false, row, block); + word_res->SetupForTessRecognition(unicharset, this, BestPix(), false, + this->textord_use_cjk_fp_model, + row, block); bln_word = word_res->chopped_word; } bln_word_window_handle()->Clear(); @@ -720,10 +753,8 @@ BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) { if (color_mode != CM_RAINBOW && word_res->box_word != NULL) { BoxWord* box_word = word_res->box_word; int length = box_word->length(); - int font_id = word_res->fontinfo_id; - if (font_id < 0) font_id = 0; - const UnicityTable &font_table = get_fontinfo_table(); - FontInfo font_info = font_table.get(font_id); + if (word_res->fontinfo == NULL) return false; + const FontInfo& font_info = *word_res->fontinfo; for (int i = 0; i < length; ++i) { ScrollView::Color color = ScrollView::GREEN; switch (color_mode) { @@ -806,25 +837,56 @@ BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) { displayed_something = TRUE; } - // display correct text + // Display correct text and blamer information. + STRING text; + STRING blame; if (word->display_flag(DF_TEXT) && word->text() != NULL) { + text = word->text(); + } + if (word->display_flag(DF_BLAMER) && + !(word_res->blamer_bundle != NULL && + word_res->blamer_bundle->incorrect_result_reason == IRR_CORRECT)) { + text = ""; + const BlamerBundle *blamer_bundle = word_res->blamer_bundle; + if (blamer_bundle == NULL) { + text += "NULL"; + } else { + for (int i = 0; i < blamer_bundle->truth_text.length(); ++i) { + text += blamer_bundle->truth_text[i]; + } + } + text += " -> "; + STRING best_choice_str; + if (word_res->best_choice == NULL) { + best_choice_str = "NULL"; + } else { + word_res->best_choice->string_and_lengths(&best_choice_str, NULL); + } + text += best_choice_str; + IncorrectResultReason reason = (blamer_bundle == NULL) ? 
+ IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason; + ASSERT_HOST(reason < IRR_NUM_REASONS) + blame += " ["; + blame += BlamerBundle::IncorrectReasonName(reason); + blame += "]"; + } + if (text.length() > 0) { word_bb = word->bounding_box(); - ScrollView::Color c =(ScrollView::Color) - ((inT32) editor_image_blob_bb_color); - image_win->Pen(c); + image_win->Pen(ScrollView::RED); word_height = word_bb.height(); - image_win->TextAttributes("Times", 0.75 * word_height, - false, false, false); - if (word_height < word_bb.width()) - shift = 0.25 * word_height; - else - shift = 0.0f; - + int text_height = 0.50 * word_height; + if (text_height > 20) text_height = 20; + image_win->TextAttributes("Arial", text_height, false, false, false); + shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f; image_win->Text(word_bb.left() + shift, - word_bb.bottom() + 0.25 * word_height, word->text()); + word_bb.bottom() + 0.25 * word_height, text.string()); + if (blame.length() > 0) { + image_win->Text(word_bb.left() + shift, + word_bb.bottom() + 0.25 * word_height - text_height, + blame.string()); + } - if (strlen(word->text()) > 0) - displayed_something = TRUE; + displayed_something = TRUE; } if (!displayed_something) // display BBox anyway @@ -849,6 +911,11 @@ BOOL8 Tesseract::word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res) { row->print(NULL); tprintf("\nWord data...\n"); word_res->word->print(); + if (word_res->blamer_bundle != NULL && wordrec_debug_blamer && + word_res->blamer_bundle->incorrect_result_reason != IRR_CORRECT) { + tprintf("Current blamer debug: %s\n", + word_res->blamer_bundle->debug.string()); + } return TRUE; } @@ -866,6 +933,7 @@ BOOL8 Tesseract::word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res) { word->set_display_flag(DF_EDGE_STEP, word_display_mode.bit(DF_EDGE_STEP)); word->set_display_flag(DF_BN_POLYGONAL, word_display_mode.bit(DF_BN_POLYGONAL)); + word->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER)); return word_display(block, row, word_res); } } // namespace tesseract diff --git a/ccmain/recogtraining.cpp b/ccmain/recogtraining.cpp index 8c4b7e1ce3..5047ff6629 100644 --- a/ccmain/recogtraining.cpp +++ b/ccmain/recogtraining.cpp @@ -38,7 +38,7 @@ FILE *Tesseract::init_recog_training(const STRING &fname) { if (tessedit_ambigs_training) { tessedit_tess_adaption_mode.set_value(0); // turn off adaption tessedit_enable_doc_dict.set_value(0); // turn off document dictionary - save_best_choices.set_value(1); // save individual char choices + save_blob_choices.set_value(1); // save individual char choices getDict().save_raw_choices.set_value(1); // save raw choices getDict().permute_only_top.set_value(true); // use only top choice permuter tessedit_ok_mode.set_value(0); // turn off context checking @@ -56,22 +56,24 @@ FILE *Tesseract::init_recog_training(const STRING &fname) { // Copies the bounding box from page_res_it->word() to the given TBOX. bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) { + while (page_res_it->block() != NULL) { + if (page_res_it->word() != NULL) + break; + page_res_it->forward(); + } + if (page_res_it->word() != NULL) { *tbox = page_res_it->word()->word->bounding_box(); page_res_it->forward(); - return true; - } else { - return false; - } -} -// Reads the next box from the given box file into TBOX. 
-bool read_b(int applybox_page, int *line_number, FILE *box_file, - char *label, TBOX *bbox) { - int x_min, y_min, x_max, y_max; - if (read_next_box(applybox_page, line_number, box_file, label, - &x_min, &y_min, &x_max, &y_max)) { - bbox->set_to_given_coords(x_min, y_min, x_max, y_max); + // If tbox->left() is negative, the training image has vertical text and + // all the coordinates of bounding boxes of page_res are rotated by 90 + // degrees in a counterclockwise direction. We need to rotate the TBOX back + // in order to compare with the TBOXes of box files. + if (tbox->left() < 0) { + tbox->rotate(FCOORD(0.0, -1.0)); + } + return true; } else { return false; @@ -97,27 +99,29 @@ void Tesseract::recog_training_segmented(const STRING &fname, PAGE_RES_IT page_res_it; page_res_it.page_res = page_res; page_res_it.restart_page(); - char label[kBoxReadBufSize]; + STRING label; // Process all the words on this page. TBOX tbox; // tesseract-identified box TBOX bbox; // box from the box file bool keep_going; int line_number = 0; + int examined_words = 0; do { keep_going = read_t(&page_res_it, &tbox); - keep_going &= read_b(applybox_page, &line_number, box_file, label, &bbox); + keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label, + &bbox); // Align bottom left points of the TBOXes. while (keep_going && !NearlyEqual(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) { keep_going = (bbox.bottom() < tbox.bottom()) ? read_t(&page_res_it, &tbox) : - read_b(applybox_page, &line_number, box_file, label, &bbox); + ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox); } while (keep_going && !NearlyEqual(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) { keep_going = (bbox.left() > tbox.left()) ? read_t(&page_res_it, &tbox) : - read_b(applybox_page, &line_number, box_file, label, &bbox); + ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox); } // OCR the word if top right points of the TBOXes are similar. if (keep_going && @@ -126,9 +130,30 @@ void Tesseract::recog_training_segmented(const STRING &fname, ambigs_classify_and_output(page_res_it.prev_word(), page_res_it.prev_row(), page_res_it.prev_block(), - label, output_file); + label.string(), output_file); + examined_words++; } } while (keep_going); + + // Set up scripts on all of the words that did not get sent to + // ambigs_classify_and_output. They all should have, but if all the + // werd_res's don't get uch_sets, tesseract will crash when you try + // to iterate over them. :-( + int total_words = 0; + for (page_res_it.restart_page(); page_res_it.block() != NULL; + page_res_it.forward()) { + if (page_res_it.word()) { + if (page_res_it.word()->uch_set == NULL) + page_res_it.word()->SetupFake(unicharset); + total_words++; + } + } + if (examined_words < 0.85 * total_words) { + tprintf("TODO(antonova): clean up recog_training_segmented; " + " It examined only a small fraction of the ambigs image.\n"); + } + tprintf("recog_training_segmented: examined %d / %d words.\n", + examined_words, total_words); } // Runs classify_word_pass1() on the current word. Outputs Tesseract's @@ -142,7 +167,8 @@ void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res, FILE *output_file) { int offset; // Classify word. 
- classify_word_pass1(werd_res, row_res->row, block_res->block); + fflush(stdout); + classify_word_pass1(block_res->block, row_res->row, werd_res); WERD_CHOICE *best_choice = werd_res->best_choice; ASSERT_HOST(best_choice != NULL); ASSERT_HOST(best_choice->blob_choices() != NULL); @@ -151,7 +177,7 @@ void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res, int label_num_unichars = 0; int step = 1; // should be non-zero on the first iteration for (offset = 0; label[offset] != '\0' && step > 0; - step = getDict().getUnicharset().step(label + offset), + step = werd_res->uch_set->step(label + offset), offset += step, ++label_num_unichars); if (step == 0) { tprintf("Not outputting illegal unichar %s\n", label); diff --git a/ccmain/reject.cpp b/ccmain/reject.cpp index f3052008f7..94497bfa8a 100644 --- a/ccmain/reject.cpp +++ b/ccmain/reject.cpp @@ -202,9 +202,9 @@ void Tesseract::make_reject_map( //make rej map for wd //detailed results int offset; flip_0O(word); - check_debug_pt (word, -1); //For trap only - set_done(word, pass); //Set acceptance - word->reject_map.initialise (word->best_choice->unichar_lengths().length ()); + check_debug_pt(word, -1); // For trap only + set_done(word, pass); // Set acceptance + word->reject_map.initialise(word->best_choice->unichar_lengths().length()); reject_blanks(word); /* 0: Rays original heuristic - the baseline @@ -212,16 +212,15 @@ void Tesseract::make_reject_map( //make rej map for wd //detailed results if (tessedit_reject_mode == 0) { if (!word->done) reject_poor_matches(word, blob_choices); - } - /* - 5: Reject I/1/l from words where there is no strong contextual confirmation; - the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls); - and the whole of any words which are very small - */ - else if (tessedit_reject_mode == 5) { - if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) - word->reject_map.rej_word_small_xht (); - else { + } else if (tessedit_reject_mode == 5) { + /* + 5: Reject I/1/l from words where there is no strong contextual confirmation; + the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls); + and the whole of any words which are very small + */ + if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) { + word->reject_map.rej_word_small_xht(); + } else { one_ell_conflict(word, TRUE); /* Originally the code here just used the done flag. 
Now I have duplicated @@ -236,42 +235,38 @@ void Tesseract::make_reject_map( //make rej map for wd //detailed results (strchr (word->best_choice->unichar_string().string (), ' ') != NULL)) word->reject_map.rej_word_contains_blanks (); + WERD_CHOICE* best_choice = word->best_choice; if (rej_use_good_perm) { - if (((word->best_choice->permuter () == SYSTEM_DAWG_PERM) || - (word->best_choice->permuter () == FREQ_DAWG_PERM) || - (word->best_choice->permuter () == USER_DAWG_PERM)) && - (!rej_use_sensible_wd || - (acceptable_word_string - (word->best_choice->unichar_string().string (), - word->best_choice->unichar_lengths().string ()) != - AC_UNACCEPTABLE))) { - //PASSED TEST - } - else if (word->best_choice->permuter () == NUMBER_PERM) { + if ((best_choice->permuter() == SYSTEM_DAWG_PERM || + best_choice->permuter() == FREQ_DAWG_PERM || + best_choice->permuter() == USER_DAWG_PERM) && + (!rej_use_sensible_wd || + acceptable_word_string(*word->uch_set, + best_choice->unichar_string().string(), + best_choice->unichar_lengths().string()) != + AC_UNACCEPTABLE)) { + // PASSED TEST + } else if (best_choice->permuter() == NUMBER_PERM) { if (rej_alphas_in_number_perm) { for (i = 0, offset = 0; - word->best_choice->unichar_string()[offset] != '\0'; - offset += word->best_choice->unichar_lengths()[i++]) { - if (word->reject_map[i].accepted () && - unicharset.get_isalpha( - word->best_choice->unichar_string().string() + offset, - word->best_choice->unichar_lengths()[i])) - word->reject_map[i].setrej_bad_permuter (); - //rej alpha + best_choice->unichar_string()[offset] != '\0'; + offset += best_choice->unichar_lengths()[i++]) { + if (word->reject_map[i].accepted() && + word->uch_set->get_isalpha( + best_choice->unichar_string().string() + offset, + best_choice->unichar_lengths()[i])) + word->reject_map[i].setrej_bad_permuter(); + // rej alpha } } - } - else { - word->reject_map.rej_word_bad_permuter (); + } else { + word->reject_map.rej_word_bad_permuter(); } } - /* Ambig word rejection was here once !!*/ - } - } - else { - tprintf ("BAD tessedit_reject_mode\n"); + } else { + tprintf("BAD tessedit_reject_mode\n"); err_exit(); } @@ -280,14 +275,14 @@ void Tesseract::make_reject_map( //make rej map for wd //detailed results check_debug_pt (word, 10); if (tessedit_rejection_debug) { - tprintf ("Permuter Type = %d\n", word->best_choice->permuter ()); - tprintf ("Certainty: %f Rating: %f\n", + tprintf("Permuter Type = %d\n", word->best_choice->permuter ()); + tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty (), word->best_choice->rating ()); tprintf("Dict word: %d\n", dict_word(*(word->best_choice))); } flip_hyphens(word); - check_debug_pt (word, 20); + check_debug_pt(word, 20); } } // namespace tesseract @@ -492,8 +487,8 @@ BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { for (i = 0, offset = 0, non_conflict_set_char = FALSE; (i < word_len) && !non_conflict_set_char; offset += lengths[i++]) non_conflict_set_char = - (unicharset.get_isalpha(word + offset, lengths[i]) || - unicharset.get_isdigit(word + offset, lengths[i])) && + (word_res->uch_set->get_isalpha(word + offset, lengths[i]) || + word_res->uch_set->get_isdigit(word + offset, lengths[i])) && !STRING (conflict_set_I_l_1).contains (word[offset]); if (!non_conflict_set_char) { if (update_map) @@ -524,7 +519,7 @@ BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') { 
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; - if (safe_dict_word(*(word_res->best_choice)) > 0) { + if (safe_dict_word(word_res) > 0) { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; if (update_map) word_res->reject_map[first_alphanum_index_]. @@ -540,7 +535,7 @@ BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; - if (safe_dict_word(*(word_res->best_choice)) > 0) { + if (safe_dict_word(word_res) > 0) { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; if (update_map) word_res->reject_map[first_alphanum_index_]. @@ -571,7 +566,7 @@ BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; - if (safe_dict_word(*(word_res->best_choice)) > 0) + if (safe_dict_word(word_res) > 0) return FALSE; else word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; @@ -579,7 +574,7 @@ BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; - if (safe_dict_word(*(word_res->best_choice)) > 0) + if (safe_dict_word(word_res) > 0) return FALSE; else word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; @@ -611,7 +606,7 @@ BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { For anything else. See if it conforms to an acceptable word type. If so, treat accordingly. */ - word_type = acceptable_word_string (word, lengths); + word_type = acceptable_word_string(*word_res->uch_set, word, lengths); if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) { first_alphanum_index_ = first_alphanum_index (word, lengths); first_alphanum_offset_ = first_alphanum_offset (word, lengths); @@ -721,8 +716,8 @@ void Tesseract::dont_allow_1Il(WERD_RES *word) { if (STRING(conflict_set_I_l_1).contains(s[offset])) { accepted_1Il = TRUE; } else { - if (unicharset.get_isalpha(s + offset, lengths[i]) || - unicharset.get_isdigit(s + offset, lengths[i])) + if (word->uch_set->get_isalpha(s + offset, lengths[i]) || + word->uch_set->get_isdigit(s + offset, lengths[i])) return; // >=1 non 1Il ch accepted } } @@ -744,8 +739,8 @@ inT16 Tesseract::count_alphanums(WERD_RES *word_res) { const WERD_CHOICE *best_choice = word_res->best_choice; for (int i = 0; i < word_res->reject_map.length(); ++i) { if ((word_res->reject_map[i].accepted()) && - (unicharset.get_isalpha(best_choice->unichar_id(i)) || - unicharset.get_isdigit(best_choice->unichar_id(i)))) { + (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) || + word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) { count++; } } @@ -788,8 +783,9 @@ BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) { return FALSE; } -inT16 Tesseract::safe_dict_word(const WERD_CHOICE &word) { - int dict_word_type = dict_word(word); +inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) { + const WERD_CHOICE &word = *werd_res->best_choice; + int dict_word_type = werd_res->tesseract->dict_word(word); return dict_word_type == DOC_DAWG_PERM ? 
0 : dict_word_type; } @@ -809,7 +805,7 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) { return; TBLOB* blob = word_res->rebuild_word->blobs; - UNICHAR_ID unichar_dash = unicharset.unichar_to_id("-"); + UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-"); bool modified = false; for (i = 0; i < best_choice->length() && blob != NULL; ++i, blob = blob->next) { @@ -822,10 +818,10 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) { if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) && (out_box.right() < next_left)) { aspect_ratio = out_box.width() / (float) out_box.height(); - if (unicharset.eq(best_choice->unichar_id(i), ".")) { + if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) { if (aspect_ratio >= tessedit_upper_flip_hyphen && - unicharset.contains_unichar_id(unichar_dash) && - unicharset.get_enabled(unichar_dash)) { + word_res->uch_set->contains_unichar_id(unichar_dash) && + word_res->uch_set->get_enabled(unichar_dash)) { /* Certain HYPHEN */ best_choice->set_unichar_id(unichar_dash, i); modified = true; @@ -852,7 +848,7 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) { prev_right = out_box.right(); } if (modified) { - best_choice->populate_unichars(unicharset); + best_choice->populate_unichars(); } } @@ -871,18 +867,20 @@ void Tesseract::flip_0O(WERD_RES *word_res) { TBLOB* blob = word_res->rebuild_word->blobs; for (i = 0; i < best_choice->length() && blob != NULL; ++i, blob = blob->next) { - if (unicharset.get_isupper(best_choice->unichar_id(i)) || - unicharset.get_isdigit(best_choice->unichar_id(i))) { + if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) || + word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) { out_box = blob->bounding_box(); if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) || (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) return; //Beware words with sub/superscripts } } - UNICHAR_ID unichar_0 = unicharset.unichar_to_id("0"); - UNICHAR_ID unichar_O = unicharset.unichar_to_id("O"); - if (unichar_0 == INVALID_UNICHAR_ID || !unicharset.get_enabled(unichar_0) || - unichar_O == INVALID_UNICHAR_ID || !unicharset.get_enabled(unichar_O)) { + UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0"); + UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O"); + if (unichar_0 == INVALID_UNICHAR_ID || + !word_res->uch_set->get_enabled(unichar_0) || + unichar_O == INVALID_UNICHAR_ID || + !word_res->uch_set->get_enabled(unichar_O)) { return; // 0 or O are not present/enabled in unicharset } bool modified = false; @@ -891,43 +889,43 @@ void Tesseract::flip_0O(WERD_RES *word_res) { best_choice->unichar_id(i) == unichar_O) { /* A0A */ if ((i+1) < best_choice->length() && - non_O_upper(best_choice->unichar_id(i-1)) && - non_O_upper(best_choice->unichar_id(i+1))) { + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) && + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) { best_choice->set_unichar_id(unichar_O, i); modified = true; } /* A00A */ - if (non_O_upper(best_choice->unichar_id(i-1)) && + if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) && (i+1) < best_choice->length() && (best_choice->unichar_id(i+1) == unichar_0 || best_choice->unichar_id(i+1) == unichar_O) && (i+2) < best_choice->length() && - non_O_upper(best_choice->unichar_id(i+2))) { + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) { best_choice->set_unichar_id(unichar_O, i); modified = true; i++; } /* AA0 */ if ((i > 1) && - 
non_O_upper(best_choice->unichar_id(i-2)) && - non_O_upper(best_choice->unichar_id(i-1)) && + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) && + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) && (((i+1) < best_choice->length() && - !unicharset.get_isdigit(best_choice->unichar_id(i+1)) && - !unicharset.eq(best_choice->unichar_id(i+1), "l") && - !unicharset.eq(best_choice->unichar_id(i+1), "I")) || + !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) && + !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") && + !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) || (i == best_choice->length() - 1))) { best_choice->set_unichar_id(unichar_O, i); modified = true; } /* 9O9 */ - if (non_0_digit(best_choice->unichar_id(i-1)) && + if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && (i+1) < best_choice->length() && - non_0_digit(best_choice->unichar_id(i+1))) { + non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) { best_choice->set_unichar_id(unichar_0, i); modified = true; } /* 9OOO */ - if (non_0_digit(best_choice->unichar_id(i-1)) && + if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && (i+2) < best_choice->length() && (best_choice->unichar_id(i+1) == unichar_0 || best_choice->unichar_id(i+1) == unichar_O) && @@ -940,27 +938,27 @@ void Tesseract::flip_0O(WERD_RES *word_res) { i += 2; } /* 9OO */ - if (non_0_digit(best_choice->unichar_id(i-1)) && + if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && (i+2) < best_choice->length() && (best_choice->unichar_id(i+1) == unichar_0 || best_choice->unichar_id(i+1) == unichar_O) && - !unicharset.get_isupper(best_choice->unichar_id(i+2))) { + !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) { best_choice->set_unichar_id(unichar_0, i); best_choice->set_unichar_id(unichar_0, i+1); modified = true; i++; } /* 9O */ - if (non_0_digit(best_choice->unichar_id(i-1)) && + if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && (i+1) < best_choice->length() && - !unicharset.get_isupper(best_choice->unichar_id(i+1))) { + !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) { best_choice->set_unichar_id(unichar_0, i); } /* 9[.,]OOO.. 
 */
       if ((i > 1) &&
-          (unicharset.eq(best_choice->unichar_id(i-1), ".") ||
-           unicharset.eq(best_choice->unichar_id(i-1), ",")) &&
-          (unicharset.get_isdigit(best_choice->unichar_id(i-2)) ||
+          (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
+           word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
+          (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
            best_choice->unichar_id(i-2) == unichar_O)) {
         if (best_choice->unichar_id(i-2) == unichar_O) {
           best_choice->set_unichar_id(unichar_0, i-2);
@@ -978,17 +976,15 @@ void Tesseract::flip_0O(WERD_RES *word_res) {
     }
   }
   if (modified) {
-    best_choice->populate_unichars(unicharset);
+    best_choice->populate_unichars();
   }
 }
 
-BOOL8 Tesseract::non_O_upper(UNICHAR_ID unichar_id) {
-  return (unicharset.get_isupper(unichar_id) &&
-          (!unicharset.eq(unichar_id, "O")));
+BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
+  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
 }
 
-BOOL8 Tesseract::non_0_digit(UNICHAR_ID unichar_id) {
-  return (unicharset.get_isdigit(unichar_id) &&
-          (!unicharset.eq(unichar_id, "0")));
+BOOL8 Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
+  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
 }
 
 }  // namespace tesseract
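The 0/O flips above are purely local context rules driven by the word's own unicharset. A distilled, ASCII-only version of two of them (0 between non-O uppercase letters becomes O; O between nonzero digits becomes 0) shows the shape of the logic:

// Illustrative only; the real code works on UNICHAR_IDs, not chars.
#include <cctype>
#include <cstdio>
#include <cstring>

// Simplified analogues of non_O_upper() / non_0_digit().
static bool NonOUpper(char c) {
  return std::isupper(static_cast<unsigned char>(c)) && c != 'O';
}
static bool Non0Digit(char c) {
  return std::isdigit(static_cast<unsigned char>(c)) && c != '0';
}

static void FlipZeroO(char *s) {
  for (size_t i = 1; i + 1 < std::strlen(s); i++) {
    if (s[i] == '0' || s[i] == 'O') {
      if (NonOUpper(s[i - 1]) && NonOUpper(s[i + 1])) s[i] = 'O';  // A0A
      if (Non0Digit(s[i - 1]) && Non0Digit(s[i + 1])) s[i] = '0';  // 9O9
    }
  }
}

int main() {
  char a[] = "R0AD";  // 0 between uppercase letters -> ROAD
  char b[] = "1O24";  // O between digits -> 1024
  FlipZeroO(a);
  FlipZeroO(b);
  std::printf("%s %s\n", a, b);  // prints "ROAD 1024"
  return 0;
}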
for (int i = 0; i < configs_size; ++i) { - read_config_file(configs[i], set_only_init_params); + read_config_file(configs[i], set_params_constraint); } // Set params specified in vars_vec (done after setting params from config @@ -140,7 +143,7 @@ bool Tesseract::init_tesseract_lang_data( for (int i = 0; i < vars_vec->size(); ++i) { if (!ParamUtils::SetParam((*vars_vec)[i].string(), (*vars_values)[i].string(), - set_only_init_params, this->params())) { + set_params_constraint, this->params())) { tprintf("Error setting param %s\n", (*vars_vec)[i].string()); exit(1); } @@ -169,6 +172,15 @@ bool Tesseract::init_tesseract_lang_data( static_cast<int>(tessedit_ocr_engine_mode)); } + // If we are only loading the config file (and so not planning on doing any + // recognition) then there's nothing else to do here. + if (tessedit_init_config_only) { + if (tessdata_manager_debug_level) { + tprintf("Returning after loading config file\n"); + } + return true; + } + // Load the unicharset if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) || !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) { @@ -178,8 +190,8 @@ bool Tesseract::init_tesseract_lang_data( tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n"); return false; } - right_to_left_ = unicharset.any_right_to_left(); if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n"); + right_to_left_ = unicharset.major_right_to_left(); if (!tessedit_ambigs_training && tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) { @@ -204,17 +216,151 @@ bool Tesseract::init_tesseract_lang_data( return true; } +// Helper returns true if the given string is in the vector of strings. +static bool IsStrInList(const STRING& str, + const GenericVector<STRING>& str_list) { + for (int i = 0; i < str_list.size(); ++i) { + if (str_list[i] == str) + return true; + } + return false; +} + +// Parse a string of the form [~]<lang>[+[~]<lang>]*. +// Langs with no prefix get appended to to_load, provided they +// are not in there already. +// Langs with ~ prefix get appended to not_to_load, provided they are not in +// there already. +void Tesseract::ParseLanguageString(const char* lang_str, + GenericVector<STRING>* to_load, + GenericVector<STRING>* not_to_load) { + STRING remains(lang_str); + while (remains.length() > 0) { + // Find the start of the lang code and which vector to add to. + const char* start = remains.string(); + while (*start == '+') + ++start; + GenericVector<STRING>* target = to_load; + if (*start == '~') { + target = not_to_load; + ++start; + } + // Find the index of the end of the lang code in string start. + int end = strlen(start); + const char* plus = strchr(start, '+'); + if (plus != NULL && plus - start < end) + end = plus - start; + STRING lang_code(start); + lang_code.truncate_at(end); + STRING next(start + end); + remains = next; + // Check whether lang_code is already in the target vector and add. + if (!IsStrInList(lang_code, *target)) { + if (tessdata_manager_debug_level) + tprintf("Adding language '%s' to list\n", lang_code.string()); + target->push_back(lang_code); + } + } +} + +// Initialize for potentially a set of languages defined by the language +// string and recursively any additional languages required by any language +// traineddata file (via tessedit_load_sublangs in its config) that is loaded. +// See init_tesseract_internal for args.
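// Illustrative sketch (not part of the patch) of the language-string
// semantics handled below, using the parser above:
//   GenericVector<STRING> to_load, not_to_load;
//   ParseLanguageString("eng+~fra+deu", &to_load, &not_to_load);
//   // to_load == {"eng", "deu"}, not_to_load == {"fra"}: "eng" becomes the
//   // primary language, "deu" a sub-language, and "fra" is never loaded,
//   // even if a traineddata config requests it via tessedit_load_sublangs.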
int Tesseract::init_tesseract( const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector<STRING> *vars_vec, const GenericVector<STRING> *vars_values, - bool set_only_init_params) { + bool set_only_non_debug_params) { + GenericVector<STRING> langs_to_load; + GenericVector<STRING> langs_not_to_load; + ParseLanguageString(language, &langs_to_load, &langs_not_to_load); + + sub_langs_.delete_data_pointers(); + sub_langs_.clear(); + // Find the first loadable lang and load into this. + // Add any languages that this language requires. + bool loaded_primary = false; + // Load the rest into sub_langs_. + for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) { + if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) { + const char *lang_str = langs_to_load[lang_index].string(); + Tesseract *tess_to_init; + if (!loaded_primary) { + tess_to_init = this; + } else { + tess_to_init = new Tesseract; + } + + int result = tess_to_init->init_tesseract_internal( + arg0, textbase, lang_str, oem, configs, configs_size, + vars_vec, vars_values, set_only_non_debug_params); + + if (!loaded_primary) { + if (result < 0) { + tprintf("Failed loading language '%s'\n", lang_str); + } else { + if (tessdata_manager_debug_level) + tprintf("Loaded language '%s' as main language\n", lang_str); + ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), + &langs_to_load, &langs_not_to_load); + loaded_primary = true; + } + } else { + if (result < 0) { + tprintf("Failed loading language '%s'\n", lang_str); + delete tess_to_init; + } else { + if (tessdata_manager_debug_level) + tprintf("Loaded language '%s' as secondary language\n", lang_str); + sub_langs_.push_back(tess_to_init); + // Add any languages that this language requires. + ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), + &langs_to_load, &langs_not_to_load); + } + } + } + } + if (!loaded_primary) { + tprintf("Tesseract couldn't load any languages!\n"); + return -1; // Couldn't load any language! + } + SetupUniversalFontIds(); + return 0; +} + +// Common initialization for a single language. +// arg0 is the datapath for the tessdata directory, which could be the +// path of the tessdata directory with no trailing /, or (if tessdata +// lives in the same directory as the executable) the path of the executable, +// hence the name arg0. +// textbase is an optional output file basename (used only for training) +// language is the language code to load. +// oem controls which engine(s) will operate on the image +// configs (argv) is an array of config filenames to load variables from. +// May be NULL. +// configs_size (argc) is the number of elements in configs. +// vars_vec is an optional vector of variables to set. +// vars_values is an optional corresponding vector of values for the variables +// in vars_vec. +// If set_only_non_debug_params is true, only non-debug parameters will be +// set.
+int Tesseract::init_tesseract_internal( + const char *arg0, const char *textbase, const char *language, + OcrEngineMode oem, char **configs, int configs_size, + const GenericVector<STRING> *vars_vec, + const GenericVector<STRING> *vars_values, + bool set_only_non_debug_params) { if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs, configs_size, vars_vec, vars_values, - set_only_init_params)) { + set_only_non_debug_params)) { return -1; } + if (tessedit_init_config_only) { + tessdata_manager.End(); + return 0; + } // If only Cube will be used, skip loading Tesseract classifier's // pre-trained templates. bool init_tesseract_classifier = @@ -230,6 +376,46 @@ int Tesseract::init_tesseract( return 0; //Normal exit } +// Helper builds the all_fonts table by adding new fonts from new_fonts. +static void CollectFonts(const UnicityTable<FontInfo>& new_fonts, + UnicityTable<FontInfo>* all_fonts) { + for (int i = 0; i < new_fonts.size(); ++i) { + // UnicityTable uniques as we go. + all_fonts->push_back(new_fonts.get(i)); + } +} + +// Helper assigns an id to lang_fonts using the index in all_fonts table. +static void AssignIds(const UnicityTable<FontInfo>& all_fonts, + UnicityTable<FontInfo>* lang_fonts) { + for (int i = 0; i < lang_fonts->size(); ++i) { + int index = all_fonts.get_id(lang_fonts->get(i)); + lang_fonts->get_mutable(i)->universal_id = index; + } +} + +// Set the universal_id member of each font to be unique among all +// instances of the same font loaded. +void Tesseract::SetupUniversalFontIds() { + // Note that we can get away with bitwise copying FontInfo in + // all_fonts, as it is a temporary structure and we avoid setting the + // delete callback. + UnicityTable<FontInfo> all_fonts; + all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo)); + + // Create the universal ID table. + CollectFonts(get_fontinfo_table(), &all_fonts); + for (int i = 0; i < sub_langs_.size(); ++i) { + CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts); + } + // Assign ids from the table to each font table.
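// (Worked example, illustrative only: after CollectFonts, all_fonts holds
// each distinct FontInfo exactly once, in first-seen order, because
// UnicityTable uniques on insert, and get_id() then recovers that index.
// E.g. if the main language contributed {Arial, Times} and a sub-language
// {Times, Courier}, the combined table is {Arial, Times, Courier}, and
// Times receives universal_id 1 in both per-language font tables.)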
+ AssignIds(all_fonts, &get_fontinfo_table()); + for (int i = 0; i < sub_langs_.size(); ++i) { + AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table()); + } + font_table_size_ = all_fonts.size(); +} + // init the LM component int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase, diff --git a/ccmain/tesseract_cube_combiner.cpp b/ccmain/tesseract_cube_combiner.cpp index d0524b6d61..7fd7c6b198 100644 --- a/ccmain/tesseract_cube_combiner.cpp +++ b/ccmain/tesseract_cube_combiner.cpp @@ -148,8 +148,7 @@ bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str, bool cube_best_bigram_cost_valid = true; if (cube_cntxt_->Bigrams()) cube_best_bigram_cost = cube_cntxt_->Bigrams()-> - Cost(cube_best_str32, cube_cntxt_->CharacterSet(), - &cube_cntxt_->TesseractObject()->unicharset); + Cost(cube_best_str32, cube_cntxt_->CharacterSet()); else cube_best_bigram_cost_valid = false; CubeUtils::UTF32ToUTF8(cube_best_str32, &cube_best_str); @@ -191,8 +190,7 @@ bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str, int tess_bigram_cost_valid = true; if (cube_cntxt_->Bigrams()) tess_bigram_cost = cube_cntxt_->Bigrams()-> - Cost(tess_str32.c_str(), cube_cntxt_->CharacterSet(), - &cube_cntxt_->TesseractObject()->unicharset); + Cost(tess_str32.c_str(), cube_cntxt_->CharacterSet()); else tess_bigram_cost_valid = false; diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp index 62e0169e08..8d1a7f0f4b 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -19,17 +19,19 @@ /////////////////////////////////////////////////////////////////////// #include "tesseractclass.h" + +#include "allheaders.h" #include "cube_reco_context.h" -#include "tesseract_cube_combiner.h" +#include "edgblob.h" +#include "equationdetect.h" #include "globals.h" +#include "tesseract_cube_combiner.h" // Include automatically generated configuration file if running autoconf. #ifdef HAVE_CONFIG_H #include "config_auto.h" #endif -#include "allheaders.h" - namespace tesseract { Tesseract::Tesseract() @@ -63,7 +65,7 @@ Tesseract::Tesseract() "Blacklist of chars not to recognize", this->params()), STRING_MEMBER(tessedit_char_whitelist, "", "Whitelist of chars to recognize", this->params()), - BOOL_INIT_MEMBER(tessedit_ambigs_training, false, + BOOL_MEMBER(tessedit_ambigs_training, false, "Perform training for ambiguities", this->params()), INT_MEMBER(pageseg_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT, @@ -80,6 +82,7 @@ Tesseract::Tesseract() " a character composed form fragments", this->params()), BOOL_MEMBER(tessedit_adaption_debug, false, "Generate and print debug" " information for adaption", this->params()), + INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()), INT_MEMBER(applybox_debug, 1, "Debug level", this->params()), INT_MEMBER(applybox_page, 0, "Page number to apply boxes from", this->params()), @@ -94,7 +97,7 @@ Tesseract::Tesseract() BOOL_MEMBER(applybox_learn_ngrams_mode, false, "Each bounding box" " is assumed to contain ngrams. 
Only learn the ngrams" " whose outlines overlap horizontally.", this->params()), - BOOL_MEMBER(tessedit_draw_outwords, false, + BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words", this->params()), BOOL_MEMBER(tessedit_training_tess, false, "Call Tess to learn blobs", this->params()), @@ -114,6 +117,12 @@ Tesseract::Tesseract() "Output font info per char", this->params()), BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", this->params()), + BOOL_MEMBER(tessedit_enable_bigram_correction, false, + "Enable correction based on the word bigram dictionary.", + this->params()), + INT_MEMBER(tessedit_bigram_debug, 0, + "Amount of debug output for bigram correction.", + this->params()), INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()), BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk", this->params()), @@ -145,13 +154,15 @@ Tesseract::Tesseract() "Log matcher activity", this->params()), INT_MEMBER(tessedit_test_adaption_mode, 3, "Adaptation decision algorithm for tess", this->params()), - BOOL_MEMBER(save_best_choices, false, + BOOL_MEMBER(save_blob_choices, false, "Save the results of the recognition step (blob_choices)" " within the corresponding WERD_CHOICE", this->params()), BOOL_MEMBER(test_pt, false, "Test for point", this->params()), double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()), double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()), - INT_MEMBER(cube_debug_level, 1, "Print cube debug info.", this->params()), + INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", + this->params()), + INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()), STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", this->params()), STRING_MEMBER(outlines_2, "ij!?%\":;", @@ -345,28 +356,49 @@ Tesseract::Tesseract() " , else specifc page to process", this->params()), BOOL_MEMBER(tessedit_write_images, false, "Capture the image from the IPE", this->params()), - BOOL_MEMBER(interactive_mode, false, "Run interactively?", this->params()), + BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", + this->params()), STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()), BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", this->params()), - INT_INIT_MEMBER(tessdata_manager_debug_level, 0, "Debug level for" - " TessdataManager functions.", this->params()), - double_MEMBER(min_orientation_margin, 12.0, + INT_MEMBER(tessdata_manager_debug_level, 0, "Debug level for" + " TessdataManager functions.", this->params()), + STRING_MEMBER(tessedit_load_sublangs, "", + "List of languages to load with this one", this->params()), + double_MEMBER(min_orientation_margin, 7.0, "Min acceptable orientation margin", this->params()), + BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", + this->params()), + BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model", + this->params()), + BOOL_INIT_MEMBER(tessedit_init_config_only, false, + "Only initialize with the config file. 
Useful if the " + "instance is not going to be used for OCR but say only " + "for layout analysis.", this->params()), + BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", + this->params()), backup_config_file_(NULL), pix_binary_(NULL), + cube_binary_(NULL), pix_grey_(NULL), - orig_image_changed_(false), + source_resolution_(0), textord_(this), right_to_left_(false), + scaled_color_(NULL), + scaled_factor_(-1), deskew_(1.0f, 0.0f), reskew_(1.0f, 0.0f), + most_recently_used_(this), + font_table_size_(0), cube_cntxt_(NULL), - tess_cube_combiner_(NULL) { + tess_cube_combiner_(NULL), + equ_detect_(NULL) { } Tesseract::~Tesseract() { Clear(); + end_tesseract(); + sub_langs_.delete_data_pointers(); // Delete cube objects. if (cube_cntxt_ != NULL) { delete cube_cntxt_; @@ -379,77 +411,124 @@ Tesseract::~Tesseract() { } void Tesseract::Clear() { - if (pix_binary_ != NULL) - pixDestroy(&pix_binary_); - if (pix_grey_ != NULL) - pixDestroy(&pix_grey_); + pixDestroy(&pix_binary_); + pixDestroy(&cube_binary_); + pixDestroy(&pix_grey_); + pixDestroy(&scaled_color_); deskew_ = FCOORD(1.0f, 0.0f); reskew_ = FCOORD(1.0f, 0.0f); - orig_image_changed_ = false; splitter_.Clear(); + scaled_factor_ = -1; + ResetFeaturesHaveBeenExtracted(); + for (int i = 0; i < sub_langs_.size(); ++i) + sub_langs_[i]->Clear(); +} + +void Tesseract::SetEquationDetect(EquationDetect* detector) { + equ_detect_ = detector; + equ_detect_->SetLangTesseract(this); +} + +// Clear all memory of adaption for this and all subclassifiers. +void Tesseract::ResetAdaptiveClassifier() { + ResetAdaptiveClassifierInternal(); + for (int i = 0; i < sub_langs_.size(); ++i) { + sub_langs_[i]->ResetAdaptiveClassifierInternal(); + } +} + +// Clear the document dictionary for this and all subclassifiers. +void Tesseract::ResetDocumentDictionary() { + getDict().ResetDocumentDictionary(); + for (int i = 0; i < sub_langs_.size(); ++i) { + sub_langs_[i]->getDict().ResetDocumentDictionary(); + } } void Tesseract::SetBlackAndWhitelist() { // Set the white and blacklists (if any) unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(), tessedit_char_whitelist.string()); + // Black and white lists should apply to all loaded classifiers. + for (int i = 0; i < sub_langs_.size(); ++i) { + sub_langs_[i]->unicharset.set_black_and_whitelist( + tessedit_char_blacklist.string(), tessedit_char_whitelist.string()); + } } // Perform steps to prepare underlying binary image/other data structures for // page segmentation. void Tesseract::PrepareForPageseg() { + textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model); + pixDestroy(&cube_binary_); + cube_binary_ = pixClone(pix_binary()); + // Find the max splitter strategy over all langs. + ShiroRekhaSplitter::SplitStrategy max_pageseg_strategy = + static_cast<ShiroRekhaSplitter::SplitStrategy>( + static_cast<inT32>(pageseg_devanagari_split_strategy)); + for (int i = 0; i < sub_langs_.size(); ++i) { + ShiroRekhaSplitter::SplitStrategy pageseg_strategy = + static_cast<ShiroRekhaSplitter::SplitStrategy>( + static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy)); + if (pageseg_strategy > max_pageseg_strategy) + max_pageseg_strategy = pageseg_strategy; + // Clone the cube image to all the sub langs too. + pixDestroy(&sub_langs_[i]->cube_binary_); + sub_langs_[i]->cube_binary_ = pixClone(pix_binary()); + pixDestroy(&sub_langs_[i]->pix_binary_); + sub_langs_[i]->pix_binary_ = pixClone(pix_binary()); + } // Perform shiro-rekha (top-line) splitting and replace the current image by // the newly splitted image.
splitter_.set_orig_pix(pix_binary()); - splitter_.set_pageseg_split_strategy( - (ShiroRekhaSplitter::SplitStrategy) - ((inT32)pageseg_devanagari_split_strategy)); + splitter_.set_pageseg_split_strategy(max_pageseg_strategy); if (splitter_.Split(true)) { ASSERT_HOST(splitter_.splitted_image()); - splitter_.CopySplittedImageTo(NULL, &pix_binary_); - orig_image_changed_ = true; + pixDestroy(&pix_binary_); + pix_binary_ = pixClone(splitter_.splitted_image()); } } // Perform steps to prepare underlying binary image/other data structures for // OCR. The current segmentation is required by this method. +// Note that this method resets pix_binary_ to the original binarized image, +// which may be different from the image actually used for OCR depending on the +// value of ocr_devanagari_split_strategy. void Tesseract::PrepareForTessOCR(BLOCK_LIST* block_list, Tesseract* osd_tess, OSResults* osr) { - // Creating blobs to OCR. + // Find the max splitter strategy over all langs. + ShiroRekhaSplitter::SplitStrategy max_ocr_strategy = + static_cast<ShiroRekhaSplitter::SplitStrategy>( + static_cast<inT32>(ocr_devanagari_split_strategy)); + for (int i = 0; i < sub_langs_.size(); ++i) { + ShiroRekhaSplitter::SplitStrategy ocr_strategy = + static_cast<ShiroRekhaSplitter::SplitStrategy>( + static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy)); + if (ocr_strategy > max_ocr_strategy) + max_ocr_strategy = ocr_strategy; + } // Utilize the segmentation information available. splitter_.set_segmentation_block_list(block_list); - splitter_.set_ocr_split_strategy( - (ShiroRekhaSplitter::SplitStrategy) - ((inT32)ocr_devanagari_split_strategy)); - if (splitter_.Split(false)) { - ASSERT_HOST(splitter_.splitted_image()); - splitter_.CopySplittedImageTo(NULL, &pix_binary_); - orig_image_changed_ = true; - // If the split strategies used before pageseg and ocr are the same, the - // segmentation obtained from the second round can be used going forward. - // Otherwise, the page-segmentation (& importantly, the word segmentation) - // of first round is used. - if (splitter_.HasDifferentSplitStrategies()) { - // Refresh the segmentation with new blobs. - BLOCK_LIST new_segmentation; - SegmentPage(NULL, &new_segmentation, osd_tess, osr); - C_BLOB_LIST new_blobs; - ExtractBlobsFromSegmentation(&new_segmentation, &new_blobs); - splitter_.RefreshSegmentationWithNewBlobs(&new_blobs); - } else { - block_list->clear(); - SegmentPage(NULL, block_list, osd_tess, osr); - } + splitter_.set_ocr_split_strategy(max_ocr_strategy); + // Run the splitter for OCR. + bool split_for_ocr = splitter_.Split(false); + // Restore pix_binary to the binarized original pix for future reference. + ASSERT_HOST(splitter_.orig_pix()); + pixDestroy(&pix_binary_); + pix_binary_ = pixClone(splitter_.orig_pix()); + // If the pageseg and ocr strategies are different, refresh the block list + // (from the last SegmentImage call) with blobs from the real image to be used + // for OCR. + if (splitter_.HasDifferentSplitStrategies()) { + BLOCK block("", TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_), + pixGetHeight(pix_binary_)); + Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() : + splitter_.orig_pix(); + extract_edges(pix_for_ocr, &block); + splitter_.RefreshSegmentationWithNewBlobs(block.blob_list()); } + // The splitter isn't needed any more after this, so save memory by clearing. + splitter_.Clear(); } -// Perform steps to prepare underlying binary image/other data structures for -// Cube OCR. -void Tesseract::PrepareForCubeOCR() { - if (orig_image_changed_) { - // Revert to the original image as Cube likes them more.
- splitter_.CopyOriginalImageTo(NULL, &pix_binary_); - orig_image_changed_ = false; - } -} } // namespace tesseract diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index 78ab31e006..11ff7033a5 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -22,14 +22,14 @@ #define TESSERACT_CCMAIN_TESSERACTCLASS_H__ #include "allheaders.h" -#include "genericvector.h" -#include "params.h" -#include "wordrec.h" -#include "ocrclass.h" #include "control.h" #include "docqual.h" #include "devanagari_processing.h" +#include "genericvector.h" +#include "params.h" +#include "ocrclass.h" #include "textord.h" +#include "wordrec.h" class PAGE_RES; class PAGE_RES_IT; @@ -92,11 +92,18 @@ struct OSResults; namespace tesseract { +class ColumnFinder; class CubeLineObject; class CubeObject; class CubeRecoContext; +class EquationDetect; +class Tesseract; class TesseractCubeCombiner; +typedef void (Tesseract::*WordRecognizer)(BLOCK* block, + ROW *row, + WERD_RES *word); + // A collection of various variables for statistics and debugging. struct TesseractStats { TesseractStats() @@ -134,7 +141,16 @@ class Tesseract : public Wordrec { Tesseract(); ~Tesseract(); + // Clear as much used memory as possible without resetting the adaptive + // classifier or losing any other classifier data. void Clear(); + // Clear all memory of adaption for this and all subclassifiers. + void ResetAdaptiveClassifier(); + // Clear the document dictionary for this and all subclassifiers. + void ResetDocumentDictionary(); + + // Set the equation detector. + void SetEquationDetect(EquationDetect* detector); // Simple accessors. const FCOORD& reskew() const { @@ -152,20 +168,40 @@ class Tesseract : public Wordrec { return pix_grey_; } void set_pix_grey(Pix* grey_pix) { + pixDestroy(&pix_grey_); pix_grey_ = grey_pix; } + // Returns a pointer to a Pix representing the best available image of the + // page. The image will be 8-bit grey if the input was grey or color. Note + // that in grey 0 is black and 255 is white. If the input was binary, then + // the returned Pix will be binary. Note that here black is 1 and white is 0. + // To tell the difference pixGetDepth() will return 8 or 1. + // In either case, the return value is a borrowed Pix, and should not be + // deleted or pixDestroyed. + Pix* BestPix() const { + return pix_grey_ != NULL ? 
pix_grey_ : pix_binary_; + } + int source_resolution() const { + return source_resolution_; + } + void set_source_resolution(int ppi) { + source_resolution_ = ppi; + } int ImageWidth() const { return pixGetWidth(pix_binary_); } int ImageHeight() const { return pixGetHeight(pix_binary_); } - - const ShiroRekhaSplitter& splitter() const { - return splitter_; + Pix* scaled_color() const { + return scaled_color_; } - ShiroRekhaSplitter* mutable_splitter() { - return &splitter_; + int scaled_factor() const { + return scaled_factor_; + } + void SetScaledColor(int factor, Pix* color) { + scaled_factor_ = factor; + scaled_color_ = color; } const Textord& textord() const { return textord_; @@ -177,6 +213,12 @@ class Tesseract : public Wordrec { bool right_to_left() const { return right_to_left_; } + int num_sub_langs() const { + return sub_langs_.size(); + } + Tesseract* get_sub_lang(int index) const { + return sub_langs_[index]; + } void SetBlackAndWhitelist(); @@ -194,81 +236,87 @@ class Tesseract : public Wordrec { void PrepareForTessOCR(BLOCK_LIST* block_list, Tesseract* osd_tess, OSResults* osr); - // Perform steps to prepare underlying binary image/other data structures for - // Cube OCR. - void PrepareForCubeOCR(); - int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr); void SetupWordScripts(BLOCK_LIST* blocks); - int AutoPageSeg(int resolution, bool single_column, - bool osd, bool only_osd, + int AutoPageSeg(bool single_column, bool osd, bool only_osd, BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, Tesseract* osd_tess, OSResults* osr); + ColumnFinder* SetupPageSegAndDetectOrientation( + bool single_column, bool osd, bool only_osd, + BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr, + TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, Pix** music_mask_pix); //// control.h ///////////////////////////////////////////////////////// bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box, const char* word_config, int pass); - void recog_all_words(PAGE_RES* page_res, + bool recog_all_words(PAGE_RES* page_res, ETEXT_DESC* monitor, const TBOX* target_word_box, const char* word_config, int dopasses); - void classify_word_pass1( //recog one word - WERD_RES *word, //word to do - ROW *row, - BLOCK* block); + void rejection_passes(PAGE_RES* page_res, + ETEXT_DESC* monitor, + const TBOX* target_word_box, + const char* word_config); + void bigram_correction_pass(PAGE_RES *page_res); + void blamer_pass(PAGE_RES* page_res); + // Helper to recognize the word using the given (language-specific) tesseract. + // Returns true if the result was better than previously. + bool RetryWithLanguage(WERD_RES *word, BLOCK* block, ROW *row, + WordRecognizer recognizer); + void classify_word_and_language(WordRecognizer recognizer, + BLOCK* block, ROW *row, WERD_RES *word); + void classify_word_pass1(BLOCK* block, ROW *row, WERD_RES *word); void recog_pseudo_word(PAGE_RES* page_res, // blocks to check TBOX &selection_box); void fix_rep_char(PAGE_RES_IT* page_res_it); void ExplodeRepeatedWord(BLOB_CHOICE* best_choice, PAGE_RES_IT* page_res_it); - // Callback helper for fix_quotes returns a double quote if both - // arguments are quote, otherwise INVALID_UNICHAR_ID. 
- UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2); - void fix_quotes(WERD_RES* word_res, - BLOB_CHOICE_LIST_CLIST *blob_choices); - ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s, + ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set, + const char *s, const char *lengths); void match_word_pass2( //recog one word WERD_RES *word, //word to do ROW *row, BLOCK* block); - void classify_word_pass2( //word to do - WERD_RES *word, - BLOCK* block, - ROW *row); + void classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word); void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES* word, WERD_RES* new_word); bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row); bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row); BOOL8 recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res); - // Callback helper for fix_hyphens returns UNICHAR_ID of - if both - // arguments are hyphen, otherwise INVALID_UNICHAR_ID. - UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2); - // Callback helper for fix_hyphens returns true if box1 and box2 overlap - // (assuming both on the same textline, are in order and a chopped em dash.) - bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2); - void fix_hyphens(WERD_RES* word_res, - BLOB_CHOICE_LIST_CLIST *blob_choices); void set_word_fonts( WERD_RES *word, // set fonts of this word BLOB_CHOICE_LIST_CLIST *blob_choices); // detailed results - void font_recognition_pass( //good chars in word - PAGE_RES_IT &page_res_it); + void font_recognition_pass(PAGE_RES* page_res); BOOL8 check_debug_pt(WERD_RES *word, int location); //// cube_control.cpp /////////////////////////////////////////////////// bool init_cube_objects(bool load_combiner, TessdataManager *tessdata_manager); - void run_cube(PAGE_RES *page_res); - void cube_recognize(CubeObject *cube_obj, PAGE_RES_IT *page_res_it); + // Iterates through tesseract's results and calls cube on each word, + // combining the results with the existing tesseract result. + void run_cube_combiner(PAGE_RES *page_res); + // Recognizes a single word using (only) cube. Compatible with + // Tesseract's classify_word_pass1/classify_word_pass2. + void cube_word_pass1(BLOCK* block, ROW *row, WERD_RES *word); + // Cube recognizer to recognize a single word as with classify_word_pass1 + // but also returns the cube object in case the combiner is needed. + CubeObject* cube_recognize_word(BLOCK* block, WERD_RES* word); + // Combines the cube and tesseract results for a single word, leaving the + // result in tess_word. + void cube_combine_word(CubeObject* cube_obj, WERD_RES* cube_word, + WERD_RES* tess_word); + // Call cube on the current word, and write the result to word. + // Sets up a fake result and returns false if something goes wrong. 
+ bool cube_recognize(CubeObject *cube_obj, BLOCK* block, WERD_RES *word); void fill_werd_res(const BoxWord& cube_box_word, WERD_CHOICE* cube_werd_choice, const char* cube_best_str, - PAGE_RES_IT *page_res_it); + WERD_RES* tess_werd_res); bool extract_cube_state(CubeObject* cube_obj, int* num_chars, Boxa** char_boxes, CharSamp*** char_samples); bool create_cube_box_word(Boxa *char_boxes, int num_chars, @@ -287,7 +335,11 @@ class Tesseract : public Wordrec { inT16 count_alphanums(const WERD_CHOICE &word); inT16 count_alphas(const WERD_CHOICE &word); //// tessedit.h //////////////////////////////////////////////////////// - void read_config_file(const char *filename, bool init_only); + void read_config_file(const char *filename, SetParamConstraint constraint); + // Initialize for potentially a set of languages defined by the language + // string and recursively any additional languages required by any language + // traineddata file (via tessedit_load_sublangs in its config) that is loaded. + // See init_tesseract_internal for args. int init_tesseract(const char *arg0, const char *textbase, const char *language, @@ -303,6 +355,35 @@ class Tesseract : public Wordrec { return init_tesseract(datapath, NULL, language, oem, NULL, 0, NULL, NULL, false); } + // Common initialization for a single language. + // arg0 is the datapath for the tessdata directory, which could be the + // path of the tessdata directory with no trailing /, or (if tessdata + // lives in the same directory as the executable) the path of the executable, + // hence the name arg0. + // textbase is an optional output file basename (used only for training) + // language is the language code to load. + // oem controls which engine(s) will operate on the image + // configs (argv) is an array of config filenames to load variables from. + // May be NULL. + // configs_size (argc) is the number of elements in configs. + // vars_vec is an optional vector of variables to set. + // vars_values is an optional corresponding vector of values for the variables + // in vars_vec. + // If set_only_non_debug_params is true, only non-debug parameters will be + // set. + int init_tesseract_internal(const char *arg0, + const char *textbase, + const char *language, + OcrEngineMode oem, + char **configs, + int configs_size, + const GenericVector<STRING> *vars_vec, + const GenericVector<STRING> *vars_values, + bool set_only_non_debug_params); + + // Set the universal_id member of each font to be unique among all + // instances of the same font loaded.
+ void SetupUniversalFontIds(); int init_tesseract_lm(const char *arg0, const char *textbase, @@ -321,6 +402,10 @@ class Tesseract : public Wordrec { const GenericVector<STRING> *vars_values, bool set_only_init_params); + void ParseLanguageString(const char* lang_str, + GenericVector<STRING>* to_load, + GenericVector<STRING>* not_to_load); + //// pgedit.h ////////////////////////////////////////////////////////// SVMenuNode *build_menu_new(); void pgeditor_main(int width, int height, PAGE_RES* page_res); @@ -360,8 +445,8 @@ class Tesseract : public Wordrec { inT16 count_alphanums( //how many alphanums WERD_RES *word); void flip_0O(WERD_RES *word); - BOOL8 non_0_digit(UNICHAR_ID unichar_id); - BOOL8 non_O_upper(UNICHAR_ID unichar_id); + BOOL8 non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id); + BOOL8 non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id); BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row); void nn_match_word( //Match a word WERD_RES *word, @@ -372,7 +457,7 @@ class Tesseract : public Wordrec { void set_done( //set done flag WERD_RES *word, inT16 pass); - inT16 safe_dict_word(const WERD_CHOICE &word); + inT16 safe_dict_word(const WERD_RES *werd_res); // is best_choice in dict? void flip_hyphens(WERD_RES *word); void reject_I_1_L(WERD_RES *word); void reject_edge_blobs(WERD_RES *word); @@ -425,10 +510,6 @@ class Tesseract : public Wordrec { void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc); void convert_bad_unlv_chs(WERD_RES *word_res); - // Callback helper for merge_tess_fails returns a space if both - // arguments are space, otherwise INVALID_UNICHAR_ID. - UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2); - void merge_tess_fails(WERD_RES *word_res); void tilde_delete(PAGE_RES_IT &page_res_it); inT16 word_blob_quality(WERD_RES *word, ROW *row); void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, @@ -487,11 +568,13 @@ class Tesseract : public Wordrec { // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: // All fuzzy spaces are removed, and all the words are maximally chopped. - PAGE_RES* SetupApplyBoxes(BLOCK_LIST *block_list); + PAGE_RES* SetupApplyBoxes(const GenericVector<TBOX>& boxes, + BLOCK_LIST *block_list); // Tests the chopper by exhaustively running chop_one_blob. // The word_res will contain filled chopped_word, seam_array, denorm, // box_word and best_state for the maximally chopped word. - void MaximallyChopWord(BLOCK* block, ROW* row, WERD_RES* word_res); + void MaximallyChopWord(const GenericVector<TBOX>& boxes, + BLOCK* block, ROW* row, WERD_RES* word_res); // Gather consecutive blobs that match the given box into the best_state // and corresponding correct_text. // Fights over which box owns which blobs are settled by pre-chopping and // applying the blobs to box or next_box with the least non-overlap. // Returns false if the box was in error, which can only be caused by // failing to find an appropriate blob for a box. // This means that occasionally, blobs may be incorrectly segmented if the // chopper fails to find a suitable chop point.
- bool ResegmentCharBox(PAGE_RES* page_res, + bool ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box, const TBOX& box, const TBOX& next_box, const char* correct_text); // Consume all source blobs that strongly overlap the given box, @@ -602,6 +685,7 @@ class Tesseract : public Wordrec { " a character composed form fragments"); BOOL_VAR_H(tessedit_adaption_debug, false, "Generate and print debug information for adaption"); + INT_VAR_H(bidi_debug, 0, "Debug level for BiDi"); INT_VAR_H(applybox_debug, 1, "Debug level"); INT_VAR_H(applybox_page, 0, "Page number to apply boxes from"); STRING_VAR_H(applybox_exposure_pattern, ".exp", @@ -615,7 +699,7 @@ class Tesseract : public Wordrec { BOOL_VAR_H(applybox_learn_ngrams_mode, false, "Each bounding box is assumed to contain ngrams. Only" " learn the ngrams whose outlines overlap horizontally."); - BOOL_VAR_H(tessedit_draw_outwords, false, "Draw output words"); + BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words"); BOOL_VAR_H(tessedit_training_tess, false, "Call Tess to learn blobs"); BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices"); BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true, @@ -628,6 +712,10 @@ class Tesseract : public Wordrec { "Add words to the document dictionary"); BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char"); BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats"); + BOOL_VAR_H(tessedit_enable_bigram_correction, false, + "Enable correction based on the word bigram dictionary."); + INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram " + "correction."); INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug"); BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk"); STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation"); @@ -649,12 +737,13 @@ class Tesseract : public Wordrec { BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity"); INT_VAR_H(tessedit_test_adaption_mode, 3, "Adaptation decision algorithm for tess"); - BOOL_VAR_H(save_best_choices, false, + BOOL_VAR_H(save_blob_choices, false, "Save the results of the recognition step" " (blob_choices) within the corresponding WERD_CHOICE"); BOOL_VAR_H(test_pt, false, "Test for point"); double_VAR_H(test_pt_x, 99999.99, "xcoord"); double_VAR_H(test_pt_y, 99999.99, "ycoord"); + INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info."); INT_VAR_H(cube_debug_level, 1, "Print cube debug info."); STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines"); STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines"); @@ -788,15 +877,23 @@ class Tesseract : public Wordrec { INT_VAR_H(tessedit_page_number, -1, "-1 -> All pages, else specifc page to process"); BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE"); - BOOL_VAR_H(interactive_mode, false, "Run interactively?"); + BOOL_VAR_H(interactive_display_mode, false, "Run interactively?"); STRING_VAR_H(file_type, ".tif", "Filename extension"); BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word"); INT_VAR_H(tessdata_manager_debug_level, 0, "Debug level for TessdataManager functions."); + STRING_VAR_H(tessedit_load_sublangs, "", + "List of languages to load with this one"); // Min acceptable orientation margin (difference in scores between top and 2nd // choice in OSResults::orientations) to believe the page orientation. 
- double_VAR_H(min_orientation_margin, 12.0, + double_VAR_H(min_orientation_margin, 7.0, "Min acceptable orientation margin"); + BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding"); + BOOL_VAR_H(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model"); + BOOL_VAR_H(tessedit_init_config_only, false, + "Only initialize with the config file. Useful if the instance is " + "not going to be used for OCR but say only for layout analysis."); + BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector"); //// ambigsrecog.cpp ///////////////////////////////////////////////////////// FILE *init_recog_training(const STRING &fname); @@ -819,25 +916,40 @@ class Tesseract : public Wordrec { const char* backup_config_file_; // The filename of a config file to read when processing a debug word. STRING word_config_; + // Image used for input to layout analysis and tesseract recognition. + // May be modified by the ShiroRekhaSplitter to eliminate the top-line. Pix* pix_binary_; + // Unmodified image used for input to cube. Always valid. Pix* cube_binary_; + // Grey-level input image if the input was not binary, otherwise NULL. Pix* pix_grey_; + // Input image resolution after any scaling. The resolution is not well + // transmitted by operations on Pix, so we keep an independent record here. + int source_resolution_; // The shiro-rekha splitter object which is used to split top-lines in // Devanagari words to provide a better word and grapheme segmentation. ShiroRekhaSplitter splitter_; - // The boolean records if the currently set - // pix_binary_ member has been modified due to any processing so that this - // may hurt Cube's recognition phase. - bool orig_image_changed_; // Page segmentation/layout Textord textord_; // True if the primary language uses right_to_left reading order. bool right_to_left_; + Pix* scaled_color_; + int scaled_factor_; FCOORD deskew_; FCOORD reskew_; TesseractStats stats_; + // Sub-languages to be tried in addition to this. + GenericVector<Tesseract*> sub_langs_; + // Most recently used Tesseract out of this and sub_langs_. The default + // language for the next word. + Tesseract* most_recently_used_; + // The size of the font table, i.e. max possible font id + 1. + int font_table_size_; // Cube objects. CubeRecoContext* cube_cntxt_; TesseractCubeCombiner *tess_cube_combiner_; + // Equation detector. Note: this pointer is NOT owned by the class. + EquationDetect* equ_detect_; }; } // namespace tesseract diff --git a/ccmain/tfacepp.cpp b/ccmain/tfacepp.cpp index 8f7c74f47a..d845932883 100644 --- a/ccmain/tfacepp.cpp +++ b/ccmain/tfacepp.cpp @@ -23,6 +23,8 @@ #pragma warning(disable:4800) // int/bool warnings #endif +#include + #include "mfcpch.h" #ifdef __UNIX__ #include @@ -58,7 +60,7 @@ void Tesseract::recog_word(WERD_RES *word, (word->best_choice->length() != blob_choices->length())) { tprintf("recog_word ASSERT FAIL String:\"%s\"; " "Strlen=%d; #Blobs=%d; #Choices=%d\n", - word->best_choice->debug_string(unicharset).string(), + word->best_choice->debug_string().string(), word->best_choice->length(), word->box_word->length(), blob_choices->length()); } @@ -128,7 +130,7 @@ void Tesseract::recog_word_recursive(WERD_RES *word, word->raw_choice->append_unichar_id(space_id, 1, 0.0, word->raw_choice->certainty()); } - word->raw_choice->populate_unichars(unicharset); + word->raw_choice->populate_unichars(); } // Do sanity checks and minor fixes on best_choice.
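// Context for the hunks below (illustrative aside, not part of the patch):
// split_and_recog_word() cuts an over-long word at the widest blob gap
// (best_end), recognizes the two halves recursively, and glues the results
// back together. The added ASSERT_HOST(best_end->next != NULL) guards
// against an empty second half before word2 is constructed, and the new
// BlamerBundle code divides any ground-truth boxes between the two halves.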
@@ -162,7 +164,7 @@ void Tesseract::recog_word_recursive(WERD_RES *word, word->best_choice->append_unichar_id(space_id, 1, 0.0, word->best_choice->certainty()); } - word->best_choice->populate_unichars(unicharset); + word->best_choice->populate_unichars(); } } @@ -198,6 +200,7 @@ void Tesseract::split_and_recog_word(WERD_RES *word, prev_blob = blob; } ASSERT_HOST(best_end != NULL); + ASSERT_HOST(best_end->next != NULL); // Make a copy of the word to put the 2nd half in. WERD_RES* word2 = new WERD_RES(*word); @@ -211,6 +214,67 @@ void Tesseract::split_and_recog_word(WERD_RES *word, free_seam_list(word->seam_array); word->seam_array = start_seam_list(word->chopped_word->blobs); word2->seam_array = start_seam_list(word2->chopped_word->blobs); + BlamerBundle *orig_bb = word->blamer_bundle; + STRING blamer_debug; + // Try to adjust truth information. + if (orig_bb != NULL) { + // Find truth boxes that correspond to the split in the blobs. + int b; + int begin2_truth_index = -1; + if (orig_bb->incorrect_result_reason != IRR_NO_TRUTH && + orig_bb->truth_has_char_boxes) { + int end1_x = best_end->bounding_box().right(); + int begin2_x = word2->chopped_word->blobs->bounding_box().left(); + blamer_debug = "Looking for truth split at"; + blamer_debug.add_str_int(" end1_x ", end1_x); + blamer_debug.add_str_int(" begin2_x ", begin2_x); + blamer_debug += "\nnorm_truth_word boxes:\n"; + if (orig_bb->norm_truth_word.length() > 1) { + orig_bb->norm_truth_word.BlobBox(0).append_debug(&blamer_debug); + for (b = 1; b < orig_bb->norm_truth_word.length(); ++b) { + orig_bb->norm_truth_word.BlobBox(b).append_debug(&blamer_debug); + if ((abs(end1_x - orig_bb->norm_truth_word.BlobBox(b-1).right()) < + orig_bb->norm_box_tolerance) && + (abs(begin2_x - orig_bb->norm_truth_word.BlobBox(b).left()) < + orig_bb->norm_box_tolerance)) { + begin2_truth_index = b; + blamer_debug += "Split found\n"; + break; + } + } + } + } + // Populate truth information in word and word2 with the first and second + // part of the original truth. + word->blamer_bundle = new BlamerBundle(); + word2->blamer_bundle = new BlamerBundle(); + if (begin2_truth_index > 0) { + word->blamer_bundle->truth_has_char_boxes = true; + word->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance; + word2->blamer_bundle->truth_has_char_boxes = true; + word2->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance; + BlamerBundle *curr_bb = word->blamer_bundle; + for (b = 0; b < orig_bb->norm_truth_word.length(); ++b) { + if (b == begin2_truth_index) curr_bb = word2->blamer_bundle; + curr_bb->norm_truth_word.InsertBox( + b, orig_bb->norm_truth_word.BlobBox(b)); + curr_bb->truth_word.InsertBox(b, orig_bb->truth_word.BlobBox(b)); + curr_bb->truth_text.push_back(orig_bb->truth_text[b]); + } + } else if (orig_bb->incorrect_result_reason == IRR_NO_TRUTH) { + word->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH; + word2->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH; + } else { + blamer_debug += "Truth split not found"; + blamer_debug += orig_bb->truth_has_char_boxes ? + "\n" : " (no truth char boxes)\n"; + word->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug, + NULL, wordrec_debug_blamer); + word2->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug, + NULL, wordrec_debug_blamer); + } + } + // Recognize the first part of the word. recog_word_recursive(word, blob_choices); // Recognize the second part of the word. @@ -239,6 +303,75 @@ void Tesseract::split_and_recog_word(WERD_RES *word, // Append the word choices. 
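// (Aside, illustrative only: the alt-choice block below bounds the cross
// product of alternatives from the two halves. Pairs with i, j <=
// kAltsPerPiece are always fused; beyond that, new combinations are added
// only while the total stays under kTooManyAltChoices, so two halves with
// many alternatives each cannot blow up the choice list.)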
*word->best_choice += *word2->best_choice; *word->raw_choice += *word2->raw_choice; + + // How many alt choices from each should we try to get? + const int kAltsPerPiece = 2; + // When do we start throwing away extra alt choices? + const int kTooManyAltChoices = 100; + + if (word->alt_choices.size() > 0 && word2->alt_choices.size() > 0) { + // Construct the Cartesian product of the alt choices of word and word2. + int num_first_alt_choices = word->alt_choices.size(); + // Nota Bene: For the main loop here, we leave in place the first piece's + // own alt_choices in + // word->alt_choices[0] .. word->alt_choices[num_first_alt_choices - 1]. + // These will get fused with the best choices for word2 below. + for (int j = 1; j < word2->alt_choices.size() && + (j <= kAltsPerPiece || word->alt_choices.size() < kTooManyAltChoices); + j++) { + for (int i = 0; i < num_first_alt_choices && + (i <= kAltsPerPiece || + word->alt_choices.size() < kTooManyAltChoices); + i++) { + WERD_CHOICE *wc = new WERD_CHOICE(*word->alt_choices[i]); + *wc += *word2->alt_choices[j]; + word->alt_choices.push_back(wc); + + word->alt_states.push_back(GenericVector<int>()); + GenericVector<int> &alt_state = word->alt_states.back(); + alt_state += word->alt_states[i]; + alt_state += word2->alt_states[j]; + } + } + // Now that we've filled in as many alternates as we want, paste the best + // choice for word2 onto the original word alt_choices. + for (int i = 0; i < num_first_alt_choices; i++) { + *word->alt_choices[i] += *word2->alt_choices[0]; + word->alt_states[i] += word2->alt_states[0]; + } + } + + // Restore the pointer to original blamer bundle and combine blamer + // information recorded in the splits. + if (orig_bb != NULL) { + IncorrectResultReason irr = orig_bb->incorrect_result_reason; + if (irr != IRR_NO_TRUTH_SPLIT) blamer_debug = ""; + if (word->blamer_bundle->incorrect_result_reason != IRR_CORRECT && + word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH && + word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) { + blamer_debug += "Blame from part 1: "; + blamer_debug += word->blamer_bundle->debug; + irr = word->blamer_bundle->incorrect_result_reason; + } + if (word2->blamer_bundle->incorrect_result_reason != IRR_CORRECT && + word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH && + word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) { + blamer_debug += "Blame from part 2: "; + blamer_debug += word2->blamer_bundle->debug; + if (irr == IRR_CORRECT) { + irr = word2->blamer_bundle->incorrect_result_reason; + } else if (irr != word2->blamer_bundle->incorrect_result_reason) { + irr = IRR_UNKNOWN; + } + } + delete word->blamer_bundle; + word->blamer_bundle = orig_bb; + word->blamer_bundle->incorrect_result_reason = irr; + if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) { + word->blamer_bundle->SetBlame(irr, blamer_debug, NULL, + wordrec_debug_blamer); + } + } delete word2; } diff --git a/ccmain/thresholder.cpp b/ccmain/thresholder.cpp index f8ae740d1b..c4b76d9560 100644 --- a/ccmain/thresholder.cpp +++ b/ccmain/thresholder.cpp @@ -33,7 +33,7 @@ ImageThresholder::ImageThresholder() image_data_(NULL), image_width_(0), image_height_(0), image_bytespp_(0), image_bytespl_(0), - scale_(1), yres_(300) { + scale_(1), yres_(300), estimated_res_(300) { SetRectangle(0, 0, 0, 0); } @@ -79,7 +79,7 @@ void ImageThresholder::SetImage(const unsigned char* imagedata, image_bytespp_ = bytes_per_pixel; image_bytespl_ = bytes_per_line; scale_ = 1; - yres_ = 300; + estimated_res_ = yres_ = 300; Init(); } @@ 
-129,7 +129,7 @@ void ImageThresholder::SetImage(const Pix* pix) { image_bytespp_ = depth / 8; image_bytespl_ = pixGetWpl(pix_) * sizeof(l_uint32); scale_ = 1; - yres_ = pixGetYRes(src); + estimated_res_ = yres_ = pixGetYRes(src); Init(); } diff --git a/ccmain/thresholder.h b/ccmain/thresholder.h index 7e21e259ab..a22cf0c561 100644 --- a/ccmain/thresholder.h +++ b/ccmain/thresholder.h @@ -79,12 +79,33 @@ class ImageThresholder { int GetScaleFactor() const { return scale_; } + + // Set the resolution of the source image in pixels per inch. + // This should be called right after SetImage(), and will let us return + // appropriate font sizes for the text. + void SetSourceYResolution(int ppi) { + yres_ = ppi; + estimated_res_ = ppi; + } int GetSourceYResolution() const { return yres_; } int GetScaledYResolution() const { return scale_ * yres_; } + // Set the resolution of the source image in pixels per inch, as estimated + // by the thresholder from the text size found during thresholding. + // This value will be used to set internal size thresholds during recognition + // and will not influence the output "point size." The default value is + // the same as the source resolution. (yres_) + void SetEstimatedResolution(int ppi) { + estimated_res_ = ppi; + } + // Returns the estimated resolution, including any active scaling. + // This value will be used to set internal size thresholds during recognition. + int GetScaledEstimatedResolution() const { + return scale_ * estimated_res_; + } /// Pix vs raw, which to use? /// Implementations should provide the ability to source and target Pix @@ -157,7 +178,8 @@ class ImageThresholder { int image_bytespl_; //< Bytes per line of source image/pix. // Limits of image rectangle to be processed. int scale_; //< Scale factor from original image. - int yres_; //< y pixels/inch in source image + int yres_; //< y pixels/inch in source image. + int estimated_res_; //< Resolution estimate from text size. int rect_left_; int rect_top_; int rect_width_; @@ -167,4 +189,3 @@ class ImageThresholder { } // namespace tesseract. #endif // TESSERACT_CCMAIN_THRESHOLDER_H__ -
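// A minimal usage sketch for the new resolution plumbing (illustrative,
// not part of the patch):
//   ImageThresholder thresholder;
//   thresholder.SetImage(pix);               // resets yres_ from the Pix
//   thresholder.SetSourceYResolution(300);   // caller-supplied true ppi
//   thresholder.SetEstimatedResolution(250); // refined during thresholding
//   int ppi = thresholder.GetScaledEstimatedResolution();
//   // ppi == scale_ * 250: used only for internal size thresholds; the
//   // reported "point size" still derives from yres_.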