Major change to improve layout analysis for heavily diacritic languages (Tha, Vie, Kan, Tel, etc.):

There is a new overlap detector that detects when diacritics cause a big increase in textline overlap. In such cases, diacritics from the overlap regions are kept out of layout analysis completely, allowing textline formation to happen without them. At the end of layout analysis, the diacritics are assigned to 0, 1 or 2 nearby words, reusing and modifying an old noise-detection data path. The stored diacritics are then used or not during recognition, depending on how well the character classifier rates them.
stweil · May 12, 2015 · 0e868ef · 0e868ef
1 parent b6d0184
commit 0e868ef
Show file tree

Hide file tree

Showing 34 changed files with 1,856 additions and 744 deletions.
diff --git a/ccmain/control.cpp b/ccmain/control.cpp
diff --git a/ccmain/fixspace.cpp b/ccmain/fixspace.cpp
@@ -205,8 +205,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
     if ((!word->part_of_combo) && (word->box_word == NULL)) {
       WordData word_data(block, row, word);
       SetupWordPassN(2, &word_data);
-      classify_word_and_language(&Tesseract::classify_word_pass2, NULL,
-                                 &word_data);
+      classify_word_and_language(2, NULL, &word_data);
     }
     prev_word_best_choice_ = word->best_choice;
   }

diff --git a/ccmain/pageiterator.cpp b/ccmain/pageiterator.cpp
@@ -26,15 +26,23 @@
 
 namespace tesseract {
 
-PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
-                           int scale, int scaled_yres,
-                           int rect_left, int rect_top,
+PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale,
+                           int scaled_yres, int rect_left, int rect_top,
                            int rect_width, int rect_height)
-  : page_res_(page_res), tesseract_(tesseract),
-    word_(NULL), word_length_(0), blob_index_(0), cblob_it_(NULL),
-    scale_(scale), scaled_yres_(scaled_yres),
-    rect_left_(rect_left), rect_top_(rect_top),
-    rect_width_(rect_width), rect_height_(rect_height) {
+    : page_res_(page_res),
+      tesseract_(tesseract),
+      word_(NULL),
+      word_length_(0),
+      blob_index_(0),
+      cblob_it_(NULL),
+      include_upper_dots_(false),
+      include_lower_dots_(false),
+      scale_(scale),
+      scaled_yres_(scaled_yres),
+      rect_left_(rect_left),
+      rect_top_(rect_top),
+      rect_width_(rect_width),
+      rect_height_(rect_height) {
   it_ = new PAGE_RES_IT(page_res);
   PageIterator::Begin();
 }
@@ -50,19 +58,29 @@ PageIterator::~PageIterator() {
  * objects at a higher level.
  */
 PageIterator::PageIterator(const PageIterator& src)
-  : page_res_(src.page_res_), tesseract_(src.tesseract_),
-    word_(NULL), word_length_(src.word_length_),
-    blob_index_(src.blob_index_), cblob_it_(NULL),
-    scale_(src.scale_), scaled_yres_(src.scaled_yres_),
-    rect_left_(src.rect_left_), rect_top_(src.rect_top_),
-    rect_width_(src.rect_width_), rect_height_(src.rect_height_) {
+    : page_res_(src.page_res_),
+      tesseract_(src.tesseract_),
+      word_(NULL),
+      word_length_(src.word_length_),
+      blob_index_(src.blob_index_),
+      cblob_it_(NULL),
+      include_upper_dots_(src.include_upper_dots_),
+      include_lower_dots_(src.include_lower_dots_),
+      scale_(src.scale_),
+      scaled_yres_(src.scaled_yres_),
+      rect_left_(src.rect_left_),
+      rect_top_(src.rect_top_),
+      rect_width_(src.rect_width_),
+      rect_height_(src.rect_height_) {
   it_ = new PAGE_RES_IT(*src.it_);
   BeginWord(src.blob_index_);
 }
 
 const PageIterator& PageIterator::operator=(const PageIterator& src) {
   page_res_ = src.page_res_;
   tesseract_ = src.tesseract_;
+  include_upper_dots_ = src.include_upper_dots_;
+  include_lower_dots_ = src.include_lower_dots_;
   scale_ = src.scale_;
   scaled_yres_ = src.scaled_yres_;
   rect_left_ = src.rect_left_;
@@ -252,16 +270,19 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
   PARA *para = NULL;
   switch (level) {
     case RIL_BLOCK:
-      box = it_->block()->block->bounding_box();
+      box = it_->block()->block->restricted_bounding_box(include_upper_dots_,
+                                                         include_lower_dots_);
       break;
     case RIL_PARA:
       para = it_->row()->row->para();
       // explicit fall-through.
     case RIL_TEXTLINE:
-      box = it_->row()->row->bounding_box();
+      box = it_->row()->row->restricted_bounding_box(include_upper_dots_,
+                                                     include_lower_dots_);
       break;
     case RIL_WORD:
-      box = it_->word()->word->bounding_box();
+      box = it_->word()->word->restricted_bounding_box(include_upper_dots_,
+                                                       include_lower_dots_);
       break;
     case RIL_SYMBOL:
       if (cblob_it_ == NULL)
@@ -387,39 +408,23 @@ Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
   int left, top, right, bottom;
   if (!BoundingBoxInternal(level, &left, &top, &right, &bottom))
     return NULL;
-  Pix* pix = NULL;
-  switch (level) {
-    case RIL_BLOCK:
-    case RIL_PARA:
-      int bleft, btop, bright, bbottom;
-      BoundingBoxInternal(RIL_BLOCK, &bleft, &btop, &bright, &bbottom);
-      pix = it_->block()->block->render_mask();
-      // AND the mask and the image.
-      pixRasterop(pix, 0, 0, pixGetWidth(pix), pixGetHeight(pix),
-                  PIX_SRC & PIX_DST, tesseract_->pix_binary(),
-                  bleft, btop);
-      if (level == RIL_PARA) {
-        // RIL_PARA needs further attention:
-        //   clip the paragraph from the block mask.
-        Box* box = boxCreate(left - bleft, top - btop,
-                             right - left, bottom - top);
-        Pix* pix2 = pixClipRectangle(pix, box, NULL);
-        boxDestroy(&box);
-        pixDestroy(&pix);
-        pix = pix2;
-      }
-      break;
-    case RIL_TEXTLINE:
-    case RIL_WORD:
-    case RIL_SYMBOL:
-      if (level == RIL_SYMBOL && cblob_it_ != NULL &&
-          cblob_it_->data()->area() != 0)
-        return cblob_it_->data()->render();
-      // Just clip from the bounding box.
-      Box* box = boxCreate(left, top, right - left, bottom - top);
-      pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL);
-      boxDestroy(&box);
-      break;
+  if (level == RIL_SYMBOL && cblob_it_ != NULL &&
+      cblob_it_->data()->area() != 0)
+    return cblob_it_->data()->render();
+  Box* box = boxCreate(left, top, right - left, bottom - top);
+  Pix* pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL);
+  boxDestroy(&box);
+  if (level == RIL_BLOCK || level == RIL_PARA) {
+    // Clip to the block polygon as well.
+    TBOX mask_box;
+    Pix* mask = it_->block()->block->render_mask(&mask_box);
+    int mask_x = left - mask_box.left();
+    int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
+    // AND the mask and pix, putting the result in pix.
+    pixRasterop(pix, MAX(0, -mask_x), MAX(0, -mask_y), pixGetWidth(pix),
+                pixGetHeight(pix), PIX_SRC & PIX_DST, mask, MAX(0, mask_x),
+                MAX(0, mask_y));
+    pixDestroy(&mask);
   }
   return pix;
 }
@@ -452,17 +457,24 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
   Box* box = boxCreate(*left, *top, right - *left, bottom - *top);
   Pix* grey_pix = pixClipRectangle(original_img, box, NULL);
   boxDestroy(&box);
-  if (level == RIL_BLOCK) {
-    Pix* mask = it_->block()->block->render_mask();
-    Pix* expanded_mask = pixCreate(right - *left, bottom - *top, 1);
-    pixRasterop(expanded_mask, padding, padding,
-                pixGetWidth(mask), pixGetHeight(mask),
-                PIX_SRC, mask, 0, 0);
+  if (level == RIL_BLOCK || level == RIL_PARA) {
+    // Clip to the block polygon as well.
+    TBOX mask_box;
+    Pix* mask = it_->block()->block->render_mask(&mask_box);
+    // Copy the mask registered correctly into an image the size of grey_pix.
+    int mask_x = *left - mask_box.left();
+    int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
+    int width = pixGetWidth(grey_pix);
+    int height = pixGetHeight(grey_pix);
+    Pix* resized_mask = pixCreate(width, height, 1);
+    pixRasterop(resized_mask, MAX(0, -mask_x), MAX(0, -mask_y), width, height,
+                PIX_SRC, mask, MAX(0, mask_x), MAX(0, mask_y));
     pixDestroy(&mask);
-    pixDilateBrick(expanded_mask, expanded_mask, 2*padding + 1, 2*padding + 1);
-    pixInvert(expanded_mask, expanded_mask);
-    pixSetMasked(grey_pix, expanded_mask, MAX_UINT32);
-    pixDestroy(&expanded_mask);
+    pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1,
+                   2 * padding + 1);
+    pixInvert(resized_mask, resized_mask);
+    pixSetMasked(grey_pix, resized_mask, MAX_UINT32);
+    pixDestroy(&resized_mask);
   }
   return grey_pix;
 }

diff --git a/ccmain/pageiterator.h b/ccmain/pageiterator.h
@@ -179,6 +179,21 @@ class TESS_API PageIterator {
   // If an image rectangle has been set in the API, then returned coordinates
   // relate to the original (full) image, rather than the rectangle.
 
+  /**
+   * Controls what to include in a bounding box. Bounding boxes of all levels
+   * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
+   * Between layout analysis and recognition, it isn't known where all
+   * diacritics belong, so this control is used to include or exclude some
+   * diacritics that are above or below the main body of the word. In most cases
+   * where the placement is obvious, and after recognition, it doesn't make as
+   * much difference, as the diacritics will already be included in the word.
+   */
+  void SetBoundingBoxComponents(bool include_upper_dots,
+                                bool include_lower_dots) {
+    include_upper_dots_ = include_upper_dots;
+    include_lower_dots_ = include_lower_dots;
+  }
+
   /**
    * Returns the bounding rectangle of the current object at the given level.
    * See comment on coordinate system above.
@@ -332,6 +347,9 @@ class TESS_API PageIterator {
    * Owned by this ResultIterator.
    */
   C_BLOB_IT* cblob_it_;
+  /** Control over what to include in bounding boxes. */
+  bool include_upper_dots_;
+  bool include_lower_dots_;
   /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
   int scale_;
   int scaled_yres_;

diff --git a/ccmain/pagesegmain.cpp b/ccmain/pagesegmain.cpp
@@ -134,12 +134,20 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
     // UNLV file present. Use PSM_SINGLE_BLOCK.
     pageseg_mode = PSM_SINGLE_BLOCK;
   }
+  // The diacritic_blobs holds noise blobs that may be diacritics. They
+  // are separated out on areas of the image that seem noisy and short-circuit
+  // the layout process, going straight from the initial partition creation
+  // right through to after word segmentation, where they are added to the
+  // rej_cblobs list of the most appropriate word. From there classification
+  // will determine whether they are used.
+  BLOBNBOX_LIST diacritic_blobs;
   int auto_page_seg_ret_val = 0;
   TO_BLOCK_LIST to_blocks;
   if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
       PSM_SPARSE(pageseg_mode)) {
-    auto_page_seg_ret_val =
-        AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr);
+    auto_page_seg_ret_val = AutoPageSeg(
+        pageseg_mode, blocks, &to_blocks,
+        enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
     if (pageseg_mode == PSM_OSD_ONLY)
       return auto_page_seg_ret_val;
     // To create blobs from the image region bounds uncomment this line:
@@ -171,7 +179,7 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
 
   textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
                        pix_thresholds_, pix_grey_, splitting || cjk_mode,
-                       blocks, &to_blocks);
+                       &diacritic_blobs, blocks, &to_blocks);
   return auto_page_seg_ret_val;
 }
 
@@ -197,7 +205,6 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
   pixDestroy(&grey_pix);
 }
 
-
 /**
  * Auto page segmentation. Divide the page image into blocks of uniform
  * text linespacing and images.
@@ -207,19 +214,25 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
  * The output goes in the blocks list with corresponding TO_BLOCKs in the
  * to_blocks list.
  *
- * If single_column is true, then no attempt is made to divide the image
- * into columns, but multiple blocks are still made if the text is of
- * non-uniform linespacing.
+ * If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide
+ * the image into columns, but multiple blocks are still made if the text is
+ * of non-uniform linespacing.
+ *
+ * If diacritic_blobs is non-null, then diacritics/noise blobs, that would
+ * confuse layout analysis by causing textline overlap, are placed there,
+ * with the expectation that they will be reassigned to words later and
+ * noise/diacriticness determined via classification.
  *
  * If osd (orientation and script detection) is true then that is performed
  * as well. If only_osd is true, then only orientation and script detection is
 * performed. If osd is desired, (osd or only_osd) then osd_tess must be
  * another Tesseract that was initialized especially for osd, and the results
  * will be output into osr (orientation and script result).
  */
-int Tesseract::AutoPageSeg(PageSegMode pageseg_mode,
-                           BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
-                           Tesseract* osd_tess, OSResults* osr) {
+int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
+                           TO_BLOCK_LIST* to_blocks,
+                           BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess,
+                           OSResults* osr) {
   if (textord_debug_images) {
     WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
   }
@@ -247,10 +260,9 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode,
     if (equ_detect_) {
       finder->SetEquationDetect(equ_detect_);
     }
-    result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
-                                to_block, photomask_pix,
-                                pix_thresholds_, pix_grey_,
-                                &found_blocks, to_blocks);
+    result = finder->FindBlocks(
+        pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix,
+        pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks);
     if (result >= 0)
       finder->GetDeskewVectors(&deskew_, &reskew_);
     delete finder;

diff --git a/ccmain/pgedit.cpp b/ccmain/pgedit.cpp
@@ -655,7 +655,8 @@ void show_point(PAGE_RES* page_res, float x, float y) {
   FCOORD pt(x, y);
   PAGE_RES_IT pr_it(page_res);
 
-  char msg[160];
+  const int kBufsize = 512;
+  char msg[kBufsize];
   char *msg_ptr = msg;
 
   msg_ptr += sprintf(msg_ptr, "Pt:(%0.3f, %0.3f) ", x, y);

diff --git a/ccmain/recogtraining.cpp b/ccmain/recogtraining.cpp
@@ -207,8 +207,7 @@ void Tesseract::ambigs_classify_and_output(const char *label,
   fflush(stdout);
   WordData word_data(*pr_it);
   SetupWordPassN(1, &word_data);
-  classify_word_and_language(&Tesseract::classify_word_pass1,
-                             pr_it, &word_data);
+  classify_word_and_language(1, pr_it, &word_data);
   WERD_RES* werd_res = word_data.word;
   WERD_CHOICE *best_choice = werd_res->best_choice;
   ASSERT_HOST(best_choice != NULL);