Skip to content

Commit

Permalink
Major change to improve layout analysis for heavily diacritic languages:
Browse files Browse the repository at this point in the history
Tha, Vie, Kan, Tel etc.
There is a new overlap detector that detects when diacritics
cause a big increase in textline overlap. In such cases, diacritics from
overlap regions are kept separate from layout analysis completely, allowing
textline formation to happen without them. The diacritics are then assigned
to 0, 1 or 2 close words at the end of layout analysis, using and modifying
an old noise detection data path.
The stored diacritics are used or not during recognition according to the
character classifier's liking for them.
  • Loading branch information
theraysmith committed May 12, 2015
1 parent b6d0184 commit 0e868ef
Show file tree
Hide file tree
Showing 34 changed files with 1,856 additions and 744 deletions.
378 changes: 370 additions & 8 deletions ccmain/control.cpp

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions ccmain/fixspace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,8 +205,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
if ((!word->part_of_combo) && (word->box_word == NULL)) {
WordData word_data(block, row, word);
SetupWordPassN(2, &word_data);
classify_word_and_language(&Tesseract::classify_word_pass2, NULL,
&word_data);
classify_word_and_language(2, NULL, &word_data);
}
prev_word_best_choice_ = word->best_choice;
}
Expand Down
132 changes: 72 additions & 60 deletions ccmain/pageiterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,23 @@

namespace tesseract {

PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
int scale, int scaled_yres,
int rect_left, int rect_top,
PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale,
int scaled_yres, int rect_left, int rect_top,
int rect_width, int rect_height)
: page_res_(page_res), tesseract_(tesseract),
word_(NULL), word_length_(0), blob_index_(0), cblob_it_(NULL),
scale_(scale), scaled_yres_(scaled_yres),
rect_left_(rect_left), rect_top_(rect_top),
rect_width_(rect_width), rect_height_(rect_height) {
: page_res_(page_res),
tesseract_(tesseract),
word_(NULL),
word_length_(0),
blob_index_(0),
cblob_it_(NULL),
include_upper_dots_(false),
include_lower_dots_(false),
scale_(scale),
scaled_yres_(scaled_yres),
rect_left_(rect_left),
rect_top_(rect_top),
rect_width_(rect_width),
rect_height_(rect_height) {
it_ = new PAGE_RES_IT(page_res);
PageIterator::Begin();
}
Expand All @@ -50,19 +58,29 @@ PageIterator::~PageIterator() {
* objects at a higher level.
*/
PageIterator::PageIterator(const PageIterator& src)
: page_res_(src.page_res_), tesseract_(src.tesseract_),
word_(NULL), word_length_(src.word_length_),
blob_index_(src.blob_index_), cblob_it_(NULL),
scale_(src.scale_), scaled_yres_(src.scaled_yres_),
rect_left_(src.rect_left_), rect_top_(src.rect_top_),
rect_width_(src.rect_width_), rect_height_(src.rect_height_) {
: page_res_(src.page_res_),
tesseract_(src.tesseract_),
word_(NULL),
word_length_(src.word_length_),
blob_index_(src.blob_index_),
cblob_it_(NULL),
include_upper_dots_(src.include_upper_dots_),
include_lower_dots_(src.include_lower_dots_),
scale_(src.scale_),
scaled_yres_(src.scaled_yres_),
rect_left_(src.rect_left_),
rect_top_(src.rect_top_),
rect_width_(src.rect_width_),
rect_height_(src.rect_height_) {
it_ = new PAGE_RES_IT(*src.it_);
BeginWord(src.blob_index_);
}

const PageIterator& PageIterator::operator=(const PageIterator& src) {
page_res_ = src.page_res_;
tesseract_ = src.tesseract_;
include_upper_dots_ = src.include_upper_dots_;
include_lower_dots_ = src.include_lower_dots_;
scale_ = src.scale_;
scaled_yres_ = src.scaled_yres_;
rect_left_ = src.rect_left_;
Expand Down Expand Up @@ -252,16 +270,19 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
PARA *para = NULL;
switch (level) {
case RIL_BLOCK:
box = it_->block()->block->bounding_box();
box = it_->block()->block->restricted_bounding_box(include_upper_dots_,
include_lower_dots_);
break;
case RIL_PARA:
para = it_->row()->row->para();
// explicit fall-through.
case RIL_TEXTLINE:
box = it_->row()->row->bounding_box();
box = it_->row()->row->restricted_bounding_box(include_upper_dots_,
include_lower_dots_);
break;
case RIL_WORD:
box = it_->word()->word->bounding_box();
box = it_->word()->word->restricted_bounding_box(include_upper_dots_,
include_lower_dots_);
break;
case RIL_SYMBOL:
if (cblob_it_ == NULL)
Expand Down Expand Up @@ -387,39 +408,23 @@ Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
int left, top, right, bottom;
if (!BoundingBoxInternal(level, &left, &top, &right, &bottom))
return NULL;
Pix* pix = NULL;
switch (level) {
case RIL_BLOCK:
case RIL_PARA:
int bleft, btop, bright, bbottom;
BoundingBoxInternal(RIL_BLOCK, &bleft, &btop, &bright, &bbottom);
pix = it_->block()->block->render_mask();
// AND the mask and the image.
pixRasterop(pix, 0, 0, pixGetWidth(pix), pixGetHeight(pix),
PIX_SRC & PIX_DST, tesseract_->pix_binary(),
bleft, btop);
if (level == RIL_PARA) {
// RIL_PARA needs further attention:
// clip the paragraph from the block mask.
Box* box = boxCreate(left - bleft, top - btop,
right - left, bottom - top);
Pix* pix2 = pixClipRectangle(pix, box, NULL);
boxDestroy(&box);
pixDestroy(&pix);
pix = pix2;
}
break;
case RIL_TEXTLINE:
case RIL_WORD:
case RIL_SYMBOL:
if (level == RIL_SYMBOL && cblob_it_ != NULL &&
cblob_it_->data()->area() != 0)
return cblob_it_->data()->render();
// Just clip from the bounding box.
Box* box = boxCreate(left, top, right - left, bottom - top);
pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL);
boxDestroy(&box);
break;
if (level == RIL_SYMBOL && cblob_it_ != NULL &&
cblob_it_->data()->area() != 0)
return cblob_it_->data()->render();
Box* box = boxCreate(left, top, right - left, bottom - top);
Pix* pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL);
boxDestroy(&box);
if (level == RIL_BLOCK || level == RIL_PARA) {
// Clip to the block polygon as well.
TBOX mask_box;
Pix* mask = it_->block()->block->render_mask(&mask_box);
int mask_x = left - mask_box.left();
int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
// AND the mask and pix, putting the result in pix.
pixRasterop(pix, MAX(0, -mask_x), MAX(0, -mask_y), pixGetWidth(pix),
pixGetHeight(pix), PIX_SRC & PIX_DST, mask, MAX(0, mask_x),
MAX(0, mask_y));
pixDestroy(&mask);
}
return pix;
}
Expand Down Expand Up @@ -452,17 +457,24 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
Box* box = boxCreate(*left, *top, right - *left, bottom - *top);
Pix* grey_pix = pixClipRectangle(original_img, box, NULL);
boxDestroy(&box);
if (level == RIL_BLOCK) {
Pix* mask = it_->block()->block->render_mask();
Pix* expanded_mask = pixCreate(right - *left, bottom - *top, 1);
pixRasterop(expanded_mask, padding, padding,
pixGetWidth(mask), pixGetHeight(mask),
PIX_SRC, mask, 0, 0);
if (level == RIL_BLOCK || level == RIL_PARA) {
// Clip to the block polygon as well.
TBOX mask_box;
Pix* mask = it_->block()->block->render_mask(&mask_box);
// Copy the mask registered correctly into an image the size of grey_pix.
int mask_x = *left - mask_box.left();
int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
int width = pixGetWidth(grey_pix);
int height = pixGetHeight(grey_pix);
Pix* resized_mask = pixCreate(width, height, 1);
pixRasterop(resized_mask, MAX(0, -mask_x), MAX(0, -mask_y), width, height,
PIX_SRC, mask, MAX(0, mask_x), MAX(0, mask_y));
pixDestroy(&mask);
pixDilateBrick(expanded_mask, expanded_mask, 2*padding + 1, 2*padding + 1);
pixInvert(expanded_mask, expanded_mask);
pixSetMasked(grey_pix, expanded_mask, MAX_UINT32);
pixDestroy(&expanded_mask);
pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1,
2 * padding + 1);
pixInvert(resized_mask, resized_mask);
pixSetMasked(grey_pix, resized_mask, MAX_UINT32);
pixDestroy(&resized_mask);
}
return grey_pix;
}
Expand Down
18 changes: 18 additions & 0 deletions ccmain/pageiterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,21 @@ class TESS_API PageIterator {
// If an image rectangle has been set in the API, then returned coordinates
// relate to the original (full) image, rather than the rectangle.

/**
* Controls what to include in a bounding box. Bounding boxes of all levels
* between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
* Between layout analysis and recognition, it isn't known where all
* diacritics belong, so this control is used to include or exclude some
* diacritics that are above or below the main body of the word. In most cases
* where the placement is obvious, and after recognition, it doesn't make as
* much difference, as the diacritics will already be included in the word.
*/
void SetBoundingBoxComponents(bool include_upper_dots,
bool include_lower_dots) {
include_upper_dots_ = include_upper_dots;
include_lower_dots_ = include_lower_dots;
}

/**
* Returns the bounding rectangle of the current object at the given level.
* See comment on coordinate system above.
Expand Down Expand Up @@ -332,6 +347,9 @@ class TESS_API PageIterator {
* Owned by this ResultIterator.
*/
C_BLOB_IT* cblob_it_;
/** Control over what to include in bounding boxes. */
bool include_upper_dots_;
bool include_lower_dots_;
/** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
int scale_;
int scaled_yres_;
Expand Down
40 changes: 26 additions & 14 deletions ccmain/pagesegmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,20 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
// UNLV file present. Use PSM_SINGLE_BLOCK.
pageseg_mode = PSM_SINGLE_BLOCK;
}
// The diacritic_blobs holds noise blobs that may be diacritics. They
// are separated out on areas of the image that seem noisy and short-circuit
// the layout process, going straight from the initial partition creation
// right through to after word segmentation, where they are added to the
// rej_cblobs list of the most appropriate word. From there classification
// will determine whether they are used.
BLOBNBOX_LIST diacritic_blobs;
int auto_page_seg_ret_val = 0;
TO_BLOCK_LIST to_blocks;
if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
PSM_SPARSE(pageseg_mode)) {
auto_page_seg_ret_val =
AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr);
auto_page_seg_ret_val = AutoPageSeg(
pageseg_mode, blocks, &to_blocks,
enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
if (pageseg_mode == PSM_OSD_ONLY)
return auto_page_seg_ret_val;
// To create blobs from the image region bounds uncomment this line:
Expand Down Expand Up @@ -171,7 +179,7 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,

textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
pix_thresholds_, pix_grey_, splitting || cjk_mode,
blocks, &to_blocks);
&diacritic_blobs, blocks, &to_blocks);
return auto_page_seg_ret_val;
}

Expand All @@ -197,7 +205,6 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
pixDestroy(&grey_pix);
}


/**
* Auto page segmentation. Divide the page image into blocks of uniform
* text linespacing and images.
Expand All @@ -207,19 +214,25 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
* The output goes in the blocks list with corresponding TO_BLOCKs in the
* to_blocks list.
*
* If single_column is true, then no attempt is made to divide the image
* into columns, but multiple blocks are still made if the text is of
* non-uniform linespacing.
* If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide
* the image into columns, but multiple blocks are still made if the text is
* of non-uniform linespacing.
*
* If diacritic_blobs is non-null, then diacritics/noise blobs, that would
* confuse layout anaylsis by causing textline overlap, are placed there,
* with the expectation that they will be reassigned to words later and
* noise/diacriticness determined via classification.
*
* If osd (orientation and script detection) is true then that is performed
* as well. If only_osd is true, then only orientation and script detection is
* performed. If osd is desired, (osd or only_osd) then osr_tess must be
* another Tesseract that was initialized especially for osd, and the results
* will be output into osr (orientation and script result).
*/
int Tesseract::AutoPageSeg(PageSegMode pageseg_mode,
BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
Tesseract* osd_tess, OSResults* osr) {
int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
TO_BLOCK_LIST* to_blocks,
BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess,
OSResults* osr) {
if (textord_debug_images) {
WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
}
Expand Down Expand Up @@ -247,10 +260,9 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode,
if (equ_detect_) {
finder->SetEquationDetect(equ_detect_);
}
result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
to_block, photomask_pix,
pix_thresholds_, pix_grey_,
&found_blocks, to_blocks);
result = finder->FindBlocks(
pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix,
pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks);
if (result >= 0)
finder->GetDeskewVectors(&deskew_, &reskew_);
delete finder;
Expand Down
3 changes: 2 additions & 1 deletion ccmain/pgedit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,8 @@ void show_point(PAGE_RES* page_res, float x, float y) {
FCOORD pt(x, y);
PAGE_RES_IT pr_it(page_res);

char msg[160];
const int kBufsize = 512;
char msg[kBufsize];
char *msg_ptr = msg;

msg_ptr += sprintf(msg_ptr, "Pt:(%0.3f, %0.3f) ", x, y);
Expand Down
3 changes: 1 addition & 2 deletions ccmain/recogtraining.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,7 @@ void Tesseract::ambigs_classify_and_output(const char *label,
fflush(stdout);
WordData word_data(*pr_it);
SetupWordPassN(1, &word_data);
classify_word_and_language(&Tesseract::classify_word_pass1,
pr_it, &word_data);
classify_word_and_language(1, pr_it, &word_data);
WERD_RES* werd_res = word_data.word;
WERD_CHOICE *best_choice = werd_res->best_choice;
ASSERT_HOST(best_choice != NULL);
Expand Down
Loading

0 comments on commit 0e868ef

Please sign in to comment.