Skip to content

Commit

Permalink
Merge pull request #245 from tfmorris/result_renderer_tsv
Browse files Browse the repository at this point in the history
Add TSV result renderer. Fixes #18
  • Loading branch information
zdenop committed Mar 3, 2016
2 parents 2597296 + fc80cea commit d55f5fb
Show file tree
Hide file tree
Showing 9 changed files with 194 additions and 2 deletions.
120 changes: 120 additions & 0 deletions api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1417,6 +1417,19 @@ static void AddBoxTohOCR(const ResultIterator *it,
*hocr_str += "\">";
}

static void AddBoxToTSV(const PageIterator *it,
PageIteratorLevel level,
STRING* hocr_str) {
int left, top, right, bottom;
it->BoundingBox(level, &left, &top, &right, &bottom);
hocr_str->add_str_int("\t", left);
hocr_str->add_str_int("\t", top);
hocr_str->add_str_int("\t", right - left + 1);
hocr_str->add_str_int("\t", bottom - top + 1);
}



/**
* Make a HTML-formatted string with hOCR markup from the internal
* data structures.
Expand Down Expand Up @@ -1601,6 +1614,113 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
return ret;
}

/**
* Make a TSV-formatted string from the internal data structures.
* page_number is 0-based but will appear in the output as 1-based.
*/
char* TessBaseAPI::GetTSVText(int page_number) {
if (tesseract_ == NULL ||
(page_res_ == NULL && Recognize(NULL) < 0))
return NULL;

int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
int page_id = page_number + 1; // we use 1-based page numbers.

STRING tsv_str("");

int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, word_num = 0;

tsv_str.add_str_int("1\t", page_num); // level 1 - page
tsv_str.add_str_int("\t", block_num);
tsv_str.add_str_int("\t", par_num);
tsv_str.add_str_int("\t", line_num);
tsv_str.add_str_int("\t", word_num);
tsv_str.add_str_int("\t", rect_left_);
tsv_str.add_str_int("\t", rect_top_);
tsv_str.add_str_int("\t", rect_width_);
tsv_str.add_str_int("\t", rect_height_);
tsv_str += "\t-1\t\n";

ResultIterator *res_it = GetIterator();
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
continue;
}

// Add rows for any new block/paragraph/textline.
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
block_num++, par_num = 0, line_num = 0, word_num = 0;
tsv_str.add_str_int("2\t", page_num); // level 2 - block
tsv_str.add_str_int("\t", block_num);
tsv_str.add_str_int("\t", par_num);
tsv_str.add_str_int("\t", line_num);
tsv_str.add_str_int("\t", word_num);
AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str);
tsv_str += "\t-1\t\n"; // end of row for block
}
if (res_it->IsAtBeginningOf(RIL_PARA)) {
par_num++, line_num = 0, word_num = 0;
tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph
tsv_str.add_str_int("\t", block_num);
tsv_str.add_str_int("\t", par_num);
tsv_str.add_str_int("\t", line_num);
tsv_str.add_str_int("\t", word_num);
AddBoxToTSV(res_it, RIL_PARA, &tsv_str);
tsv_str += "\t-1\t\n"; // end of row for para
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
line_num++, word_num = 0;
tsv_str.add_str_int("4\t", page_num); // level 4 - line
tsv_str.add_str_int("\t", block_num);
tsv_str.add_str_int("\t", par_num);
tsv_str.add_str_int("\t", line_num);
tsv_str.add_str_int("\t", word_num);
AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str);
tsv_str += "\t-1\t\n"; // end of row for line
}

// Now, process the word...
int left, top, right, bottom;
bool bold, italic, underlined, monospace, serif, smallcaps;
int pointsize, font_id;
const char *font_name;
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
&monospace, &serif, &smallcaps,
&pointsize, &font_id);
word_num++;
tsv_str.add_str_int("5\t", page_num); // level 5 - word
tsv_str.add_str_int("\t", block_num);
tsv_str.add_str_int("\t", par_num);
tsv_str.add_str_int("\t", line_num);
tsv_str.add_str_int("\t", word_num);
tsv_str.add_str_int("\t", left);
tsv_str.add_str_int("\t", top);
tsv_str.add_str_int("\t", right - left + 1);
tsv_str.add_str_int("\t", bottom - top + 1);
tsv_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
tsv_str += "\t";

// Increment counts if at end of block/paragraph/textline.
if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) lcnt++;
if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) pcnt++;
if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++;

do {
tsv_str += res_it->GetUTF8Text(RIL_SYMBOL);
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
tsv_str += "\n"; // end of row
wcnt++;
}

char *ret = new char[tsv_str.length() + 1];
strcpy(ret, tsv_str.string());
delete res_it;
return ret;
}

/**
 * Count of integers output for each box: the usual 4 box values plus a
 * page number.
 */
const int kNumbersPerBlob = 5;
/**
Expand Down
6 changes: 6 additions & 0 deletions api/baseapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,12 @@ class TESS_API TessBaseAPI {
*/
char* GetHOCRText(int page_number);

/**
* Make a TSV-formatted string from the internal data structures.
* page_number is 0-based but will appear in the output as 1-based.
* Returns NULL when the API is not initialized or recognition fails.
* Otherwise the returned string is allocated with new[] and must be
* freed with the delete [] operator.
*/
char* GetTSVText(int page_number);

/**
* The recognized text is returned as a char* which is coded in the same
* format as a box file used in training. Returned string must be freed with
Expand Down
33 changes: 33 additions & 0 deletions api/renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,39 @@ bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
return true;
}

/**********************************************************************
* TSV Text Renderer interface implementation
**********************************************************************/
/**
 * Constructs a TSV renderer for the given output base with the font
 * information flag turned off. "tsv" is forwarded to the base renderer
 * (presumably as the output file extension - confirm in TessResultRenderer).
 */
TessTsvRenderer::TessTsvRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "tsv"),
      font_info_(false) {
}

/**
 * Constructs a TSV renderer for the given output base, recording the
 * caller's choice for the font information flag. "tsv" is forwarded to the
 * base renderer (presumably as the output file extension - confirm in
 * TessResultRenderer).
 */
TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "tsv"),
      font_info_(font_info) {
}

/**
 * Writes the tab-separated column heading row at the start of the document.
 * Always reports success.
 */
bool TessTsvRenderer::BeginDocumentHandler() {
  // Hierarchy columns first, then geometry, confidence and text.
  static const char kTsvColumnHeadings[] =
      "level\tpage_num\tblock_num\tpar_num\tline_num\tword_num"
      "\tleft\ttop\twidth\theight\tconf\ttext\n";
  AppendString(kTsvColumnHeadings);
  return true;
}

// Nothing is appended when the document ends; always reports success.
bool TessTsvRenderer::EndDocumentHandler() {
return true;
}

/**
 * Appends the TSV rows for the current image to the output.
 * Returns false when the API yields no TSV text (NULL), true otherwise.
 */
bool TessTsvRenderer::AddImageHandler(TessBaseAPI* api) {
  char* page_rows = api->GetTSVText(imagenum());
  if (page_rows != NULL) {
    AppendString(page_rows);
    // GetTSVText returns an array allocated with new[]; release it here.
    delete[] page_rows;
    return true;
  }
  return false;
}

/**********************************************************************
* UNLV Text Renderer interface implementation
**********************************************************************/
Expand Down
17 changes: 17 additions & 0 deletions api/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,23 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer {
bool font_info_; // whether to print font information
};

/**
* Renders Tesseract output into a TSV string
*
* A heading row is written when the document begins, and the TSV text
* obtained from the API is appended for each image.
*/
class TESS_API TessTsvRenderer : public TessResultRenderer {
public:
// Creates a renderer for outputbase, storing the given font_info flag.
explicit TessTsvRenderer(const char *outputbase, bool font_info);
// Creates a renderer for outputbase with the font_info flag set to false.
explicit TessTsvRenderer(const char *outputbase);

protected:
// Writes the TSV column heading row.
virtual bool BeginDocumentHandler();
// Appends the TSV text for the current image; false if none is available.
virtual bool AddImageHandler(TessBaseAPI* api);
// Writes nothing; always returns true.
virtual bool EndDocumentHandler();

private:
// NOTE(review): set by the constructors, but no reader of this flag is
// visible in the TSV handlers - confirm whether TSV output should use it.
bool font_info_; // whether to print font information
};

/**
* Renders tesseract output into searchable PDF
*/
Expand Down
10 changes: 10 additions & 0 deletions api/tesseractmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,14 @@ void PreloadRenderers(tesseract::TessBaseAPI* api,
new tesseract::TessHOcrRenderer(outputbase, font_info));
}

api->GetBoolVariable("tessedit_create_tsv", &b);
if (b) {
bool font_info;
api->GetBoolVariable("hocr_font_info", &font_info);
renderers->push_back(
new tesseract::TessTsvRenderer(outputbase, font_info));
}

api->GetBoolVariable("tessedit_create_pdf", &b);
if (b) {
renderers->push_back(new tesseract::TessPDFRenderer(outputbase,
Expand Down Expand Up @@ -422,6 +430,8 @@ int main(int argc, char **argv) {

tesseract::PointerVector<tesseract::TessResultRenderer> renderers;



if (in_training_mode) {
renderers.push_back(NULL);
} else {
Expand Down
3 changes: 3 additions & 0 deletions ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,8 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
this->params()),
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
this->params()),
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
this->params()),
STRING_MEMBER(unrecognised_char, "|",
Expand Down Expand Up @@ -510,6 +512,7 @@ Tesseract::Tesseract()
"Page separator (default is form feed control character)",
this->params()),


// The following parameters were deprecated and removed from their
// original
// locations. The parameters are temporarily kept here to give Tesseract
Expand Down
1 change: 1 addition & 0 deletions ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -1003,6 +1003,7 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
STRING_VAR_H(unrecognised_char, "|",
"Output char for unidentified blobs");
Expand Down
4 changes: 2 additions & 2 deletions tessdata/configs/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
datadir = @datadir@/tessdata/configs
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr quiet logfile digits hocr linebox pdf rebox strokewidth bigram txt
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr quiet logfile digits hocr linebox pdf rebox strokewidth bigram txt
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr quiet logfile digits hocr tsv linebox pdf rebox strokewidth bigram txt
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr quiet logfile digits hocr tsv linebox pdf rebox strokewidth bigram txt
2 changes: 2 additions & 0 deletions tessdata/configs/tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
tessedit_create_tsv 1
tessedit_pageseg_mode 1

0 comments on commit d55f5fb

Please sign in to comment.