diff --git a/api/baseapi.cpp b/api/baseapi.cpp
index f59363f939..ab3cb2fd56 100644
--- a/api/baseapi.cpp
+++ b/api/baseapi.cpp
@@ -1417,6 +1417,36 @@ static void AddBoxTohOCR(const ResultIterator *it,
   *hocr_str += "\">";
 }
 
+/**
+ * Appends the tab-separated key columns shared by every TSV row:
+ * level, page_num, block_num, par_num, line_num and word_num.
+ */
+static void AddRowKeyToTSV(int level, int page_num, int block_num,
+                           int par_num, int line_num, int word_num,
+                           STRING* tsv_str) {
+  tsv_str->add_str_int("", level);
+  tsv_str->add_str_int("\t", page_num);
+  tsv_str->add_str_int("\t", block_num);
+  tsv_str->add_str_int("\t", par_num);
+  tsv_str->add_str_int("\t", line_num);
+  tsv_str->add_str_int("\t", word_num);
+}
+
+/**
+ * Appends the left, top, width and height columns for the bounding box of
+ * the element at the given level of the iterator's current position.
+ */
+static void AddBoxToTSV(const PageIterator *it,
+                        PageIteratorLevel level,
+                        STRING* tsv_str) {
+  int left, top, right, bottom;
+  it->BoundingBox(level, &left, &top, &right, &bottom);
+  tsv_str->add_str_int("\t", left);
+  tsv_str->add_str_int("\t", top);
+  tsv_str->add_str_int("\t", right - left + 1);
+  tsv_str->add_str_int("\t", bottom - top + 1);
+}
+
 /**
  * Make a HTML-formatted string with hOCR markup from the internal
  * data structures.
@@ -1601,6 +1631,102 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
   return ret;
 }
 
+/**
+ * Make a TSV-formatted string from the internal data structures.
+ * page_number is 0-based but will appear in the output as 1-based.
+ * Every row has the tab-separated columns level, page_num, block_num,
+ * par_num, line_num, word_num, left, top, width, height, conf and text.
+ * Page, block, paragraph and line rows (levels 1-4) have conf -1 and no
+ * text; word rows (level 5) carry the word confidence and its text.
+ * Returns NULL if tesseract_ is NULL or Recognize fails. The returned
+ * string must be freed with the delete [] operator.
+ */
+char* TessBaseAPI::GetTSVText(int page_number) {
+  if (tesseract_ == NULL ||
+      (page_res_ == NULL && Recognize(NULL) < 0))
+    return NULL;
+
+  const int page_num = page_number + 1;  // we use 1-based page numbers.
+  int block_num = 0, par_num = 0, line_num = 0, word_num = 0;
+
+  STRING tsv_str("");
+
+  // level 1 - page
+  AddRowKeyToTSV(1, page_num, block_num, par_num, line_num, word_num,
+                 &tsv_str);
+  tsv_str.add_str_int("\t", rect_left_);
+  tsv_str.add_str_int("\t", rect_top_);
+  tsv_str.add_str_int("\t", rect_width_);
+  tsv_str.add_str_int("\t", rect_height_);
+  tsv_str += "\t-1\t\n";
+
+  ResultIterator *res_it = GetIterator();
+  while (!res_it->Empty(RIL_BLOCK)) {
+    if (res_it->Empty(RIL_WORD)) {
+      res_it->Next(RIL_WORD);
+      continue;
+    }
+
+    // Add rows for any new block/paragraph/textline.
+    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
+      block_num++;
+      par_num = 0;
+      line_num = 0;
+      word_num = 0;
+      // level 2 - block
+      AddRowKeyToTSV(2, page_num, block_num, par_num, line_num, word_num,
+                     &tsv_str);
+      AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str);
+      tsv_str += "\t-1\t\n";  // end of row for block
+    }
+    if (res_it->IsAtBeginningOf(RIL_PARA)) {
+      par_num++;
+      line_num = 0;
+      word_num = 0;
+      // level 3 - paragraph
+      AddRowKeyToTSV(3, page_num, block_num, par_num, line_num, word_num,
+                     &tsv_str);
+      AddBoxToTSV(res_it, RIL_PARA, &tsv_str);
+      tsv_str += "\t-1\t\n";  // end of row for para
+    }
+    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+      line_num++;
+      word_num = 0;
+      // level 4 - line
+      AddRowKeyToTSV(4, page_num, block_num, par_num, line_num, word_num,
+                     &tsv_str);
+      AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str);
+      tsv_str += "\t-1\t\n";  // end of row for line
+    }
+
+    // Now, process the word...
+    word_num++;
+    // level 5 - word
+    AddRowKeyToTSV(5, page_num, block_num, par_num, line_num, word_num,
+                   &tsv_str);
+    AddBoxToTSV(res_it, RIL_WORD, &tsv_str);
+    tsv_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
+    tsv_str += "\t";
+
+    // GetUTF8Text returns a new[]-allocated string that the caller owns,
+    // so free each symbol after appending it.
+    do {
+      const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
+      if (grapheme != NULL) {
+        tsv_str += grapheme;
+        delete[] grapheme;
+      }
+      res_it->Next(RIL_SYMBOL);
+    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
+    tsv_str += "\n";  // end of row
+  }
+  delete res_it;
+
+  char *ret = new char[tsv_str.length() + 1];
+  strcpy(ret, tsv_str.string());
+  return ret;
+}
+
 /** The 5 numbers output for each box (the usual 4 and a page number.) */
 const int kNumbersPerBlob = 5;
 /**
diff --git a/api/baseapi.h b/api/baseapi.h
index d75fde9485..c84784a31c 100644
--- a/api/baseapi.h
+++ b/api/baseapi.h
@@ -602,6 +602,13 @@ class TESS_API TessBaseAPI {
    */
   char* GetHOCRText(int page_number);
 
+  /**
+   * Make a TSV-formatted string from the internal data structures.
+   * page_number is 0-based but will appear in the output as 1-based.
+   * Returned string must be freed with the delete [] operator.
+   */
+  char* GetTSVText(int page_number);
+
   /**
    * The recognized text is returned as a char* which is coded in the same
    * format as a box file used in training. 
Returned string must be freed with
diff --git a/api/renderer.cpp b/api/renderer.cpp
index 5050a232a1..172ef49a83 100644
--- a/api/renderer.cpp
+++ b/api/renderer.cpp
@@ -179,6 +179,39 @@ bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
   return true;
 }
 
+/**********************************************************************
+ * TSV Text Renderer interface implementation
+ **********************************************************************/
+TessTsvRenderer::TessTsvRenderer(const char *outputbase)
+    : TessResultRenderer(outputbase, "tsv"), font_info_(false) {
+}
+
+TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
+    : TessResultRenderer(outputbase, "tsv"), font_info_(font_info) {
+}
+
+bool TessTsvRenderer::BeginDocumentHandler() {
+  // Output TSV column headings
+  AppendString("level\tpage_num\tblock_num\tpar_num\tline_num\tword_num"
+               "\tleft\ttop\twidth\theight\tconf\ttext\n");
+  return true;
+}
+
+bool TessTsvRenderer::EndDocumentHandler() {
+  return true;  // nothing is appended after the last page's rows
+}
+
+bool TessTsvRenderer::AddImageHandler(TessBaseAPI* api) {
+  // GetTSVText returns NULL on failure, otherwise a new[]-allocated string.
+  char* tsv = api->GetTSVText(imagenum());
+  if (tsv == NULL) return false;
+
+  AppendString(tsv);
+  delete[] tsv;
+
+  return true;
+}
+
 /**********************************************************************
  * UNLV Text Renderer interface implementation
  **********************************************************************/
diff --git a/api/renderer.h b/api/renderer.h
index 4120f74eb3..6b47813f7b 100644
--- a/api/renderer.h
+++ b/api/renderer.h
@@ -162,6 +162,23 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer {
   bool font_info_;              // whether to print font information
 };
 
+/**
+ * Renders Tesseract output into a TSV string
+ */
+class TESS_API TessTsvRenderer : public TessResultRenderer {
+ public:
+  explicit TessTsvRenderer(const char *outputbase, bool font_info);
+  explicit TessTsvRenderer(const char *outputbase);
+
+ protected:
+  virtual bool BeginDocumentHandler();
+  virtual bool AddImageHandler(TessBaseAPI* api);
+  virtual bool EndDocumentHandler();
+
+ private:
+  bool font_info_;  // whether to print font information (currently unused)
+};
+
 /**
  * Renders tesseract output into searchable PDF
  */
diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp
index db73d72174..9405711350 100644
--- a/api/tesseractmain.cpp
+++ b/api/tesseractmain.cpp
@@ -299,6 +299,14 @@ void PreloadRenderers(tesseract::TessBaseAPI* api,
           new tesseract::TessHOcrRenderer(outputbase, font_info));
     }
 
+    api->GetBoolVariable("tessedit_create_tsv", &b);
+    if (b) {
+      bool font_info = false;  // stays false if the variable lookup fails
+      api->GetBoolVariable("hocr_font_info", &font_info);
+      renderers->push_back(
+          new tesseract::TessTsvRenderer(outputbase, font_info));
+    }
+
     api->GetBoolVariable("tessedit_create_pdf", &b);
     if (b) {
       renderers->push_back(new tesseract::TessPDFRenderer(outputbase,
diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp
index 04a4b1b0e1..8db50fbd54 100644
--- a/ccmain/tesseractclass.cpp
+++ b/ccmain/tesseractclass.cpp
@@ -385,6 +385,8 @@ Tesseract::Tesseract()
                   this->params()),
       BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
                   this->params()),
+      BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
+                  this->params()),
       BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
                   this->params()),
       STRING_MEMBER(unrecognised_char, "|",
@@ -510,6 +512,7 @@ Tesseract::Tesseract()
                     "Page separator (default is form feed control character)",
                     this->params()),
 
+
       // The following parameters were deprecated and removed from their
       // original
       // locations. 
The parameters are temporarily kept here to give Tesseract diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index b6976a2dda..91d25bc8ae 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -1003,6 +1003,7 @@ class Tesseract : public Wordrec { BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file"); BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file"); BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file"); + BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file"); BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file"); STRING_VAR_H(unrecognised_char, "|", "Output char for unidentified blobs"); diff --git a/tessdata/configs/Makefile.am b/tessdata/configs/Makefile.am index 5715ef741d..a4f1d675c1 100644 --- a/tessdata/configs/Makefile.am +++ b/tessdata/configs/Makefile.am @@ -1,3 +1,3 @@ datadir = @datadir@/tessdata/configs -data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr quiet logfile digits hocr linebox pdf rebox strokewidth bigram txt -EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr quiet logfile digits hocr linebox pdf rebox strokewidth bigram txt +data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr quiet logfile digits hocr tsv linebox pdf rebox strokewidth bigram txt +EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr quiet logfile digits hocr tsv linebox pdf rebox strokewidth bigram txt diff --git a/tessdata/configs/tsv b/tessdata/configs/tsv new file mode 100644 index 0000000000..11cd6fc97a --- /dev/null +++ b/tessdata/configs/tsv @@ -0,0 +1,2 @@ +tessedit_create_tsv 1 +tessedit_pageseg_mode 1