Skip to content

Commit

Permalink
Cleanup TSV renderer
Browse files Browse the repository at this point in the history
Remove all references to hocr, hocr.tsv, etc. Remove dead code for font
info, input filename, and HTML escapes. Improve comments. Fix
indentation.
  • Loading branch information
tfmorris committed Mar 1, 2016
1 parent 858f4b7 commit 6700edd
Show file tree
Hide file tree
Showing 9 changed files with 83 additions and 125 deletions.
159 changes: 59 additions & 100 deletions api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1417,7 +1417,7 @@ static void AddBoxTohOCR(const ResultIterator *it,
*hocr_str += "\">";
}

static void AddBoxTohOCRTSV(const PageIterator *it,
static void AddBoxToTSV(const PageIterator *it,
PageIteratorLevel level,
STRING* hocr_str) {
int left, top, right, bottom;
Expand Down Expand Up @@ -1615,57 +1615,31 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
}

/**
* Make a TSV-formatted string with hOCR markup from the internal
* data structures.
* Make a TSV-formatted string from the internal data structures.
* page_number is 0-based but will appear in the output as 1-based.
* Image name/input_file_ can be set by SetInputName before calling
* GetHOCRText
* STL removed from original patch submission and refactored by rays.
*/
char* TessBaseAPI::GetHOCRTSVText(int page_number) {
char* TessBaseAPI::GetTSVText(int page_number) {
if (tesseract_ == NULL ||
(page_res_ == NULL && Recognize(NULL) < 0))
return NULL;

int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
bool font_info = false;
GetBoolVariable("hocr_font_info", &font_info);

STRING hocr_str("");
int page_id = page_number + 1; // we use 1-based page numbers.

if (input_file_ == NULL)
SetInputName(NULL);

#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
NULL, NULL);
wchar_t *uni16_str = new WCHAR[str16_len];
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
uni16_str, str16_len);
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL,
NULL, NULL, NULL);
char *utf8_str = new char[utf8_len];
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
utf8_len, NULL, NULL);
*input_file_ = utf8_str;
delete[] uni16_str;
delete[] utf8_str;
#endif
STRING tsv_str("");

int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, word_num = 0;

hocr_str.add_str_int("1\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
hocr_str.add_str_int("\t", rect_left_);
hocr_str.add_str_int("\t", rect_top_);
hocr_str.add_str_int("\t", rect_width_);
hocr_str.add_str_int("\t", rect_height_);
hocr_str += "\t-1\t\n";
tsv_str.add_str_int("1\t", page_num); // level 1 - page
tsv_str.add_str_int("\t", block_num);
tsv_str.add_str_int("\t", par_num);
tsv_str.add_str_int("\t", line_num);
tsv_str.add_str_int("\t", word_num);
tsv_str.add_str_int("\t", rect_left_);
tsv_str.add_str_int("\t", rect_top_);
tsv_str.add_str_int("\t", rect_width_);
tsv_str.add_str_int("\t", rect_height_);
tsv_str += "\t-1\t\n";

ResultIterator *res_it = GetIterator();
while (!res_it->Empty(RIL_BLOCK)) {
Expand All @@ -1674,36 +1648,36 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
continue;
}

// Open any new block/paragraph/textline.
// Add rows for any new block/paragraph/textline.
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
block_num++, par_num = 0, line_num = 0, word_num = 0;
hocr_str.add_str_int("2\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
AddBoxTohOCRTSV(res_it, RIL_BLOCK, &hocr_str);
hocr_str += "\t-1\t\n";
tsv_str.add_str_int("2\t", page_num); // level 2 - block
tsv_str.add_str_int("\t", block_num);
tsv_str.add_str_int("\t", par_num);
tsv_str.add_str_int("\t", line_num);
tsv_str.add_str_int("\t", word_num);
AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str);
tsv_str += "\t-1\t\n"; // end of row for block
}
if (res_it->IsAtBeginningOf(RIL_PARA)) {
par_num++, line_num = 0, word_num = 0;
hocr_str.add_str_int("3\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
AddBoxTohOCRTSV(res_it, RIL_PARA, &hocr_str);
hocr_str += "\t-1\t\n";
tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph
tsv_str.add_str_int("\t", block_num);
tsv_str.add_str_int("\t", par_num);
tsv_str.add_str_int("\t", line_num);
tsv_str.add_str_int("\t", word_num);
AddBoxToTSV(res_it, RIL_PARA, &tsv_str);
tsv_str += "\t-1\t\n"; // end of row for para
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
line_num++, word_num = 0;
hocr_str.add_str_int("4\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
AddBoxTohOCRTSV(res_it, RIL_TEXTLINE, &hocr_str);
hocr_str += "\t-1\t\n";
tsv_str.add_str_int("4\t", page_num); // level 4 - line
tsv_str.add_str_int("\t", block_num);
tsv_str.add_str_int("\t", par_num);
tsv_str.add_str_int("\t", line_num);
tsv_str.add_str_int("\t", word_num);
AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str);
tsv_str += "\t-1\t\n"; // end of row for line
}

// Now, process the word...
Expand All @@ -1715,49 +1689,34 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
&monospace, &serif, &smallcaps,
&pointsize, &font_id);
word_num++;
hocr_str.add_str_int("5\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
hocr_str.add_str_int("\t", left);
hocr_str.add_str_int("\t", top);
hocr_str.add_str_int("\t", right - left + 1);
hocr_str.add_str_int("\t", bottom - top + 1);
hocr_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
hocr_str += "\t";
word_num++;
tsv_str.add_str_int("5\t", page_num); // level 5 - word
tsv_str.add_str_int("\t", block_num);
tsv_str.add_str_int("\t", par_num);
tsv_str.add_str_int("\t", line_num);
tsv_str.add_str_int("\t", word_num);
tsv_str.add_str_int("\t", left);
tsv_str.add_str_int("\t", top);
tsv_str.add_str_int("\t", right - left + 1);
tsv_str.add_str_int("\t", bottom - top + 1);
tsv_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
tsv_str += "\t";

// Increment counts if at end of block/paragraph/textline.
if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) lcnt++;
if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) pcnt++;
if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++;

do {
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
// if (grapheme && grapheme[0] != 0) {
// if (grapheme[1] == 0) {
// hocr_str += HOcrEscape(grapheme);
// } else {
hocr_str += grapheme;
// }
// }
delete []grapheme;
tsv_str += res_it->GetUTF8Text(RIL_SYMBOL);
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
hocr_str += "\n";
tsv_str += "\n"; // end of row
wcnt++;
// Close any ending block/paragraph/textline.
if (last_word_in_line) {
lcnt++;
}
if (last_word_in_para) {
pcnt++;
}
if (last_word_in_block) {
bcnt++;
}
}

char *ret = new char[hocr_str.length() + 1];
strcpy(ret, hocr_str.string());
char *ret = new char[tsv_str.length() + 1];
strcpy(ret, tsv_str.string());
delete res_it;
return ret;
}
Expand Down
6 changes: 2 additions & 4 deletions api/baseapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -603,12 +603,10 @@ class TESS_API TessBaseAPI {
char* GetHOCRText(int page_number);

/**
* Make a TSV-formatted string with hOCR markup from the internal
* data structures.
* Make a TSV-formatted string from the internal data structures.
* page_number is 0-based but will appear in the output as 1-based.
*/
char* GetHOCRTSVText(int page_number);

char* GetTSVText(int page_number);

/**
* The recognized text is returned as a char* which is coded in the same
Expand Down
23 changes: 12 additions & 11 deletions api/renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,31 +182,32 @@ bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
/**********************************************************************
* TSV Text Renderer interface implementation
**********************************************************************/
TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase)
: TessResultRenderer(outputbase, "hocr.tsv") {
TessTsvRenderer::TessTsvRenderer(const char *outputbase)
: TessResultRenderer(outputbase, "tsv") {
font_info_ = false;
}

TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase, bool font_info)
: TessResultRenderer(outputbase, "hocr.tsv") {
TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
: TessResultRenderer(outputbase, "tsv") {
font_info_ = font_info;
}

bool TessHOcrTsvRenderer::BeginDocumentHandler() {
bool TessTsvRenderer::BeginDocumentHandler() {
// Output TSV column headings
AppendString("level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n");
return true;
}

bool TessHOcrTsvRenderer::EndDocumentHandler() {
bool TessTsvRenderer::EndDocumentHandler() {
return true;
}

bool TessHOcrTsvRenderer::AddImageHandler(TessBaseAPI* api) {
char* hocrtsv = api->GetHOCRTSVText(imagenum());
if (hocrtsv == NULL) return false;
bool TessTsvRenderer::AddImageHandler(TessBaseAPI* api) {
char* tsv = api->GetTSVText(imagenum());
if (tsv == NULL) return false;

AppendString(hocrtsv);
delete[] hocrtsv;
AppendString(tsv);
delete[] tsv;

return true;
}
Expand Down
8 changes: 4 additions & 4 deletions api/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,12 +163,12 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer {
};

/**
* Renders tesseract output into an hocr tsv string
* Renders Tesseract output into a TSV string
*/
class TESS_API TessHOcrTsvRenderer : public TessResultRenderer {
class TESS_API TessTsvRenderer : public TessResultRenderer {
public:
explicit TessHOcrTsvRenderer(const char *outputbase, bool font_info);
explicit TessHOcrTsvRenderer(const char *outputbase);
explicit TessTsvRenderer(const char *outputbase, bool font_info);
explicit TessTsvRenderer(const char *outputbase);

protected:
virtual bool BeginDocumentHandler();
Expand Down
4 changes: 2 additions & 2 deletions api/tesseractmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,12 +299,12 @@ void PreloadRenderers(tesseract::TessBaseAPI* api,
new tesseract::TessHOcrRenderer(outputbase, font_info));
}

api->GetBoolVariable("tessedit_create_hocrtsv", &b);
api->GetBoolVariable("tessedit_create_tsv", &b);
if (b) {
bool font_info;
api->GetBoolVariable("hocr_font_info", &font_info);
renderers->push_back(
new tesseract::TessHOcrTsvRenderer(outputbase, font_info));
new tesseract::TessTsvRenderer(outputbase, font_info));
}

api->GetBoolVariable("tessedit_create_pdf", &b);
Expand Down
2 changes: 1 addition & 1 deletion ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
this->params()),
BOOL_MEMBER(tessedit_create_hocrtsv, false, "Write .hocr.tsv TSV output file",
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
this->params()),
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
this->params()),
Expand Down
2 changes: 1 addition & 1 deletion ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -1003,7 +1003,7 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
BOOL_VAR_H(tessedit_create_hocrtsv, false, "Write .hocr.tsv hOCR-tsv output file");
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
STRING_VAR_H(unrecognised_char, "|",
"Output char for unidentified blobs");
Expand Down
2 changes: 0 additions & 2 deletions tessdata/configs/hocrtsv

This file was deleted.

2 changes: 2 additions & 0 deletions tessdata/configs/tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
tessedit_create_tsv 1
tessedit_pageseg_mode 1

0 comments on commit 6700edd

Please sign in to comment.