Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modernize code for renderers and remove filename conversion for Windows #4330

Merged
1 commit merged on Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 7 additions & 25 deletions src/api/altorenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@

#include "errcode.h" // for ASSERT_HOST
#include "helpers.h" // for copy_string
#ifdef _WIN32
# include "host.h" // windows.h for MultiByteToWideChar, ...
#endif
#include "tprintf.h" // for tprintf

#include <tesseract/baseapi.h>
Expand Down Expand Up @@ -145,20 +142,6 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
SetInputName(nullptr);
}

#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
wchar_t *uni16_str = new WCHAR[str16_len];
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
int utf8_len =
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
char *utf8_str = new char[utf8_len];
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
input_file_ = utf8_str;
delete[] uni16_str;
delete[] utf8_str;
#endif

std::stringstream alto_str;
// Use "C" locale (needed for int values larger than 999).
alto_str.imbue(std::locale::classic());
Expand All @@ -169,7 +152,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
<< " WIDTH=\"" << rect_width_ << "\""
<< " HEIGHT=\"" << rect_height_ << "\">\n";

ResultIterator *res_it = GetIterator();
std::unique_ptr<ResultIterator> res_it(GetIterator());
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
Expand All @@ -186,7 +169,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
// Handle all kinds of images.
// TODO: optionally add TYPE, for example TYPE="photo".
alto_str << "\t\t\t\t<Illustration ID=\"cblock_" << bcnt++ << "\"";
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
alto_str << "</Illustration>\n";
res_it->Next(RIL_BLOCK);
continue;
Expand All @@ -195,7 +178,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
case PT_VERT_LINE:
// Handle horizontal and vertical lines.
alto_str << "\t\t\t\t<GraphicalElement ID=\"cblock_" << bcnt++ << "\"";
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
alto_str << "</GraphicalElement >\n";
res_it->Next(RIL_BLOCK);
continue;
Expand All @@ -208,24 +191,24 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {

if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
alto_str << "\n";
}

if (res_it->IsAtBeginningOf(RIL_PARA)) {
alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
AddBoxToAlto(res_it, RIL_PARA, alto_str);
AddBoxToAlto(res_it.get(), RIL_PARA, alto_str);
alto_str << "\n";
}

if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
AddBoxToAlto(res_it.get(), RIL_TEXTLINE, alto_str);
alto_str << "\n";
}

alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
AddBoxToAlto(res_it, RIL_WORD, alto_str);
AddBoxToAlto(res_it.get(), RIL_WORD, alto_str);
alto_str << " CONTENT=\"";

bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
Expand Down Expand Up @@ -272,7 +255,6 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
alto_str << "\t\t\t</PrintSpace>\n"
<< "\t\t</Page>\n";

delete res_it;
return copy_string(alto_str.str());
}

Expand Down
20 changes: 0 additions & 20 deletions src/api/hocrrenderer.cpp
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code in hocrrenderer.cpp was already modernized in commit db9c7e0.

Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@
#include <locale> // for std::locale::classic
#include <memory> // for std::unique_ptr
#include <sstream> // for std::stringstream
#ifdef _WIN32
# include "host.h" // windows.h for MultiByteToWideChar, ...
#endif
#include <tesseract/renderer.h>
#include "helpers.h" // for copy_string
#include "tesseractclass.h" // for Tesseract
Expand Down Expand Up @@ -151,23 +148,6 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
SetInputName(nullptr);
}

#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int str16_len =
MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
wchar_t *uni16_str = new WCHAR[str16_len];
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str,
str16_len);
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
0, nullptr, nullptr);
char *utf8_str = new char[utf8_len];
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
nullptr, nullptr);
input_file_ = utf8_str;
delete[] uni16_str;
delete[] utf8_str;
#endif

std::stringstream hocr_str;
// Use "C" locale (needed for double values x_size and x_descenders).
hocr_str.imbue(std::locale::classic());
Expand Down
43 changes: 11 additions & 32 deletions src/api/pagerenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Description: PAGE XML rendering interface
// Author: Jan Kamlah

// (C) Copyright 2021
// (C) Copyright 2024
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
Expand All @@ -15,9 +15,6 @@

#include "errcode.h" // for ASSERT_HOST
#include "helpers.h" // for copy_string
#ifdef _WIN32
# include "host.h" // windows.h for MultiByteToWideChar, ...
#endif
#include "tprintf.h" // for tprintf

#include <tesseract/baseapi.h>
Expand Down Expand Up @@ -717,23 +714,6 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
SetInputName(nullptr);
}

#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int str16_len =
MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
wchar_t *uni16_str = new WCHAR[str16_len];
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str,
str16_len);
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
0, nullptr, nullptr);
char *utf8_str = new char[utf8_len];
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
nullptr, nullptr);
input_file_ = utf8_str;
delete[] uni16_str;
delete[] utf8_str;
#endif

// Used variables

std::stringstream reading_order_str;
Expand Down Expand Up @@ -788,7 +768,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
<< "\t\t\t<OrderedGroup id=\"ro" << ro_id
<< "\" caption=\"Regions reading order\">\n";

ResultIterator *res_it = GetIterator();
std::unique_ptr<ResultIterator> res_it(GetIterator());

float block_conf = 0;
float line_conf = 0;
Expand All @@ -808,7 +788,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
// Handle all kinds of images.
page_str << "\t\t<GraphicRegion id=\"r" << rcnt++ << "\">\n";
page_str << "\t\t\t";
AddBoxToPAGE(res_it, RIL_BLOCK, page_str);
AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
page_str << "\t\t</GraphicRegion>\n";
res_it->Next(RIL_BLOCK);
continue;
Expand All @@ -818,7 +798,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
// Handle horizontal and vertical lines.
page_str << "\t\t<SeparatorRegion id=\"r" << rcnt++ << "\">\n";
page_str << "\t\t\t";
AddBoxToPAGE(res_it, RIL_BLOCK, page_str);
AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
page_str << "\t\t</SeparatorRegion>\n";
res_it->Next(RIL_BLOCK);
continue;
Expand Down Expand Up @@ -849,7 +829,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
if ((!POLYGONFLAG || (orientation_block != ORIENTATION_PAGE_UP &&
orientation_block != ORIENTATION_PAGE_DOWN)) &&
LEVELFLAG == 0) {
AddBoxToPAGE(res_it, RIL_BLOCK, page_str);
AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
}
}

Expand Down Expand Up @@ -892,9 +872,9 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
line_str << "custom=\"" << "readingOrder {index:" << lcnt << ";}\">\n";
// If level is linebased, get the line polygon and baseline
if (LEVELFLAG == 0 && (!POLYGONFLAG || skewed_flag)) {
AddPointToWordPolygon(res_it, RIL_TEXTLINE, line_top_ltr_pts,
AddPointToWordPolygon(res_it.get(), RIL_TEXTLINE, line_top_ltr_pts,
line_bottom_ltr_pts, writing_direction);
AddBaselineToPTA(res_it, RIL_TEXTLINE, line_baseline_pts);
AddBaselineToPTA(res_it.get(), RIL_TEXTLINE, line_baseline_pts);
if (ttb_flag) {
line_baseline_pts = TransposePolygonline(line_baseline_pts);
}
Expand All @@ -914,18 +894,18 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
<< WritingDirectionToStr(writing_direction) << "\" "
<< "custom=\"" << "readingOrder {index:" << wcnt << ";}\">\n";
if ((!POLYGONFLAG || skewed_flag) || ttb_flag) {
AddPointToWordPolygon(res_it, RIL_WORD, word_top_pts, word_bottom_pts,
AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts,
writing_direction);
}
}

if (POLYGONFLAG && !skewed_flag && ttb_flag && LEVELFLAG == 0) {
AddPointToWordPolygon(res_it, RIL_WORD, word_top_pts, word_bottom_pts,
AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts,
writing_direction);
}

// Get the word baseline information
AddBaselineToPTA(res_it, RIL_WORD, word_baseline_pts);
AddBaselineToPTA(res_it.get(), RIL_WORD, word_baseline_pts);

// Get the word text content and polygon
do {
Expand All @@ -934,7 +914,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
if (grapheme && grapheme[0] != 0) {
word_content << HOcrEscape(grapheme.get()).c_str();
if (POLYGONFLAG && !skewed_flag && !ttb_flag) {
AddPointToWordPolygon(res_it, RIL_SYMBOL, word_top_pts,
AddPointToWordPolygon(res_it.get(), RIL_SYMBOL, word_top_pts,
word_bottom_pts, writing_direction);
}
}
Expand Down Expand Up @@ -1146,7 +1126,6 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
const std::string &text = reading_order_str.str();
reading_order_str.str("");

delete res_it;
return copy_string(text);
}

Expand Down
Loading