From 638868ed38952cc7073210aea695201880c9c5ae Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Wed, 23 Oct 2024 07:34:06 +0200 Subject: [PATCH] Modernize code for renderers and remove filename conversion for Windows (#4330) Commit db52047420c54bd added the filename conversion for the hOCR renderer, but it was removed later for TSV in commit 6700edd8bcdd2a4. Tesseract does not use a filename conversion anywhere else, so remove it for the other renderers, too. Signed-off-by: Stefan Weil --- src/api/altorenderer.cpp | 32 +++++++----------------------- src/api/hocrrenderer.cpp | 20 ------------------- src/api/pagerenderer.cpp | 43 ++++++++++------------------------------ 3 files changed, 18 insertions(+), 77 deletions(-) diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index 4a17a24820..e373f73aa4 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -15,9 +15,6 @@ #include "errcode.h" // for ASSERT_HOST #include "helpers.h" // for copy_string -#ifdef _WIN32 -# include "host.h" // windows.h for MultiByteToWideChar, ... -#endif #include "tprintf.h" // for tprintf #include @@ -145,20 +142,6 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { SetInputName(nullptr); } -#ifdef _WIN32 - // convert input name from ANSI encoding to utf-8 - int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0); - wchar_t *uni16_str = new WCHAR[str16_len]; - str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len); - int utf8_len = - WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr); - char *utf8_str = new char[utf8_len]; - WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr); - input_file_ = utf8_str; - delete[] uni16_str; - delete[] utf8_str; -#endif - std::stringstream alto_str; // Use "C" locale (needed for int values larger than 999). 
alto_str.imbue(std::locale::classic()); @@ -169,7 +152,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { << " WIDTH=\"" << rect_width_ << "\"" << " HEIGHT=\"" << rect_height_ << "\">\n"; - ResultIterator *res_it = GetIterator(); + std::unique_ptr<ResultIterator> res_it(GetIterator()); while (!res_it->Empty(RIL_BLOCK)) { if (res_it->Empty(RIL_WORD)) { res_it->Next(RIL_WORD); @@ -186,7 +169,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { // Handle all kinds of images. // TODO: optionally add TYPE, for example TYPE="photo". alto_str << "\t\t\t\t\n"; res_it->Next(RIL_BLOCK); continue; @@ -195,7 +178,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { case PT_VERT_LINE: // Handle horizontal and vertical lines. alto_str << "\t\t\t\t\n"; res_it->Next(RIL_BLOCK); continue; @@ -208,24 +191,24 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { if (res_it->IsAtBeginningOf(RIL_BLOCK)) { alto_str << "\t\t\t\tIsAtBeginningOf(RIL_PARA)) { alto_str << "\t\t\t\t\tIsAtBeginningOf(RIL_TEXTLINE)) { alto_str << "\t\t\t\t\t\tIsAtFinalElement(RIL_TEXTLINE, RIL_WORD); @@ -272,7 +255,6 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { alto_str << "\t\t\t\n" << "\t\t\n"; - delete res_it; return copy_string(alto_str.str()); } diff --git a/src/api/hocrrenderer.cpp b/src/api/hocrrenderer.cpp index 3410338efa..5f319ecac1 100644 --- a/src/api/hocrrenderer.cpp +++ b/src/api/hocrrenderer.cpp @@ -21,9 +21,6 @@ #include <locale> // for std::locale::classic #include <memory> // for std::unique_ptr #include <sstream> // for std::stringstream -#ifdef _WIN32 -# include "host.h" // windows.h for MultiByteToWideChar, ... 
-#endif #include #include "helpers.h" // for copy_string #include "tesseractclass.h" // for Tesseract @@ -151,23 +148,6 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) { SetInputName(nullptr); } -#ifdef _WIN32 - // convert input name from ANSI encoding to utf-8 - int str16_len = - MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0); - wchar_t *uni16_str = new WCHAR[str16_len]; - str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, - str16_len); - int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, - 0, nullptr, nullptr); - char *utf8_str = new char[utf8_len]; - WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, - nullptr, nullptr); - input_file_ = utf8_str; - delete[] uni16_str; - delete[] utf8_str; -#endif - std::stringstream hocr_str; // Use "C" locale (needed for double values x_size and x_descenders). hocr_str.imbue(std::locale::classic()); diff --git a/src/api/pagerenderer.cpp b/src/api/pagerenderer.cpp index 2c072626e0..7d67261050 100644 --- a/src/api/pagerenderer.cpp +++ b/src/api/pagerenderer.cpp @@ -2,7 +2,7 @@ // Description: PAGE XML rendering interface // Author: Jan Kamlah -// (C) Copyright 2021 +// (C) Copyright 2024 // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -15,9 +15,6 @@ #include "errcode.h" // for ASSERT_HOST #include "helpers.h" // for copy_string -#ifdef _WIN32 -# include "host.h" // windows.h for MultiByteToWideChar, ... 
-#endif #include "tprintf.h" // for tprintf #include @@ -717,23 +714,6 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) { SetInputName(nullptr); } -#ifdef _WIN32 - // convert input name from ANSI encoding to utf-8 - int str16_len = - MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0); - wchar_t *uni16_str = new WCHAR[str16_len]; - str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, - str16_len); - int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, - 0, nullptr, nullptr); - char *utf8_str = new char[utf8_len]; - WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, - nullptr, nullptr); - input_file_ = utf8_str; - delete[] uni16_str; - delete[] utf8_str; -#endif - // Used variables std::stringstream reading_order_str; @@ -788,7 +768,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) { << "\t\t\t\n"; - ResultIterator *res_it = GetIterator(); + std::unique_ptr<ResultIterator> res_it(GetIterator()); float block_conf = 0; float line_conf = 0; @@ -808,7 +788,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) { // Handle all kinds of images. page_str << "\t\t\n"; page_str << "\t\t\t"; - AddBoxToPAGE(res_it, RIL_BLOCK, page_str); + AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str); page_str << "\t\t\n"; res_it->Next(RIL_BLOCK); continue; @@ -818,7 +798,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) { // Handle horizontal and vertical lines. 
page_str << "\t\t\n"; page_str << "\t\t\t"; - AddBoxToPAGE(res_it, RIL_BLOCK, page_str); + AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str); page_str << "\t\t\n"; res_it->Next(RIL_BLOCK); continue; @@ -849,7 +829,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) { if ((!POLYGONFLAG || (orientation_block != ORIENTATION_PAGE_UP && orientation_block != ORIENTATION_PAGE_DOWN)) && LEVELFLAG == 0) { - AddBoxToPAGE(res_it, RIL_BLOCK, page_str); + AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str); } } @@ -892,9 +872,9 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) { line_str << "custom=\"" << "readingOrder {index:" << lcnt << ";}\">\n"; // If level is linebased, get the line polygon and baseline if (LEVELFLAG == 0 && (!POLYGONFLAG || skewed_flag)) { - AddPointToWordPolygon(res_it, RIL_TEXTLINE, line_top_ltr_pts, + AddPointToWordPolygon(res_it.get(), RIL_TEXTLINE, line_top_ltr_pts, line_bottom_ltr_pts, writing_direction); - AddBaselineToPTA(res_it, RIL_TEXTLINE, line_baseline_pts); + AddBaselineToPTA(res_it.get(), RIL_TEXTLINE, line_baseline_pts); if (ttb_flag) { line_baseline_pts = TransposePolygonline(line_baseline_pts); } @@ -914,18 +894,18 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) { << WritingDirectionToStr(writing_direction) << "\" " << "custom=\"" << "readingOrder {index:" << wcnt << ";}\">\n"; if ((!POLYGONFLAG || skewed_flag) || ttb_flag) { - AddPointToWordPolygon(res_it, RIL_WORD, word_top_pts, word_bottom_pts, + AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts, writing_direction); } } if (POLYGONFLAG && !skewed_flag && ttb_flag && LEVELFLAG == 0) { - AddPointToWordPolygon(res_it, RIL_WORD, word_top_pts, word_bottom_pts, + AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts, writing_direction); } // Get the word baseline information - AddBaselineToPTA(res_it, RIL_WORD, word_baseline_pts); + AddBaselineToPTA(res_it.get(), RIL_WORD, 
word_baseline_pts); // Get the word text content and polygon do { @@ -934,7 +914,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) { if (grapheme && grapheme[0] != 0) { word_content << HOcrEscape(grapheme.get()).c_str(); if (POLYGONFLAG && !skewed_flag && !ttb_flag) { - AddPointToWordPolygon(res_it, RIL_SYMBOL, word_top_pts, + AddPointToWordPolygon(res_it.get(), RIL_SYMBOL, word_top_pts, word_bottom_pts, writing_direction); } } @@ -1146,7 +1126,6 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) { const std::string &text = reading_order_str.str(); reading_order_str.str(""); - delete res_it; return copy_string(text); }