From da03e4e9105b6262706d40ef2b4436eae4ebe19f Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Fri, 14 Jul 2017 09:30:14 -0700 Subject: [PATCH] Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion --- api/baseapi.cpp | 21 +- api/pdfrenderer.cpp | 35 +- api/renderer.cpp | 2 +- api/tesseractmain.cpp | 38 +- ccmain/control.cpp | 12 +- ccmain/docqual.cpp | 6 +- ccmain/equationdetect.cpp | 8 +- ccmain/paragraphs.cpp | 2 +- ccmain/paramsd.cpp | 6 +- ccmain/pgedit.cpp | 3 +- ccmain/thresholder.cpp | 4 +- ccstruct/boxread.cpp | 2 +- ccstruct/coutln.cpp | 20 +- ccstruct/coutln.h | 6 +- ccstruct/ocrblock.cpp | 10 +- ccstruct/otsuthr.cpp | 3 +- ccstruct/pdblock.cpp | 15 +- ccstruct/polyblk.cpp | 11 +- ccstruct/ratngs.h | 6 +- ccstruct/rejctmap.cpp | 78 +- ccstruct/rejctmap.h | 37 +- ccstruct/statistc.cpp | 8 +- ccstruct/stepblob.h | 4 +- ccutil/ambigs.cpp | 4 +- ccutil/ambigs.h | 6 +- ccutil/basedir.cpp | 5 +- ccutil/genericvector.h | 26 +- ccutil/helpers.h | 2 +- ccutil/host.h | 8 +- ccutil/memry.cpp | 8 - ccutil/memry.h | 4 - ccutil/params.cpp | 4 +- ccutil/strngs.h | 10 +- ccutil/tessdatamanager.cpp | 16 + ccutil/tessdatamanager.h | 5 + ccutil/unichar.cpp | 19 +- ccutil/unichar.h | 19 +- ccutil/unicharcompress.cpp | 6 +- ccutil/unicharset.cpp | 16 +- classify/adaptive.cpp | 10 +- classify/adaptmatch.cpp | 10 +- classify/cluster.cpp | 24 +- classify/clusttool.cpp | 2 +- classify/featdefs.cpp | 6 +- classify/featdefs.h | 4 +- classify/kdtree.cpp | 22 +- classify/mfoutline.cpp | 3 +- classify/ocrfeatures.cpp | 9 +- classify/protos.cpp | 3 +- classify/shapetable.h | 8 +- cutil/oldlist.cpp | 175 ++--- cutil/oldlist.h | 6 +- dict/dawg.h | 2 +- dict/dict.cpp | 6 +- dict/dict.h | 4 +- dict/permdawg.cpp | 2 +- dict/trie.cpp | 4 +- opencl/openclwrapper.cpp | 950 ++++++++++++------------- opencl/openclwrapper.h | 2 +- textord/bbgrid.h | 12 +- textord/colpartition.h | 6 +- textord/drawedg.cpp | 7 +- textord/makerow.cpp | 2 +- textord/scanedg.cpp | 11 +- textord/tabvector.h | 4 +- training/boxchar.cpp | 35 +- training/commontraining.cpp | 14 +- training/normstrngs.cpp | 98 +-- training/normstrngs.h | 6 +- training/pango_font_info.cpp | 39 + training/pango_font_info.h | 10 + training/stringrenderer.cpp | 24 +- training/stringrenderer.h | 2 +- training/unicharset_extractor.cpp | 2 + training/unicharset_training_utils.cpp | 38 +- wordrec/language_model.cpp | 166 +++-- wordrec/lm_pain_points.cpp | 2 +- wordrec/lm_state.h | 6 +- wordrec/outlines.cpp | 2 +- wordrec/pieces.cpp | 8 +- 80 files changed, 1061 insertions(+), 1180 deletions(-) diff --git a/api/baseapi.cpp b/api/baseapi.cpp index 6cdf305f9b..d88bdbfd92 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -41,11 +41,11 @@ #include #endif // _WIN32 +#include #include -#include #include -#include -#include // std::unique_ptr +#include // std::unique_ptr +#include #include "allheaders.h" @@ -1540,7 +1540,8 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { if (bold) hocr_str += ""; if (italic) hocr_str += ""; do { - const std::unique_ptr grapheme(res_it->GetUTF8Text(RIL_SYMBOL)); + const std::unique_ptr grapheme( + res_it->GetUTF8Text(RIL_SYMBOL)); if (grapheme && grapheme[0] != 0) { hocr_str += HOcrEscape(grapheme.get()); } @@ -1662,7 +1663,8 @@ char* TessBaseAPI::GetTSVText(int page_number) { if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++; do { - tsv_str += std::unique_ptr(res_it->GetUTF8Text(RIL_SYMBOL)).get(); + tsv_str += + std::unique_ptr(res_it->GetUTF8Text(RIL_SYMBOL)).get(); res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); tsv_str += "\n"; // end of row @@ -1720,7 +1722,8 @@ char* TessBaseAPI::GetBoxText(int page_number) { do { int left, top, right, bottom; if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) { - const std::unique_ptr text(it->GetUTF8Text(RIL_SYMBOL)); + const std::unique_ptr text( + it->GetUTF8Text(RIL_SYMBOL)); // Tesseract uses space for recognition failure. Fix to a reject // character, kTesseractReject so we don't create illegal box files. for (int i = 0; text[i] != '\0'; ++i) { @@ -1728,8 +1731,7 @@ char* TessBaseAPI::GetBoxText(int page_number) { text[i] = kTesseractReject; } snprintf(result + output_length, total_length - output_length, - "%s %d %d %d %d %d\n", - text.get(), left, image_height_ - bottom, + "%s %d %d %d %d %d\n", text.get(), left, image_height_ - bottom, right, image_height_ - top, page_number); output_length += strlen(result + output_length); // Just in case... @@ -2063,8 +2065,7 @@ void TessBaseAPI::End() { delete paragraph_models_; paragraph_models_ = NULL; } - if (osd_tesseract_ == tesseract_) - osd_tesseract_ = nullptr; + if (osd_tesseract_ == tesseract_) osd_tesseract_ = nullptr; delete tesseract_; tesseract_ = nullptr; delete osd_tesseract_; diff --git a/api/pdfrenderer.cpp b/api/pdfrenderer.cpp index 36383c29e3..14eac7ec54 100644 --- a/api/pdfrenderer.cpp +++ b/api/pdfrenderer.cpp @@ -20,7 +20,7 @@ #include "config_auto.h" #endif -#include // std::unique_ptr +#include // std::unique_ptr #include "allheaders.h" #include "baseapi.h" #include "math.h" @@ -457,13 +457,12 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, STRING pdf_word(""); int pdf_word_len = 0; do { - const std::unique_ptr grapheme(res_it->GetUTF8Text(RIL_SYMBOL)); + const std::unique_ptr grapheme( + res_it->GetUTF8Text(RIL_SYMBOL)); if (grapheme && grapheme[0] != '\0') { - GenericVector unicodes; - UNICHAR::UTF8ToUnicode(grapheme.get(), &unicodes); + std::vector unicodes = UNICHAR::UTF8ToUTF32(grapheme.get()); char utf16[kMaxBytesPerCodepoint]; - for (int i = 0; i < unicodes.length(); i++) { - int code = unicodes[i]; + for (char32 code : unicodes) { if (CodepointToUtf16be(code, utf16)) { pdf_word += utf16; pdf_word_len++; @@ -566,13 +565,13 @@ bool TessPDFRenderer::BeginDocumentHandler() { // CIDTOGIDMAP const int kCIDToGIDMapSize = 2 * (1 << 16); - const std::unique_ptr cidtogidmap(new unsigned char[kCIDToGIDMapSize]); + const std::unique_ptr cidtogidmap( + new unsigned char[kCIDToGIDMapSize]); for (int i = 0; i < kCIDToGIDMapSize; i++) { cidtogidmap[i] = (i % 2) ? 1 : 0; } size_t len; - unsigned char *comp = - zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len); + unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len); n = snprintf(buf, sizeof(buf), "5 0 obj\n" "<<\n" @@ -665,8 +664,8 @@ bool TessPDFRenderer::BeginDocumentHandler() { fseek(fp, 0, SEEK_END); long int size = ftell(fp); fseek(fp, 0, SEEK_SET); - const std::unique_ptr buffer(new char[size]); - if (fread(buffer.get(), 1, size, fp) != static_cast(size)) { + const std::unique_ptr buffer(new char[size]); + if (fread(buffer.get(), 1, size, fp) != static_cast(size)) { fclose(fp); return false; } @@ -879,11 +878,11 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) { AppendPDFObject(buf); // CONTENTS - const std::unique_ptr pdftext(GetPDFTextObjects(api, width, height)); - const long pdftext_len = strlen(pdftext.get()); + const std::unique_ptr pdftext(GetPDFTextObjects(api, width, height)); + const size_t pdftext_len = strlen(pdftext.get()); size_t len; - unsigned char *comp_pdftext = - zlibCompress(reinterpret_cast(pdftext.get()), pdftext_len, &len); + unsigned char *comp_pdftext = zlibCompress( + reinterpret_cast(pdftext.get()), pdftext_len, &len); long comp_pdftext_len = len; n = snprintf(buf, sizeof(buf), "%ld 0 obj\n" @@ -960,11 +959,9 @@ bool TessPDFRenderer::EndDocumentHandler() { // INFO STRING utf16_title = "FEFF"; // byte_order_marker - GenericVector unicodes; - UNICHAR::UTF8ToUnicode(title(), &unicodes); + std::vector unicodes = UNICHAR::UTF8ToUTF32(title()); char utf16[kMaxBytesPerCodepoint]; - for (int i = 0; i < unicodes.length(); i++) { - int code = unicodes[i]; + for (char32 code : unicodes) { if (CodepointToUtf16be(code, utf16)) { utf16_title += utf16; } diff --git a/api/renderer.cpp b/api/renderer.cpp index 429d302097..a71f2c7245 100644 --- a/api/renderer.cpp +++ b/api/renderer.cpp @@ -19,8 +19,8 @@ #include "config_auto.h" #endif -#include // std::unique_ptr #include +#include // std::unique_ptr #include "baseapi.h" #include "genericvector.h" #include "renderer.h" diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp index 3448f39df8..9a326c3ead 100644 --- a/api/tesseractmain.cpp +++ b/api/tesseractmain.cpp @@ -1,21 +1,21 @@ /********************************************************************** -* File: tesseractmain.cpp (Formerly tessedit.c) -* Description: Main program for merge of tess and editor. -* Author: Ray Smith -* Created: Tue Jan 07 15:21:46 GMT 1992 -* -* (C) Copyright 1992, Hewlett-Packard Ltd. -** Licensed under the Apache License, Version 2.0 (the "License"); -** you may not use this file except in compliance with the License. -** You may obtain a copy of the License at -** http://www.apache.org/licenses/LICENSE-2.0 -** Unless required by applicable law or agreed to in writing, software -** distributed under the License is distributed on an "AS IS" BASIS, -** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -** See the License for the specific language governing permissions and -** limitations under the License. -* -**********************************************************************/ + * File: tesseractmain.cpp (Formerly tessedit.c) + * Description: Main program for merge of tess and editor. + * Author: Ray Smith + * Created: Tue Jan 07 15:21:46 GMT 1992 + * + * (C) Copyright 1992, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ // Include automatically generated configuration file if running autoconf #ifdef HAVE_CONFIG_H @@ -404,7 +404,7 @@ int main(int argc, char** argv) { static GenericVector vars_vec; static GenericVector vars_values; -#ifdef NDEBUG +#if !defined(DEBUG) // Disable debugging and informational messages from Leptonica. setMsgSeverity(L_SEVERITY_ERROR); #endif @@ -431,7 +431,7 @@ int main(int argc, char** argv) { // first TessBaseAPI must be destructed, DawgCache must be the last object. tesseract::Dict::GlobalDawgCache(); - // Avoid memory leak caused by auto variable when exit() is called. + // Avoid memory leak caused by auto variable when return is called. static tesseract::TessBaseAPI api; api.SetOutputName(outputbase); diff --git a/ccmain/control.cpp b/ccmain/control.cpp index 4749a80291..d16dad983c 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -1878,11 +1878,11 @@ BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) { * * Find the modal font and remove from the stats. */ -static void find_modal_font( //good chars in word - STATS *fonts, //font stats - inT16 *font_out, //output font - int8_t *font_count //output count - ) { +static void find_modal_font( // good chars in word + STATS* fonts, // font stats + inT16* font_out, // output font + int8_t* font_count // output count +) { inT16 font; //font index inT32 count; //pile couat @@ -1999,7 +1999,7 @@ void Tesseract::font_recognition_pass(PAGE_RES* page_res) { } } inT16 doc_font; // modal font - int8_t doc_font_count; // modal font + int8_t doc_font_count; // modal font find_modal_font(&doc_fonts, &doc_font, &doc_font_count); if (doc_font_count == 0) return; diff --git a/ccmain/docqual.cpp b/ccmain/docqual.cpp index ad7f228053..2a54b98308 100644 --- a/ccmain/docqual.cpp +++ b/ccmain/docqual.cpp @@ -511,9 +511,9 @@ BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word, int adjusted_len; int crunch_mode = 0; - if ((word->best_choice->unichar_string().length () == 0) || - (strspn (word->best_choice->unichar_string().string(), " ") == - word->best_choice->unichar_string().unsigned_size ())) + if ((word->best_choice->unichar_string().length() == 0) || + (strspn(word->best_choice->unichar_string().string(), " ") == + word->best_choice->unichar_string().unsigned_size())) crunch_mode = 1; else { adjusted_len = word->reject_map.length (); diff --git a/ccmain/equationdetect.cpp b/ccmain/equationdetect.cpp index 3c73418ae0..393b0e81e6 100644 --- a/ccmain/equationdetect.cpp +++ b/ccmain/equationdetect.cpp @@ -116,9 +116,7 @@ EquationDetect::EquationDetect(const char* equ_datapath, cps_super_bbox_ = NULL; } -EquationDetect::~EquationDetect() { - delete(cps_super_bbox_); -} +EquationDetect::~EquationDetect() { delete (cps_super_bbox_); } void EquationDetect::SetLangTesseract(Tesseract* lang_tesseract) { lang_tesseract_ = lang_tesseract; @@ -258,8 +256,8 @@ BlobSpecialTextType EquationDetect::EstimateTypeForUnichar( void EquationDetect::IdentifySpecialText() { // Set configuration for Tesseract::AdaptiveClassifier. - equ_tesseract_.tess_cn_matching.set_value(true); // turn it on - equ_tesseract_.tess_bn_matching.set_value(false); + equ_tesseract_.tess_cn_matching.set_value(1); // turn it on + equ_tesseract_.tess_bn_matching.set_value(0); // Set the multiplier to zero for lang_tesseract_ to improve the accuracy. int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier; diff --git a/ccmain/paragraphs.cpp b/ccmain/paragraphs.cpp index a8ef87be2c..6ea4eb4bf8 100644 --- a/ccmain/paragraphs.cpp +++ b/ccmain/paragraphs.cpp @@ -21,7 +21,7 @@ #endif #include -#include // std::unique_ptr +#include // std::unique_ptr #include "genericvector.h" #include "helpers.h" diff --git a/ccmain/paramsd.cpp b/ccmain/paramsd.cpp index dc1d124377..a5fccf88e8 100644 --- a/ccmain/paramsd.cpp +++ b/ccmain/paramsd.cpp @@ -183,10 +183,8 @@ void ParamsEditor::GetPrefixes(const char* s, STRING* level_one, // Compare two VC objects by their name. int ParamContent::Compare(const void* v1, const void* v2) { - const ParamContent* one = - *static_cast(v1); - const ParamContent* two = - *static_cast(v2); + const ParamContent* one = *static_cast(v1); + const ParamContent* two = *static_cast(v2); return strcmp(one->GetName(), two->GetName()); } diff --git a/ccmain/pgedit.cpp b/ccmain/pgedit.cpp index a59cef168a..e8262159d2 100644 --- a/ccmain/pgedit.cpp +++ b/ccmain/pgedit.cpp @@ -544,7 +544,8 @@ BOOL8 Tesseract::process_cmd_win_event( // UI command semantics break; default: - sprintf(msg, "Unrecognised event %" PRId32 "(%s)", cmd_event, new_value); + snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)", + cmd_event, new_value); image_win->AddMessage(msg); break; } diff --git a/ccmain/thresholder.cpp b/ccmain/thresholder.cpp index f7eed196ab..a4be3db3a5 100644 --- a/ccmain/thresholder.cpp +++ b/ccmain/thresholder.cpp @@ -311,8 +311,8 @@ void ImageThresholder::ThresholdRectToPix(Pix* src_pix, for (int x = 0; x < rect_width_; ++x) { bool white_result = true; for (int ch = 0; ch < num_channels; ++ch) { - int pixel = GET_DATA_BYTE(linedata, - (x + rect_left_) * num_channels + ch); + int pixel = + GET_DATA_BYTE(linedata, (x + rect_left_) * num_channels + ch); if (hi_values[ch] >= 0 && (pixel > thresholds[ch]) == (hi_values[ch] == 0)) { white_result = false; diff --git a/ccstruct/boxread.cpp b/ccstruct/boxread.cpp index d6ceebb4db..e2233e691e 100644 --- a/ccstruct/boxread.cpp +++ b/ccstruct/boxread.cpp @@ -206,7 +206,7 @@ bool ParseBoxFileStr(const char* boxfile_str, int* page_number, // Validate UTF8 by making unichars with it. int used = 0; while (used < uch_len) { - UNICHAR ch(uch + used, uch_len - used); + tesseract::UNICHAR ch(uch + used, uch_len - used); int new_used = ch.utf8_len(); if (new_used == 0) { tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n", diff --git a/ccstruct/coutln.cpp b/ccstruct/coutln.cpp index ccd4b2faeb..974e452929 100644 --- a/ccstruct/coutln.cpp +++ b/ccstruct/coutln.cpp @@ -652,22 +652,10 @@ static void ComputeGradient(const l_uint32* data, int wpl, int x, int y, int width, int height, ICOORD* gradient) { const l_uint32* line = data + y * wpl; - int pix_x_y = - x < width && y < height - ? GET_DATA_BYTE(line, x) - : 255; - int pix_x_prevy = - x < width && y > 0 - ? GET_DATA_BYTE(line - wpl, x) - : 255; - int pix_prevx_prevy = - x > 0 && y > 0 - ? GET_DATA_BYTE(line - wpl, x - 1) - : 255; - int pix_prevx_y = - x > 0 && y < height - ? GET_DATA_BYTE(line, x - 1) - : 255; + int pix_x_y = x < width && y < height ? GET_DATA_BYTE(line, x) : 255; + int pix_x_prevy = x < width && y > 0 ? GET_DATA_BYTE(line - wpl, x) : 255; + int pix_prevx_prevy = x > 0 && y > 0 ? GET_DATA_BYTE(line - wpl, x - 1) : 255; + int pix_prevx_y = x > 0 && y < height ? GET_DATA_BYTE(line, x - 1) : 255; gradient->set_x(pix_x_y + pix_x_prevy - (pix_prevx_y + pix_prevx_prevy)); gradient->set_y(pix_x_prevy + pix_prevx_prevy - (pix_x_y + pix_prevx_y)); } diff --git a/ccstruct/coutln.h b/ccstruct/coutln.h index fbb63174c8..7b9265c717 100644 --- a/ccstruct/coutln.h +++ b/ccstruct/coutln.h @@ -1,7 +1,7 @@ /********************************************************************** - * File: coutln.h (Formerly: coutline.c) - * Description: Code for the C_OUTLINE class. - * Author: Ray Smith + * File: coutln.h (Formerly: + *coutline.c) Description: Code for the C_OUTLINE class. Author: + *Ray Smith * Created: Mon Oct 07 16:01:57 BST 1991 * * (C) Copyright 1991, Hewlett-Packard Ltd. diff --git a/ccstruct/ocrblock.cpp b/ccstruct/ocrblock.cpp index 19f2aecbfd..61b6d1c969 100644 --- a/ccstruct/ocrblock.cpp +++ b/ccstruct/ocrblock.cpp @@ -17,10 +17,10 @@ * **********************************************************************/ +#include "ocrblock.h" #include -#include // std::unique_ptr +#include // std::unique_ptr #include "blckerr.h" -#include "ocrblock.h" #include "stepblob.h" #include "tprintf.h" @@ -381,7 +381,8 @@ void BLOCK::compute_row_margins() { TBOX row_box = row->bounding_box(); int left_y = row->base_line(row_box.left()) + row->x_height(); int left_margin; - const std::unique_ptr segments_left(lines.get_line(left_y)); + const std::unique_ptr segments_left( + lines.get_line(left_y)); LeftMargin(segments_left.get(), row_box.left(), &left_margin); if (row_box.top() >= drop_cap_bottom) { @@ -394,7 +395,8 @@ void BLOCK::compute_row_margins() { int right_y = row->base_line(row_box.right()) + row->x_height(); int right_margin; - const std::unique_ptr segments_right(lines.get_line(right_y)); + const std::unique_ptr segments_right( + lines.get_line(right_y)); RightMargin(segments_right.get(), row_box.right(), &right_margin); row->set_lmargin(left_margin); row->set_rmargin(right_margin); diff --git a/ccstruct/otsuthr.cpp b/ccstruct/otsuthr.cpp index e8113b65ac..386d91bd24 100644 --- a/ccstruct/otsuthr.cpp +++ b/ccstruct/otsuthr.cpp @@ -161,8 +161,7 @@ void HistogramRect(Pix* src_pix, int channel, for (int y = top; y < bottom; ++y) { const l_uint32* linedata = srcdata + y * src_wpl; for (int x = 0; x < width; ++x) { - int pixel = GET_DATA_BYTE(linedata, - (x + left) * num_channels + channel); + int pixel = GET_DATA_BYTE(linedata, (x + left) * num_channels + channel); ++histogram[pixel]; } } diff --git a/ccstruct/pdblock.cpp b/ccstruct/pdblock.cpp index d0be9d2860..648608c164 100644 --- a/ccstruct/pdblock.cpp +++ b/ccstruct/pdblock.cpp @@ -17,11 +17,11 @@ * **********************************************************************/ -#include -#include // std::unique_ptr -#include "allheaders.h" -#include "blckerr.h" -#include "pdblock.h" +#include "pdblock.h" +#include +#include // std::unique_ptr +#include "allheaders.h" +#include "blckerr.h" // Include automatically generated configuration file if running autoconf. #ifdef HAVE_CONFIG_H @@ -141,7 +141,8 @@ Pix* PDBLK::render_mask(const FCOORD& rerotation, TBOX* mask_box) { // rasterized interior. (Runs of interior pixels on a line.) PB_LINE_IT *lines = new PB_LINE_IT(&image_block); for (int y = box.bottom(); y < box.top(); ++y) { - const std::unique_ptr segments(lines->get_line(y)); + const std::unique_ptr segments( + lines->get_line(y)); if (!segments->empty()) { ICOORDELT_IT s_it(segments.get()); // Each element of segments is a start x and x size of the @@ -196,7 +197,7 @@ void PDBLK::plot( //draw outline // serial,startpt.x(),startpt.y()); char temp_buff[34]; #if defined(__UNIX__) || defined(MINGW) - sprintf(temp_buff, "%" PRId32, serial); + snprintf(temp_buff, sizeof(temp_buff), "%" PRId32, serial); #else ultoa (serial, temp_buff, 10); #endif diff --git a/ccstruct/polyblk.cpp b/ccstruct/polyblk.cpp index 984b82afef..c4d8211ed1 100644 --- a/ccstruct/polyblk.cpp +++ b/ccstruct/polyblk.cpp @@ -17,12 +17,12 @@ * **********************************************************************/ +#include "polyblk.h" #include #include #include -#include // std::unique_ptr +#include // std::unique_ptr #include "elst.h" -#include "polyblk.h" // Include automatically generated configuration file if running autoconf. #ifdef HAVE_CONFIG_H @@ -254,7 +254,7 @@ void POLY_BLOCK::plot(ScrollView* window, inT32 num) { window->TextAttributes("Times", 80, false, false, false); char temp_buff[34]; #if defined(__UNIX__) || defined(MINGW) - sprintf(temp_buff, "%" PRId32, num); + snprintf(temp_buff, sizeof(temp_buff), "%" PRId32, num); #else ltoa (num, temp_buff, 10); #endif @@ -281,9 +281,10 @@ void POLY_BLOCK::fill(ScrollView* window, ScrollView::Color colour) { for (y = this->bounding_box ()->bottom (); y <= this->bounding_box ()->top (); y++) { - const std::unique_ptr segments(lines->get_line (y)); + const std::unique_ptr segments( + lines->get_line(y)); if (!segments->empty ()) { - s_it.set_to_list (segments.get()); + s_it.set_to_list(segments.get()); for (s_it.mark_cycle_pt (); !s_it.cycled_list (); s_it.forward ()) { // Note different use of ICOORDELT, x coord is x coord of pixel // at the start of line segment, y coord is length of line segment diff --git a/ccstruct/ratngs.h b/ccstruct/ratngs.h index 7e658fa894..4d79b39ff1 100644 --- a/ccstruct/ratngs.h +++ b/ccstruct/ratngs.h @@ -190,10 +190,8 @@ class BLOB_CHOICE: public ELIST_LINK } // Sort function for sorting BLOB_CHOICEs in increasing order of rating. static int SortByRating(const void *p1, const void *p2) { - const BLOB_CHOICE *bc1 = - *static_cast(p1); - const BLOB_CHOICE *bc2 = - *static_cast(p2); + const BLOB_CHOICE *bc1 = *static_cast(p1); + const BLOB_CHOICE *bc2 = *static_cast(p2); return (bc1->rating_ < bc2->rating_) ? -1 : 1; } diff --git a/ccstruct/rejctmap.cpp b/ccstruct/rejctmap.cpp index 6870ce9a38..aee7fbc28a 100644 --- a/ccstruct/rejctmap.cpp +++ b/ccstruct/rejctmap.cpp @@ -264,65 +264,17 @@ void REJ::full_print(FILE *fp) { flag (R_MINIMAL_REJ_ACCEPT) ? "T" : "F"); } - -//The REJMAP class has been hacked to use malloc instead of new []. -//This is to reduce memory fragmentation only as it is rather kludgy. -// malloc by-passes the call to the constructor of REJ on each -// array element. Although the constructor is empty, the BITS16 members -// do have a constructor which sets all the flags to 0. The memset -// replaces this functionality. - -REJMAP::REJMAP( //classwise copy - const REJMAP &source) { - REJ *to; - REJ *from = source.ptr; - int i; - - len = source.length (); - - if (len > 0) { - ptr = (REJ *) malloc(len * sizeof (REJ)); - to = ptr; - for (i = 0; i < len; i++) { - *to = *from; - to++; - from++; - } - } - else - ptr = NULL; -} - - -REJMAP & REJMAP::operator= ( //assign REJMAP -const REJMAP & source //from this -) { - REJ * - to; - REJ * - from = source.ptr; - int - i; - - initialise (source.len); - to = ptr; - for (i = 0; i < len; i++) { - *to = *from; - to++; - from++; +REJMAP &REJMAP::operator=(const REJMAP &source) { + initialise(source.len); + for (int i = 0; i < len; i++) { + ptr[i] = source.ptr[i]; } return *this; } - -void REJMAP::initialise( //Redefine map - inT16 length) { - free(ptr); +void REJMAP::initialise(inT16 length) { + ptr.reset(new REJ[length]); len = length; - if (len > 0) - ptr = (REJ *) calloc(len, sizeof(REJ)); - else - ptr = NULL; } @@ -363,28 +315,12 @@ BOOL8 REJMAP::quality_recoverable_rejects() { //Any potential rejs? void REJMAP::remove_pos( //Cut out an element inT16 pos //element to remove ) { - REJ *new_ptr; //new, smaller map - int i; - ASSERT_HOST (pos >= 0); ASSERT_HOST (pos < len); ASSERT_HOST (len > 0); len--; - if (len > 0) - new_ptr = (REJ *) malloc(len * sizeof(REJ)); - else - new_ptr = NULL; - - for (i = 0; i < pos; i++) - new_ptr[i] = ptr[i]; //copy pre pos - - for (; pos < len; pos++) - new_ptr[pos] = ptr[pos + 1]; //copy post pos - - //delete old map - free(ptr); - ptr = new_ptr; + for (; pos < len; pos++) ptr[pos] = ptr[pos + 1]; } diff --git a/ccstruct/rejctmap.h b/ccstruct/rejctmap.h index 84b5009b34..732661db17 100644 --- a/ccstruct/rejctmap.h +++ b/ccstruct/rejctmap.h @@ -44,7 +44,7 @@ OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!! #ifdef __UNIX__ #include #endif -#include "memry.h" +#include #include "bits16.h" #include "params.h" @@ -203,33 +203,24 @@ class REJ class REJMAP { - REJ *ptr; //ptr to the chars - inT16 len; //Number of chars + std::unique_ptr ptr; // ptr to the chars + inT16 len; // Number of chars - public: - REJMAP() { //constructor - ptr = NULL; - len = 0; - } + public: + REJMAP() : len(0) {} - REJMAP( //classwise copy - const REJMAP &rejmap); + REJMAP(const REJMAP &rejmap) { *this = rejmap; } - REJMAP & operator= ( //assign REJMAP - const REJMAP & source); //from this - - ~REJMAP () { //destructor - free(ptr); - } + REJMAP &operator=(const REJMAP &source); - void initialise( //Redefine map - inT16 length); + // Sets up the ptr array to length, whatever it was before. + void initialise(inT16 length); - REJ & operator[]( //access function - inT16 index) const //map index - { - ASSERT_HOST (index < len); - return ptr[index]; //no bounds checks + REJ &operator[]( // access function + inT16 index) const // map index + { + ASSERT_HOST(index < len); + return ptr[index]; // no bounds checks } inT32 length() const { //map length diff --git a/ccstruct/statistc.cpp b/ccstruct/statistc.cpp index e192ab52a2..bf275fdb25 100644 --- a/ccstruct/statistc.cpp +++ b/ccstruct/statistc.cpp @@ -89,9 +89,7 @@ void STATS::clear() { // clear out buckets * * Destructor for a stats class. **********************************************************************/ -STATS::~STATS () { - delete [] buckets_; -} +STATS::~STATS() { delete[] buckets_; } /********************************************************************** * STATS::add @@ -772,8 +770,8 @@ void swap_entries(void *array, // array of entries char *ptr2; size_t count; // of bytes - ptr1 = static_cast(array) + index1 * size; - ptr2 = static_cast(array) + index2 * size; + ptr1 = static_cast(array) + index1 * size; + ptr2 = static_cast(array) + index2 * size; for (count = 0; count < size; count++) { tmp = *ptr1; *ptr1++ = *ptr2; diff --git a/ccstruct/stepblob.h b/ccstruct/stepblob.h index a62c5bb0c9..5c63c13b16 100644 --- a/ccstruct/stepblob.h +++ b/ccstruct/stepblob.h @@ -117,8 +117,8 @@ class C_BLOB:public ELIST_LINK } static int SortByXMiddle(const void *v1, const void *v2) { - const C_BLOB* blob1 = *static_cast(v1); - const C_BLOB* blob2 = *static_cast(v2); + const C_BLOB* blob1 = *static_cast(v1); + const C_BLOB* blob2 = *static_cast(v2); return blob1->bounding_box().x_middle() - blob2->bounding_box().x_middle(); } diff --git a/ccutil/ambigs.cpp b/ccutil/ambigs.cpp index 4fe3b883a6..f65a6df8fb 100644 --- a/ccutil/ambigs.cpp +++ b/ccutil/ambigs.cpp @@ -25,8 +25,8 @@ #include "universalambigs.h" #if defined(_WIN32) && !defined(__GNUC__) -# define strtok_r(str, delim, saveptr) strtok_s(str, delim, saveptr) -#endif /* _WIN32 && !__GNUC__ */ +#define strtok_r(str, delim, saveptr) strtok_s(str, delim, saveptr) +#endif /* _WIN32 && !__GNUC__ */ namespace tesseract { diff --git a/ccutil/ambigs.h b/ccutil/ambigs.h index bc5965d80b..786d46073e 100644 --- a/ccutil/ambigs.h +++ b/ccutil/ambigs.h @@ -120,10 +120,8 @@ class AmbigSpec : public ELIST_LINK { // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors // in a a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1]. static int compare_ambig_specs(const void *spec1, const void *spec2) { - const AmbigSpec *s1 = - *static_cast(spec1); - const AmbigSpec *s2 = - *static_cast(spec2); + const AmbigSpec *s1 = *static_cast(spec1); + const AmbigSpec *s2 = *static_cast(spec2); int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram); if (result != 0) return result; return UnicharIdArrayUtils::compare(s1->correct_fragments, diff --git a/ccutil/basedir.cpp b/ccutil/basedir.cpp index 67214b0e3d..12099af602 100644 --- a/ccutil/basedir.cpp +++ b/ccutil/basedir.cpp @@ -1,8 +1,7 @@ /********************************************************************** * File: basedir.cpp (Formerly getpath.c) - * Description: Find the directory location of the current executable using PATH. - * Author: Ray Smith - * Created: Mon Jul 09 09:06:39 BST 1990 + * Description: Find the directory location of the current executable using + *PATH. Author: Ray Smith Created: Mon Jul 09 09:06:39 BST 1990 * * (C) Copyright 1990, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/ccutil/genericvector.h b/ccutil/genericvector.h index bdea83d221..3c37b4aa71 100644 --- a/ccutil/genericvector.h +++ b/ccutil/genericvector.h @@ -37,9 +37,9 @@ template class GenericVector { public: - GenericVector() : size_used_(0), size_reserved_(0), data_(NULL), - clear_cb_(NULL), compare_cb_(NULL) {} - + GenericVector() { + init(kDefaultVectorSize); + } GenericVector(int size, T init_val) { init(size); init_to_size(size, init_val); @@ -73,10 +73,11 @@ class GenericVector { return size_used_; } // Workaround to avoid g++ -Wsign-compare warnings. - unsigned int unsigned_size() const { - static_assert(sizeof(size_used_) <= sizeof(unsigned int), ""); + size_t unsigned_size() const { + static_assert(sizeof(size_used_) <= sizeof(size_t), + "Wow! sizeof(size_t) < sizeof(int32_t)!!"); assert(0 <= size_used_); - return static_cast(size_used_); + return static_cast(size_used_); } int size_reserved() const { return size_reserved_; @@ -364,8 +365,7 @@ typedef bool (*FileWriter)(const GenericVector& data, const STRING& filename); // The default FileReader loads the whole file into the vector of char, // returning false on error. -inline bool LoadDataFromFile(const char *filename, - GenericVector* data) { +inline bool LoadDataFromFile(const char* filename, GenericVector* data) { bool result = false; FILE* fp = fopen(filename, "rb"); if (fp != NULL) { @@ -437,8 +437,8 @@ int sort_cmp(const void* t1, const void* t2) { // return > 0 if t1 > t2 template int sort_ptr_cmp(const void* t1, const void* t2) { - const T* a = *static_cast(t1); - const T* b = *static_cast(t2); + const T* a = *static_cast(t1); + const T* b = *static_cast(t2); if (*a < *b) { return -1; } else if (*b < *a) { @@ -896,7 +896,8 @@ bool GenericVector::write( } delete cb; } else { - if (fwrite(data_, sizeof(T), size_used_, f) != unsigned_size()) return false; + if (fwrite(data_, sizeof(T), size_used_, f) != unsigned_size()) + return false; } return true; } @@ -928,7 +929,8 @@ bool GenericVector::read( template bool GenericVector::Serialize(FILE* fp) const { if (fwrite(&size_used_, sizeof(size_used_), 1, fp) != 1) return false; - if (fwrite(data_, sizeof(*data_), size_used_, fp) != unsigned_size()) return false; + if (fwrite(data_, sizeof(*data_), size_used_, fp) != unsigned_size()) + return false; return true; } template diff --git a/ccutil/helpers.h b/ccutil/helpers.h index 6b9249fb5d..3fd0edcf57 100644 --- a/ccutil/helpers.h +++ b/ccutil/helpers.h @@ -182,7 +182,7 @@ inline int IntCastRounded(double x) { // Reverse the order of bytes in a n byte quantity for big/little-endian switch. inline void ReverseN(void* ptr, int num_bytes) { - char *cptr = static_cast(ptr); + char* cptr = static_cast(ptr); int halfsize = num_bytes / 2; for (int i = 0; i < halfsize; ++i) { char tmp = cptr[i]; diff --git a/ccutil/host.h b/ccutil/host.h index 5207495ed3..248e6cb994 100644 --- a/ccutil/host.h +++ b/ccutil/host.h @@ -27,8 +27,8 @@ #undef max #endif -#include // PRId32, ... -#include // int32_t, ... +#include // PRId32, ... +#include // int32_t, ... // definitions of portable data types (numbers and characters) typedef int8_t inT8; @@ -48,11 +48,11 @@ typedef unsigned char BOOL8; /* MinGW defines the standard PRI... macros, but MSVS doesn't. */ #if !defined(PRId32) -# define PRId32 "d" +#define PRId32 "d" #endif #if !defined(PRId64) -# define PRId64 "I64d" +#define PRId64 "I64d" #endif #endif /* _WIN32 */ diff --git a/ccutil/memry.cpp b/ccutil/memry.cpp index d0dfa231d0..2e4e1244a8 100644 --- a/ccutil/memry.cpp +++ b/ccutil/memry.cpp @@ -40,14 +40,6 @@ void *alloc_mem(inT32 count) { return malloc(static_cast(count)); } -void *alloc_big_zeros(inT32 count) { - return calloc(static_cast(count), 1); -} - void free_mem(void *oldchunk) { free(oldchunk); } - -void free_big_mem(void *oldchunk) { - free(oldchunk); -} diff --git a/ccutil/memry.h b/ccutil/memry.h index 0ec275718e..d179aa2cf2 100644 --- a/ccutil/memry.h +++ b/ccutil/memry.h @@ -29,11 +29,7 @@ extern char *alloc_string(inT32 count); extern void free_string(char *string); // get some memory extern void *alloc_mem(inT32 count); -// get some memory initialized to 0. -extern void *alloc_big_zeros(inT32 count); // free mem from alloc_mem extern void free_mem(void *oldchunk); -// free mem from alloc_big_zeros -extern void free_big_mem(void *oldchunk); #endif diff --git a/ccutil/params.cpp b/ccutil/params.cpp index 1ec2b6dc5b..7ea2189a5c 100644 --- a/ccutil/params.cpp +++ b/ccutil/params.cpp @@ -101,8 +101,8 @@ bool ParamUtils::SetParam(const char *name, const char* value, int intval; IntParam *ip = FindParam(name, GlobalParams()->int_params, member_params->int_params); - if (ip && ip->constraint_ok(constraint) && - sscanf(value, "%d", &intval) == 1) ip->set_value(intval); + if (ip && ip->constraint_ok(constraint) && sscanf(value, "%d", &intval) == 1) + ip->set_value(intval); // Look for the parameter among bool parameters. BoolParam *bp = FindParam(name, GlobalParams()->bool_params, diff --git a/ccutil/strngs.h b/ccutil/strngs.h index 17169e21ae..36224305c4 100644 --- a/ccutil/strngs.h +++ b/ccutil/strngs.h @@ -20,11 +20,11 @@ #ifndef STRNGS_H #define STRNGS_H -#include -#include -#include -#include "platform.h" -#include "memry.h" +#include +#include +#include +#include "memry.h" +#include "platform.h" namespace tesseract { class TFile; diff --git a/ccutil/tessdatamanager.cpp b/ccutil/tessdatamanager.cpp index c0bd370460..048ff15824 100644 --- a/ccutil/tessdatamanager.cpp +++ b/ccutil/tessdatamanager.cpp @@ -33,6 +33,13 @@ namespace tesseract { +// Lazily loads from the the given filename. Won't actually read the file +// until it needs it. +void TessdataManager::LoadFileLater(const char *data_file_name) { + Clear(); + data_file_name_ = data_file_name; +} + bool TessdataManager::Init(const char *data_file_name) { GenericVector data; if (reader_ == nullptr) { @@ -46,6 +53,7 @@ bool TessdataManager::Init(const char *data_file_name) { // Loads from the given memory buffer as if a file. bool TessdataManager::LoadMemBuffer(const char *name, const char *data, int size) { + Clear(); data_file_name_ = name; TFile fp; fp.Open(data, size); @@ -74,6 +82,14 @@ bool TessdataManager::LoadMemBuffer(const char *name, const char *data, return true; } +// Overwrites a single entry of the given type. +void TessdataManager::OverwriteEntry(TessdataType type, const char *data, + int size) { + is_loaded_ = true; + entries_[type].resize_no_init(size); + memcpy(&entries_[type][0], data, size); +} + // Saves to the given filename. bool TessdataManager::SaveFile(const STRING &filename, FileWriter writer) const { diff --git a/ccutil/tessdatamanager.h b/ccutil/tessdatamanager.h index 1c736663cd..db9c5583f7 100644 --- a/ccutil/tessdatamanager.h +++ b/ccutil/tessdatamanager.h @@ -128,6 +128,9 @@ class TessdataManager { bool swap() const { return swap_; } bool is_loaded() const { return is_loaded_; } + // Lazily loads from the the given filename. Won't actually read the file + // until it needs it. + void LoadFileLater(const char *data_file_name); /** * Opens and reads the given data file right now. * @return true on success. @@ -136,6 +139,8 @@ class TessdataManager { // Loads from the given memory buffer as if a file, remembering name as some // arbitrary source id for caching. bool LoadMemBuffer(const char *name, const char *data, int size); + // Overwrites a single entry of the given type. + void OverwriteEntry(TessdataType type, const char *data, int size); // Saves to the given filename. bool SaveFile(const STRING &filename, FileWriter writer) const; diff --git a/ccutil/unichar.cpp b/ccutil/unichar.cpp index 0ceced13f0..255136f3ff 100644 --- a/ccutil/unichar.cpp +++ b/ccutil/unichar.cpp @@ -24,6 +24,8 @@ #define UNI_MAX_LEGAL_UTF32 0x0010FFFF +namespace tesseract { + // Construct from a utf8 string. If len<0 then the string is null terminated. // If the string is too long to fit in the UNICHAR then it takes only what // will fit. Checks for illegal input and stops at an illegal sequence. @@ -206,20 +208,21 @@ UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) { } // Converts a utf-8 string to a vector of unicodes. -// Returns false if the input contains invalid UTF-8, and replaces -// the rest of the string with a single space. -bool UNICHAR::UTF8ToUnicode(const char* utf8_str, - GenericVector* unicodes) { +// Returns an empty vector if the input contains invalid UTF-8. +/* static */ +std::vector UNICHAR::UTF8ToUTF32(const char* utf8_str) { const int utf8_length = strlen(utf8_str); + std::vector unicodes; + unicodes.reserve(utf8_length); const_iterator end_it(end(utf8_str, utf8_length)); for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) { if (it.is_legal()) { - unicodes->push_back(*it); + unicodes.push_back(*it); } else { - unicodes->push_back(' '); - return false; + unicodes.clear(); + return unicodes; } } - return true; + return unicodes; } diff --git a/ccutil/unichar.h b/ccutil/unichar.h index 85dde6f268..a1aef5a897 100644 --- a/ccutil/unichar.h +++ b/ccutil/unichar.h @@ -22,13 +22,14 @@ #include #include - -template class GenericVector; +#include +#include // Maximum number of characters that can be stored in a UNICHAR. Must be // at least 4. Must not exceed 31 without changing the coding of length. #define UNICHAR_LEN 30 +// TODO(rays) Move these to the tesseract namespace. // A UNICHAR_ID is the unique id of a unichar. typedef int UNICHAR_ID; @@ -45,6 +46,10 @@ enum StrongScriptDirection { // and right-to-left characters. }; +namespace tesseract { + +typedef signed int char32; + // The UNICHAR class holds a single classification result. This may be // a single Unicode character (stored as between 1 and 4 utf8 bytes) or // multiple Unicode characters representing the NFKC expansion of a ligature @@ -151,9 +156,11 @@ class UNICHAR { static const_iterator end(const char* utf8_str, const int byte_length); // Converts a utf-8 string to a vector of unicodes. - // Returns false if the input contains invalid UTF-8, and replaces - // the rest of the string with a single space. - static bool UTF8ToUnicode(const char* utf8_str, GenericVector* unicodes); + // Returns an empty vector if the input contains invalid UTF-8. + static std::vector UTF8ToUTF32(const char* utf8_str); + // Converts a vector of unicodes to a utf8 string. + // Returns an empty string if the input contains an invalid unicode. + static string UTF32ToUTF8(const std::vector& str32); private: // A UTF-8 representation of 1 or more Unicode characters. @@ -162,4 +169,6 @@ class UNICHAR { char chars[UNICHAR_LEN]; }; +} // namespace tesseract + #endif // TESSERACT_CCUTIL_UNICHAR_H_ diff --git a/ccutil/unicharcompress.cpp b/ccutil/unicharcompress.cpp index 3b8595cac2..969016e0ed 100644 --- a/ccutil/unicharcompress.cpp +++ b/ccutil/unicharcompress.cpp @@ -148,10 +148,10 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id, } RecodedCharID code; // Convert to unicodes. - GenericVector unicodes; + std::vector unicodes; if (u < unicharset.size() && - UNICHAR::UTF8ToUnicode(unicharset.get_normed_unichar(u), &unicodes) && - unicodes.size() == 1) { + (unicodes = UNICHAR::UTF8ToUTF32(unicharset.get_normed_unichar(u))) + .size() == 1) { // Check single unicodes for Hangul/Han and encode if so. int unicode = unicodes[0]; int leading, vowel, trailing; diff --git a/ccutil/unicharset.cpp b/ccutil/unicharset.cpp index f36ac039de..aa87c127a4 100644 --- a/ccutil/unicharset.cpp +++ b/ccutil/unicharset.cpp @@ -29,6 +29,10 @@ #include "tprintf.h" #include "unichar.h" +// TODO(rays) Move UNICHARSET to tesseract namespace. +using tesseract::char32; +using tesseract::UNICHAR; + // Special character used in representing character fragments. static const char kSeparator = '|'; // Special character used in representing 'natural' character fragments. @@ -990,12 +994,9 @@ bool UNICHARSET::AnyRepeatedUnicodes() const { if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT; for (int id = start_id; id < size_used; ++id) { // Convert to unicodes. - GenericVector unicodes; - if (UNICHAR::UTF8ToUnicode(get_normed_unichar(id), &unicodes) && - unicodes.size() > 1) { - for (int u = 1; u < unicodes.size(); ++u) { - if (unicodes[u - 1] == unicodes[u]) return true; - } + std::vector unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id)); + for (int u = 1; u < unicodes.size(); ++u) { + if (unicodes[u - 1] == unicodes[u]) return true; } } return false; @@ -1013,7 +1014,8 @@ int UNICHARSET::add_script(const char* script) { assert(script_table_size_used == script_table_size_reserved); script_table_size_reserved += script_table_size_reserved; char** new_script_table = new char*[script_table_size_reserved]; - memcpy(new_script_table, script_table, script_table_size_used * sizeof(char*)); + memcpy(new_script_table, script_table, + script_table_size_used * sizeof(char*)); delete[] script_table; script_table = new_script_table; } diff --git a/classify/adaptive.cpp b/classify/adaptive.cpp index 7483a74fee..54157d0e80 100644 --- a/classify/adaptive.cpp +++ b/classify/adaptive.cpp @@ -221,7 +221,7 @@ void free_adapted_templates(ADAPT_TEMPLATES templates) { TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId) { int NumProtos = MaxProtoId + 1; - TEMP_CONFIG Config = (TEMP_CONFIG) malloc(sizeof(TEMP_CONFIG_STRUCT)); + TEMP_CONFIG Config = (TEMP_CONFIG)malloc(sizeof(TEMP_CONFIG_STRUCT)); Config->Protos = NewBitVector (NumProtos); Config->NumTimesSeen = 1; @@ -246,7 +246,7 @@ TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId) { * @note History: Thu Mar 14 13:31:31 1991, DSJ, Created. */ TEMP_PROTO NewTempProto() { - return (TEMP_PROTO) malloc(sizeof(TEMP_PROTO_STRUCT)); + return (TEMP_PROTO)malloc(sizeof(TEMP_PROTO_STRUCT)); } /* NewTempProto */ @@ -325,7 +325,7 @@ ADAPT_CLASS ReadAdaptedClass(TFile *fp) { fp->FRead(&NumTempProtos, sizeof(int), 1); Class->TempProtos = NIL_LIST; for (i = 0; i < NumTempProtos; i++) { - TEMP_PROTO TempProto = (TEMP_PROTO) malloc(sizeof(TEMP_PROTO_STRUCT)); + TEMP_PROTO TempProto = (TEMP_PROTO)malloc(sizeof(TEMP_PROTO_STRUCT)); fp->FRead(TempProto, sizeof(TEMP_PROTO_STRUCT), 1); Class->TempProtos = push_last (Class->TempProtos, TempProto); } @@ -390,7 +390,7 @@ ADAPT_TEMPLATES Classify::ReadAdaptedTemplates(TFile *fp) { * @note History: Tue Mar 19 14:25:26 1991, DSJ, Created. */ PERM_CONFIG ReadPermConfig(TFile *fp) { - PERM_CONFIG Config = (PERM_CONFIG) malloc(sizeof(PERM_CONFIG_STRUCT)); + PERM_CONFIG Config = (PERM_CONFIG)malloc(sizeof(PERM_CONFIG_STRUCT)); uinT8 NumAmbigs; fp->FRead(&NumAmbigs, sizeof(uinT8), 1); Config->Ambigs = new UNICHAR_ID[NumAmbigs + 1]; @@ -416,7 +416,7 @@ PERM_CONFIG ReadPermConfig(TFile *fp) { * @note History: Tue Mar 19 14:29:59 1991, DSJ, Created. */ TEMP_CONFIG ReadTempConfig(TFile *fp) { - TEMP_CONFIG Config = (TEMP_CONFIG) malloc(sizeof(TEMP_CONFIG_STRUCT)); + TEMP_CONFIG Config = (TEMP_CONFIG)malloc(sizeof(TEMP_CONFIG_STRUCT)); fp->FRead(Config, sizeof(TEMP_CONFIG_STRUCT), 1); Config->Protos = NewBitVector (Config->ProtoVectorSize * BITSINLONG); diff --git a/classify/adaptmatch.cpp b/classify/adaptmatch.cpp index ce62ee89b5..b796774d75 100644 --- a/classify/adaptmatch.cpp +++ b/classify/adaptmatch.cpp @@ -1980,7 +1980,7 @@ void Classify::MakePermanent(ADAPT_TEMPLATES Templates, // Initialize permanent config. Ambigs = GetAmbiguities(Blob, ClassId); - PERM_CONFIG Perm = (PERM_CONFIG) malloc(sizeof(PERM_CONFIG_STRUCT)); + PERM_CONFIG Perm = (PERM_CONFIG)malloc(sizeof(PERM_CONFIG_STRUCT)); Perm->Ambigs = Ambigs; Perm->FontinfoId = Config->FontinfoId; @@ -2241,11 +2241,9 @@ void Classify::ShowBestMatchFor(int shape_id, tprintf("Static Shape ID: %d\n", shape_id); ShowMatchDisplay(); - im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), - AllProtosOn, &config_mask, // TODO: or reinterpret_cast(&config_mask) anyway? - num_features, features, &cn_result, - classify_adapt_feature_threshold, - matcher_debug_flags, + im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), AllProtosOn, + &config_mask, num_features, features, &cn_result, + classify_adapt_feature_threshold, matcher_debug_flags, matcher_debug_separate_windows); UpdateMatchDisplay(); #endif // GRAPHICS_DISABLED diff --git a/classify/cluster.cpp b/classify/cluster.cpp index 9221b54c97..678f3aa674 100644 --- a/classify/cluster.cpp +++ b/classify/cluster.cpp @@ -592,12 +592,12 @@ void FreePrototype(void *arg) { //PROTOTYPE *Prototype) Prototype->Cluster->Prototype = FALSE; // deallocate the prototype statistics and then the prototype itself - free (Prototype->Distrib); - free (Prototype->Mean); + free(Prototype->Distrib); + free(Prototype->Mean); if (Prototype->Style != spherical) { - free (Prototype->Variance.Elliptical); - free (Prototype->Magnitude.Elliptical); - free (Prototype->Weight.Elliptical); + free(Prototype->Variance.Elliptical); + free(Prototype->Magnitude.Elliptical); + free(Prototype->Weight.Elliptical); } free(Prototype); } // FreePrototype @@ -1123,9 +1123,9 @@ PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, if (TotalDims < N + 1 || TotalDims < 2) return NULL; const int kMatrixSize = N * N * sizeof(FLOAT32); - FLOAT32* Covariance = static_cast(Emalloc(kMatrixSize)); - FLOAT32* Inverse = static_cast(Emalloc(kMatrixSize)); - FLOAT32* Delta = static_cast(Emalloc(N * sizeof(FLOAT32))); + FLOAT32 *Covariance = static_cast(Emalloc(kMatrixSize)); + FLOAT32 *Inverse = static_cast(Emalloc(kMatrixSize)); + FLOAT32 *Delta = static_cast(Emalloc(N * sizeof(FLOAT32))); // Compute a new covariance matrix that only uses essential features. for (int i = 0; i < N; ++i) { int row_offset = i * N; @@ -1749,13 +1749,13 @@ BUCKETS *MakeBuckets(DISTRIBUTION Distribution, BOOL8 Symmetrical; // allocate memory needed for data structure - Buckets = static_cast(Emalloc(sizeof(BUCKETS))); + Buckets = static_cast(Emalloc(sizeof(BUCKETS))); Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount); Buckets->SampleCount = SampleCount; Buckets->Confidence = Confidence; - Buckets->Count = static_cast( - Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32))); - Buckets->ExpectedCount = static_cast( + Buckets->Count = + static_cast(Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32))); + Buckets->ExpectedCount = static_cast( Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32))); // initialize simple fields diff --git a/classify/clusttool.cpp b/classify/clusttool.cpp index b1a3d4b1e4..37cb7c49d1 100644 --- a/classify/clusttool.cpp +++ b/classify/clusttool.cpp @@ -227,7 +227,7 @@ FLOAT32 *ReadNFloats(TFile *fp, uinT16 N, FLOAT32 Buffer[]) { bool needs_free = false; if (Buffer == NULL) { - Buffer = static_cast(Emalloc(N * sizeof(FLOAT32))); + Buffer = static_cast(Emalloc(N * sizeof(FLOAT32))); needs_free = true; } diff --git a/classify/featdefs.cpp b/classify/featdefs.cpp index 2e81290025..66c66ea9c8 100644 --- a/classify/featdefs.cpp +++ b/classify/featdefs.cpp @@ -290,13 +290,13 @@ CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, * - ILLEGAL_SHORT_NAME * @note History: Wed May 23 15:36:05 1990, DSJ, Created. */ -int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, - const char *ShortName) { +uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, + const char *ShortName) { int i; for (i = 0; i < FeatureDefs.NumFeatureTypes; i++) if (!strcmp ((FeatureDefs.FeatureDesc[i]->ShortName), ShortName)) - return (i); + return static_cast(i); DoError (ILLEGAL_SHORT_NAME, "Illegal short name for a feature"); return 0; diff --git a/classify/featdefs.h b/classify/featdefs.h index 7c168f3daa..1478efa189 100644 --- a/classify/featdefs.h +++ b/classify/featdefs.h @@ -70,8 +70,8 @@ void WriteCharDescription(const FEATURE_DEFS_STRUCT& FeatureDefs, CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File); -int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, - const char *ShortName); +uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, + const char *ShortName); /**---------------------------------------------------------------------------- Global Data Definitions and Declarations diff --git a/classify/kdtree.cpp b/classify/kdtree.cpp index e98228a754..8fe2e15c40 100644 --- a/classify/kdtree.cpp +++ b/classify/kdtree.cpp @@ -137,10 +137,8 @@ class KDTreeSearch { MinK results_; }; -KDTreeSearch::KDTreeSearch(KDTREE* tree, FLOAT32 *query_point, int k_closest) : - tree_(tree), - query_point_(query_point), - results_(MAXSEARCH, k_closest) { +KDTreeSearch::KDTreeSearch(KDTREE *tree, FLOAT32 *query_point, int k_closest) + : tree_(tree), query_point_(query_point), results_(MAXSEARCH, k_closest) { sb_min_ = new FLOAT32[tree->KeySize]; sb_max_ = new FLOAT32[tree->KeySize]; } @@ -166,8 +164,9 @@ void KDTreeSearch::Search(int *result_count, int count = results_.elements_count(); *result_count = count; for (int j = 0; j < count; j++) { - // TODO: why FLOAT64 here? - distances[j] = (FLOAT32) sqrt((FLOAT64)results_.elements()[j].key); + // Pre-cast to float64 as key is a template type and we have no control + // over its actual type. + distances[j] = (FLOAT32)sqrt((FLOAT64)results_.elements()[j].key); results[j] = results_.elements()[j].value; } } @@ -387,10 +386,7 @@ KDNODE *MakeKDNode(KDTREE *tree, FLOAT32 Key[], void *Data, int Index) { /*---------------------------------------------------------------------------*/ -void FreeKDNode(KDNODE *Node) { - free(Node); -} - +void FreeKDNode(KDNODE *Node) { free(Node); } /*---------------------------------------------------------------------------*/ /** @@ -405,8 +401,8 @@ void KDTreeSearch::SearchRec(int level, KDNODE *sub_tree) { if (!BoxIntersectsSearch(sb_min_, sb_max_)) return; - results_.insert(DistanceSquared(tree_->KeySize, tree_->KeyDesc, - query_point_, sub_tree->Key), + results_.insert(DistanceSquared(tree_->KeySize, tree_->KeyDesc, query_point_, + sub_tree->Key), sub_tree->Data); if (query_point_[level] < sub_tree->BranchPoint) { @@ -479,7 +475,7 @@ FLOAT32 ComputeDistance(int k, PARAM_DESC *dim, FLOAT32 p1[], FLOAT32 p2[]) { /// one wrap distance away from the query. bool KDTreeSearch::BoxIntersectsSearch(FLOAT32 *lower, FLOAT32 *upper) { FLOAT32 *query = query_point_; - // Why FLOAT64? + // Compute the sum in higher precision. FLOAT64 total_distance = 0.0; FLOAT64 radius_squared = results_.max_insertable_key() * results_.max_insertable_key(); diff --git a/classify/mfoutline.cpp b/classify/mfoutline.cpp index 3bd0916edd..3f41571b8f 100644 --- a/classify/mfoutline.cpp +++ b/classify/mfoutline.cpp @@ -219,10 +219,9 @@ void MarkDirectionChanges(MFOUTLINE Outline) { /*---------------------------------------------------------------------------*/ /** Return a new edge point for a micro-feature outline. */ MFEDGEPT *NewEdgePoint() { - return (MFEDGEPT *) malloc(sizeof(MFEDGEPT)); + return reinterpret_cast(malloc(sizeof(MFEDGEPT))); } - /*---------------------------------------------------------------------------*/ /** * This routine returns the next point in the micro-feature diff --git a/classify/ocrfeatures.cpp b/classify/ocrfeatures.cpp index 2dfd693043..8008d4aff7 100644 --- a/classify/ocrfeatures.cpp +++ b/classify/ocrfeatures.cpp @@ -56,9 +56,7 @@ BOOL8 AddFeature(FEATURE_SET FeatureSet, FEATURE Feature) { * @return none * @note History: Mon May 21 13:33:27 1990, DSJ, Created. */ -void FreeFeature(FEATURE Feature) { - free(Feature); -} /* FreeFeature */ +void FreeFeature(FEATURE Feature) { free(Feature); } /* FreeFeature */ /** * Release the memory consumed by the specified feature @@ -88,9 +86,8 @@ void FreeFeatureSet(FEATURE_SET FeatureSet) { FEATURE NewFeature(const FEATURE_DESC_STRUCT* FeatureDesc) { FEATURE Feature; - Feature = (FEATURE) malloc(sizeof(FEATURE_STRUCT) + - (FeatureDesc->NumParams - 1) * - sizeof (FLOAT32)); + Feature = (FEATURE)malloc(sizeof(FEATURE_STRUCT) + + (FeatureDesc->NumParams - 1) * sizeof(FLOAT32)); Feature->Type = FeatureDesc; return (Feature); diff --git a/classify/protos.cpp b/classify/protos.cpp index ab604de9a8..ee887563be 100644 --- a/classify/protos.cpp +++ b/classify/protos.cpp @@ -228,8 +228,7 @@ void FreeClassFields(CLASS_TYPE Class) { int i; if (Class) { - if (Class->MaxNumProtos > 0) - free(Class->Prototypes); + if (Class->MaxNumProtos > 0) free(Class->Prototypes); if (Class->MaxNumConfigs > 0) { for (i = 0; i < Class->NumConfigs; i++) FreeBitVector (Class->Configurations[i]); diff --git a/classify/shapetable.h b/classify/shapetable.h index 5d3f64cc97..f399989999 100644 --- a/classify/shapetable.h +++ b/classify/shapetable.h @@ -54,8 +54,8 @@ struct UnicharRating { // Sort function to sort ratings appropriately by descending rating. static int SortDescendingRating(const void* t1, const void* t2) { - const UnicharRating* a = static_cast(t1); - const UnicharRating* b = static_cast(t2); + const UnicharRating* a = static_cast(t1); + const UnicharRating* b = static_cast(t2); if (a->rating > b->rating) { return -1; } else if (a->rating < b->rating) { @@ -100,8 +100,8 @@ struct ShapeRating { // Sort function to sort ratings appropriately by descending rating. static int SortDescendingRating(const void* t1, const void* t2) { - const ShapeRating* a = static_cast(t1); - const ShapeRating* b = static_cast(t2); + const ShapeRating* a = static_cast(t1); + const ShapeRating* b = static_cast(t2); if (a->rating > b->rating) { return -1; } else if (a->rating < b->rating) { diff --git a/cutil/oldlist.cpp b/cutil/oldlist.cpp index 01988abcc2..54a0ea779f 100644 --- a/cutil/oldlist.cpp +++ b/cutil/oldlist.cpp @@ -40,40 +40,40 @@ To implement a STACK use: - push to add to the Stack l = push (l, (LIST) "jim"); - pop to remove items from the Stack l = pop (l); - first_node to access the head name = (char *) first_node (l); + push to add to the Stack l = push(l, (LIST)"jim"); + pop to remove items from the Stack l = pop(l); + first_node to access the head name = (char *)first_node(l); To implement a QUEUE use: - push_last to add to the Queue l = push_last (l, (LIST) "jim"); - pop remove items from the Queue l = pop (l); - first_node to access the head name = (char *) first_node (l); + push_last to add to the Queue l = push_last(l, (LIST)"x"); + pop remove items from the Queue l = pop(l); + first_node to access the head name = (char *)first_node (l); To implement LISP like functions use: - first_node CAR x = (int) first_node (l); - rest CDR l = list_rest (l); - push CONS l = push (l, (LIST) this); - last LAST x = last (l); - concat APPEND l = concat (r, s); - count LENGTH x = count (l); - search MEMBER if (search (l, x, NULL)) + first_node CAR x = (int)first_node(l); + rest CDR l = list_rest (l); + push CONS l = push(l, (LIST)this); + last LAST x = last(l); + concat APPEND l = concat(r, s); + count LENGTH x = count(l); + search MEMBER if (search(l, x, NULL)) To implement SETS use: - adjoin l = adjoin (l, x); - set_union l = set_union (r, s); - intersection l = intersection (r, s); - set_difference l = set_difference (r, s); - delete l = delete (s, x, NULL); - search if (search (l, x, NULL)) + adjoin l = adjoin(l, x); + set_union l = set_union(r, s); + intersection l = intersection(r, s); + set_difference l = set_difference(r, s); + delete l = delete(s, x, NULL); + search if (search(l, x, NULL)) To Implement Associated LISTS use: - lpush l = lpush (l, p); - assoc s = assoc (l, x); - adelete l = adelete (l, x); + lpush l = lpush(l, p); + assoc s = assoc(l, x); + adelete l = adelete(l, x); The following rules of closure exist for the functions provided. a = first_node (push (a, b)) @@ -83,14 +83,14 @@ ******************************************************************************/ #include "oldlist.h" -#include "structures.h" #include +#include "structures.h" /*---------------------------------------------------------------------- M a c r o s ----------------------------------------------------------------------*/ -#define add_on(l,x) l = push (l,first_node (x)) -#define next_one(l) l = list_rest (l) +#define add_on(l, x) l = push(l, first_node(x)) +#define next_one(l) l = list_rest(l) /*---------------------------------------------------------------------- F u n c t i o n s @@ -103,11 +103,10 @@ int count(LIST var_list) { int temp = 0; - iterate (var_list) temp += 1; + iterate(var_list) temp += 1; return (temp); } - /********************************************************************** * d e l e t e d * @@ -121,59 +120,53 @@ LIST delete_d(LIST list, void *key, int_compare is_equal) { LIST result = NIL_LIST; LIST last_one = NIL_LIST; - if (is_equal == NULL) - is_equal = is_same; + if (is_equal == NULL) is_equal = is_same; while (list != NIL_LIST) { - if (!(*is_equal) (first_node (list), key)) { + if (!(*is_equal)(first_node(list), key)) { if (last_one == NIL_LIST) { last_one = list; - list = list_rest (list); + list = list_rest(list); result = last_one; set_rest(last_one, NIL_LIST); - } - else { + } else { set_rest(last_one, list); last_one = list; - list = list_rest (list); + list = list_rest(list); set_rest(last_one, NIL_LIST); } - } - else { - list = pop (list); + } else { + list = pop(list); } } return (result); } LIST delete_d(LIST list, void *key, - TessResultCallback2* is_equal) { + TessResultCallback2 *is_equal) { LIST result = NIL_LIST; LIST last_one = NIL_LIST; while (list != NIL_LIST) { - if (!(*is_equal).Run (first_node (list), key)) { + if (!(*is_equal).Run(first_node(list), key)) { if (last_one == NIL_LIST) { last_one = list; - list = list_rest (list); + list = list_rest(list); result = last_one; set_rest(last_one, NIL_LIST); - } - else { + } else { set_rest(last_one, list); last_one = list; - list = list_rest (list); + list = list_rest(list); set_rest(last_one, NIL_LIST); } - } - else { - list = pop (list); + } else { + list = pop(list); } } return (result); } - /********************************************************************** * d e s t r o y * @@ -183,21 +176,20 @@ LIST destroy(LIST list) { LIST next; while (list != NIL_LIST) { - next = list_rest (list); + next = list_rest(list); free_cell(list); list = next; } return (NIL_LIST); } - /********************************************************************** * d e s t r o y n o d e s * * Return the space taken by the LISTs of a list to the heap. **********************************************************************/ void destroy_nodes(LIST list, void_dest destructor) { - ASSERT_HOST(destructor != NULL); + ASSERT_HOST(destructor != nullptr); while (list != NIL_LIST) { if (first_node(list) != NULL) (*destructor)(first_node(list)); @@ -205,7 +197,6 @@ void destroy_nodes(LIST list, void_dest destructor) { } } - /********************************************************************** * i n s e r t * @@ -216,27 +207,15 @@ void insert(LIST list, void *node) { LIST element; if (list != NIL_LIST) { - element = push (NIL_LIST, node); - set_rest (element, list_rest (list)); + element = push(NIL_LIST, node); + set_rest(element, list_rest(list)); set_rest(list, element); - node = first_node (list); - list->node = first_node (list_rest (list)); - list->next->node = (LIST) node; + node = first_node(list); + list->node = first_node(list_rest(list)); + list->next->node = (LIST)node; } } - -/********************************************************************** - * i s s a m e n o d e - * - * Compare the list node with the key value return TRUE (non-zero) - * if they are equivalent strings. (Return FALSE if not) - **********************************************************************/ -int is_same_node(void *item1, void *item2) { - return (item1 == item2); -} - - /********************************************************************** * i s s a m e * @@ -244,10 +223,9 @@ int is_same_node(void *item1, void *item2) { * if they are equivalent strings. (Return FALSE if not) **********************************************************************/ int is_same(void *item1, void *item2) { - return (!strcmp ((char *) item1, (char *) item2)); + return strcmp((char *)item1, (char *)item2) == 0 ? 1 : 0; } - /********************************************************************** * j o i n * @@ -256,25 +234,21 @@ int is_same(void *item1, void *item2) { * first list updated. **********************************************************************/ LIST join(LIST list1, LIST list2) { - if (list1 == NIL_LIST) - return (list2); - set_rest (last (list1), list2); + if (list1 == NIL_LIST) return (list2); + set_rest(last(list1), list2); return (list1); } - /********************************************************************** * l a s t * * Return the last list item (this is list type). **********************************************************************/ LIST last(LIST var_list) { - while (list_rest (var_list) != NIL_LIST) - var_list = list_rest (var_list); + while (list_rest(var_list) != NIL_LIST) var_list = list_rest(var_list); return (var_list); } - /********************************************************************** * n t h c e l l * @@ -283,13 +257,11 @@ LIST last(LIST var_list) { void *nth_cell(LIST var_list, int item_num) { int x = 0; iterate(var_list) { - if (x++ == item_num) - return (var_list); + if (x++ == item_num) return (var_list); } return (var_list); } - /********************************************************************** * p o p * @@ -299,7 +271,7 @@ void *nth_cell(LIST var_list, int item_num) { LIST pop(LIST list) { LIST temp; - temp = list_rest (list); + temp = list_rest(list); if (list != NIL_LIST) { free_cell(list); @@ -307,7 +279,6 @@ LIST pop(LIST list) { return (temp); } - /********************************************************************** * p u s h * @@ -317,13 +288,12 @@ LIST pop(LIST list) { LIST push(LIST list, void *element) { LIST t; - t = new_cell (); - t->node = (LIST) element; + t = new_cell(); + t->node = (LIST)element; set_rest(t, list); return (t); } - /********************************************************************** * p u s h l a s t * @@ -333,15 +303,13 @@ LIST push_last(LIST list, void *item) { LIST t; if (list != NIL_LIST) { - t = last (list); - t->next = push (NIL_LIST, item); + t = last(list); + t->next = push(NIL_LIST, item); return (list); - } - else - return (push (NIL_LIST, item)); + } else + return (push(NIL_LIST, item)); } - /********************************************************************** * r e v e r s e * @@ -351,11 +319,10 @@ LIST push_last(LIST list, void *item) { LIST reverse(LIST list) { LIST newlist = NIL_LIST; - iterate (list) copy_first (list, newlist); + iterate(list) copy_first(list, newlist); return (newlist); } - /********************************************************************** * r e v e r s e d * @@ -363,12 +330,11 @@ LIST reverse(LIST list) { * destroyed. **********************************************************************/ LIST reverse_d(LIST list) { - LIST result = reverse (list); + LIST result = reverse(list); destroy(list); return (result); } - /********************************************************************** * s a d j o i n * @@ -379,12 +345,11 @@ LIST s_adjoin(LIST var_list, void *variable, int_compare compare) { LIST l; int result; - if (compare == NULL) - compare = (int_compare) strcmp; + if (compare == NULL) compare = (int_compare)strcmp; l = var_list; iterate(l) { - result = (*compare) (variable, first_node (l)); + result = (*compare)(variable, first_node(l)); if (result == 0) return (var_list); else if (result < 0) { @@ -392,7 +357,7 @@ LIST s_adjoin(LIST var_list, void *variable, int_compare compare) { return (var_list); } } - return (push_last (var_list, variable)); + return (push_last(var_list, variable)); } /********************************************************************** @@ -404,16 +369,14 @@ LIST s_adjoin(LIST var_list, void *variable, int_compare compare) { * for is_equal, the is_key routine will be used. **********************************************************************/ LIST search(LIST list, void *key, int_compare is_equal) { - if (is_equal == NULL) - is_equal = is_same; + if (is_equal == NULL) is_equal = is_same; - iterate (list) if ((*is_equal) (first_node (list), key)) - return (list); + iterate(list) if ((*is_equal)(first_node(list), key)) return (list); return (NIL_LIST); } -LIST search(LIST list, void *key, TessResultCallback2* is_equal) { - iterate (list) if ((*is_equal).Run(first_node (list), key)) - return (list); +LIST search(LIST list, void *key, + TessResultCallback2 *is_equal) { + iterate(list) if ((*is_equal).Run(first_node(list), key)) return (list); return (NIL_LIST); } diff --git a/cutil/oldlist.h b/cutil/oldlist.h index 508a69a4f2..bd4fdfa16f 100644 --- a/cutil/oldlist.h +++ b/cutil/oldlist.h @@ -70,8 +70,8 @@ * join - Concatenates list 1 and list 2. * delete_d - Removes the requested elements from the list. * transform_d - Modifies the list by applying a function to each node. - * insert - Add a new element into this spot in a list. (not NIL_LIST) - * push_last - Add a new element onto the end of a list. + * insert - Add a new element into this spot in a list. (not + *NIL_LIST) push_last - Add a new element onto the end of a list. * reverse_d - Reverse a list and destroy the old one. * * ASSOCIATED LISTS: @@ -249,8 +249,6 @@ void destroy_nodes(LIST list, void_dest destructor); void insert(LIST list, void *node); -int is_same_node(void *item1, void *item2); - int is_same(void *item1, void *item2); LIST join(LIST list1, LIST list2); diff --git a/dict/dawg.h b/dict/dawg.h index 23ac22168f..c36e7ba4fe 100644 --- a/dict/dawg.h +++ b/dict/dawg.h @@ -432,7 +432,7 @@ class SquishedDawg : public Dawg { num_forward_edges_in_node0 = num_forward_edges(0); if (debug_level > 3) print_all("SquishedDawg:"); } - ~SquishedDawg(); + virtual ~SquishedDawg(); // Loads using the given TFile. Returns false on failure. bool Load(TFile *fp) { diff --git a/dict/dict.cpp b/dict/dict.cpp index 96a1d4531f..4364eae4bd 100644 --- a/dict/dict.cpp +++ b/dict/dict.cpp @@ -241,7 +241,8 @@ void Dict::Load(const STRING &lang, TessdataManager *data_file) { if (load_bigram_dawg) { bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG, dawg_debug_level, data_file); - if (bigram_dawg_) dawgs_ += bigram_dawg_; + // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the + // dawgs_!! } if (load_freq_dawg) { freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, @@ -352,6 +353,7 @@ void Dict::End() { delete dawgs_[i]; } } + dawg_cache_->FreeDawg(bigram_dawg_); if (dawg_cache_is_ours_) { delete dawg_cache_; dawg_cache_ = NULL; @@ -370,7 +372,7 @@ void Dict::End() { int Dict::def_letter_is_okay(void* void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const { - DawgArgs *dawg_args = static_cast(void_dawg_args); + DawgArgs *dawg_args = static_cast(void_dawg_args); if (dawg_debug_level >= 3) { tprintf("def_letter_is_okay: current unichar=%s word_end=%d" diff --git a/dict/dict.h b/dict/dict.h index ffba0c8c2d..a5b0817ea2 100644 --- a/dict/dict.h +++ b/dict/dict.h @@ -530,14 +530,14 @@ class Dict { DawgVector dawgs_; SuccessorListsVector successors_; Trie *pending_words_; + /// The following pointers are only cached for convenience. + /// The dawgs will be deleted when dawgs_ vector is destroyed. // bigram_dawg_ points to a dawg of two-word bigrams which always supercede if // any of them are present on the best choices list for a word pair. // the bigrams are stored as space-separated words where: // (1) leading and trailing punctuation has been removed from each word and // (2) any digits have been replaced with '?' marks. Dawg *bigram_dawg_; - /// The following pointers are only cached for convenience. - /// The dawgs will be deleted when dawgs_ vector is destroyed. // TODO(daria): need to support multiple languages in the future, // so maybe will need to maintain a list of dawgs of each kind. Dawg *freq_dawg_; diff --git a/dict/permdawg.cpp b/dict/permdawg.cpp index f12a35b03a..57f1300094 100644 --- a/dict/permdawg.cpp +++ b/dict/permdawg.cpp @@ -53,7 +53,7 @@ void Dict::go_deeper_dawg_fxn( int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) { - DawgArgs *more_args = static_cast(void_more_args); + DawgArgs *more_args = static_cast(void_more_args); word_ending = (char_choice_index == char_choices.size()-1); int word_index = word->length() - 1; if (best_choice->rating() < *limit) return; diff --git a/dict/trie.cpp b/dict/trie.cpp index 5d2e3b4af6..a4406664d0 100644 --- a/dict/trie.cpp +++ b/dict/trie.cpp @@ -281,8 +281,8 @@ NODE_REF Trie::new_dawg_node() { // Sort function to sort words by decreasing order of length. static int sort_strings_by_dec_length(const void* v1, const void* v2) { - const STRING* s1 = static_cast(v1); - const STRING* s2 = static_cast(v2); + const STRING *s1 = static_cast(v1); + const STRING *s2 = static_cast(v2); return s2->length() - s1->length(); } diff --git a/opencl/openclwrapper.cpp b/opencl/openclwrapper.cpp index 79f95922ac..88e6d495c9 100644 --- a/opencl/openclwrapper.cpp +++ b/opencl/openclwrapper.cpp @@ -58,8 +58,9 @@ static const l_uint32 rmask32[] = { 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff}; -static cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers -static cl_mem pixThBuffer; //output from thresholdtopix calculation +static cl_mem pixsCLBuffer, pixdCLBuffer, + pixdCLIntermediate; // Morph operations buffers +static cl_mem pixThBuffer; // output from thresholdtopix calculation static cl_int clStatus; static KernelEnv rEnv; @@ -580,58 +581,58 @@ static ds_status writeProfileToFile(ds_profile *profile, } // substitute invalid characters in device name with _ -static void legalizeFileName( char *fileName) { - //printf("fileName: %s\n", fileName); - const char *invalidChars = - "/\?:*\"><| "; // space is valid but can cause headaches - // for each invalid char - for (unsigned i = 0; i < strlen(invalidChars); i++) { - char invalidStr[4]; - invalidStr[0] = invalidChars[i]; - invalidStr[1] = '\0'; - //printf("eliminating %s\n", invalidStr); - //char *pos = strstr(fileName, invalidStr); - // initial ./ is valid for present directory - //if (*pos == '.') pos++; - //if (*pos == '/') pos++; - for (char *pos = strstr(fileName, invalidStr); pos != nullptr; - pos = strstr(pos + 1, invalidStr)) { - // printf("\tfound: %s, ", pos); - pos[0] = '_'; - // printf("fileName: %s\n", fileName); - } +static void legalizeFileName(char *fileName) { + // printf("fileName: %s\n", fileName); + const char *invalidChars = + "/\?:*\"><| "; // space is valid but can cause headaches + // for each invalid char + for (unsigned i = 0; i < strlen(invalidChars); i++) { + char invalidStr[4]; + invalidStr[0] = invalidChars[i]; + invalidStr[1] = '\0'; + // printf("eliminating %s\n", invalidStr); + // char *pos = strstr(fileName, invalidStr); + // initial ./ is valid for present directory + // if (*pos == '.') pos++; + // if (*pos == '/') pos++; + for (char *pos = strstr(fileName, invalidStr); pos != nullptr; + pos = strstr(pos + 1, invalidStr)) { + // printf("\tfound: %s, ", pos); + pos[0] = '_'; + // printf("fileName: %s\n", fileName); } + } } -static void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) { - //printf("[DS] populateGPUEnvFromDevice\n"); - size_t size; - gpuInfo->mnIsUserCreated = 1; - // device - gpuInfo->mpDevID = device; - gpuInfo->mpArryDevsID = new cl_device_id[1]; - gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID; - clStatus = - clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE, - sizeof(cl_device_type), &gpuInfo->mDevType, &size); - CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)"); - // platform - clStatus = - clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM, - sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size); - CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)"); - // context - cl_context_properties props[3]; - props[0] = CL_CONTEXT_PLATFORM; - props[1] = (cl_context_properties) gpuInfo->mpPlatformID; - props[2] = 0; - gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, nullptr, - nullptr, &clStatus); - CHECK_OPENCL( clStatus, "populateGPUEnv::createContext"); - // queue - cl_command_queue_properties queueProperties = 0; - gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus ); - CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue"); +static void populateGPUEnvFromDevice(GPUEnv *gpuInfo, cl_device_id device) { + // printf("[DS] populateGPUEnvFromDevice\n"); + size_t size; + gpuInfo->mnIsUserCreated = 1; + // device + gpuInfo->mpDevID = device; + gpuInfo->mpArryDevsID = new cl_device_id[1]; + gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID; + clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE, + sizeof(cl_device_type), &gpuInfo->mDevType, &size); + CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(TYPE)"); + // platform + clStatus = + clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM, + sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size); + CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)"); + // context + cl_context_properties props[3]; + props[0] = CL_CONTEXT_PLATFORM; + props[1] = (cl_context_properties)gpuInfo->mpPlatformID; + props[2] = 0; + gpuInfo->mpContext = + clCreateContext(props, 1, &gpuInfo->mpDevID, nullptr, nullptr, &clStatus); + CHECK_OPENCL(clStatus, "populateGPUEnv::createContext"); + // queue + cl_command_queue_properties queueProperties = 0; + gpuInfo->mpCmdQueue = clCreateCommandQueue( + gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus); + CHECK_OPENCL(clStatus, "populateGPUEnv::createCommandQueue"); } int OpenclDevice::LoadOpencl() @@ -662,27 +663,26 @@ int OpenclDevice::SetKernelEnv( KernelEnv *envInfo ) static cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer, size_t nElements, cl_mem_flags flags, - cl_int *pStatus) -{ - cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags), - nElements * sizeof(l_uint32), hostbuffer, pStatus); + cl_int *pStatus) { + cl_mem membuffer = + clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags), + nElements * sizeof(l_uint32), hostbuffer, pStatus); - return membuffer; + return membuffer; } -static -Pix *mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, Pix *pixd, Pix *pixs, - int elements, cl_mem_flags flags, bool memcopy = false, - bool sync = true) { +static Pix *mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, Pix *pixd, + Pix *pixs, int elements, cl_mem_flags flags, + bool memcopy = false, bool sync = true) { PROCNAME("mapOutputCLBuffer"); if (!pixd) { if (memcopy) { if ((pixd = pixCreateTemplate(pixs)) == nullptr) - (Pix *)ERROR_PTR("pixd not made", procName, nullptr); + tprintf("pixd not made\n"); } else { if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) == nullptr) - (Pix *)ERROR_PTR("pixd not made", procName, nullptr); + tprintf("pixd not made\n"); } } l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer( @@ -714,35 +714,34 @@ void OpenclDevice::releaseMorphCLBuffers() pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = nullptr; } -int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix* pixs) -{ - SetKernelEnv( &rEnv ); +int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix *pixs) { + SetKernelEnv(&rEnv); - if (pixThBuffer != nullptr) { - pixsCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, - CL_MEM_ALLOC_HOST_PTR, &clStatus); + if (pixThBuffer != nullptr) { + pixsCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, + CL_MEM_ALLOC_HOST_PTR, &clStatus); - // Get the output from ThresholdToPix operation - clStatus = - clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0, - sizeof(l_uint32) * wpl * h, 0, nullptr, nullptr); - } - else - { - //Get data from the source image - l_uint32* srcdata = (l_uint32*) malloc(wpl*h*sizeof(l_uint32)); - memcpy(srcdata, pixGetData(pixs), wpl*h*sizeof(l_uint32)); + // Get the output from ThresholdToPix operation + clStatus = + clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0, + sizeof(l_uint32) * wpl * h, 0, nullptr, nullptr); + } else { + // Get data from the source image + l_uint32 *srcdata = + reinterpret_cast(malloc(wpl * h * sizeof(l_uint32))); + memcpy(srcdata, pixGetData(pixs), wpl * h * sizeof(l_uint32)); - pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus); - } + pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h, + CL_MEM_USE_HOST_PTR, &clStatus); + } - pixdCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, - CL_MEM_ALLOC_HOST_PTR, &clStatus); + pixdCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, + CL_MEM_ALLOC_HOST_PTR, &clStatus); - pixdCLIntermediate = allocateZeroCopyBuffer( - rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus); + pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, + CL_MEM_ALLOC_HOST_PTR, &clStatus); - return (int)clStatus; + return (int)clStatus; } int OpenclDevice::InitEnv() @@ -1255,254 +1254,222 @@ PERF_COUNT_END } //Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels -static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) -{ - size_t globalThreads[2]; - cl_mem pixtemp; - cl_int status; - int gsize; - size_t localThreads[2]; +static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) { + size_t globalThreads[2]; + cl_mem pixtemp; + cl_int status; + int gsize; + size_t localThreads[2]; - //Horizontal pass - gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX; - globalThreads[0] = gsize; - globalThreads[1] = GROUPSIZE_HMORY; - localThreads[0] = GROUPSIZE_HMORX; - localThreads[1] = GROUPSIZE_HMORY; - - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status ); - CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_5x5"); - - status = clSetKernelArg(rEnv.mpkKernel, - 0, - sizeof(cl_mem), - &pixsCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 1, - sizeof(cl_mem), - &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl); - status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h); + // Horizontal pass + gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX; + globalThreads[0] = gsize; + globalThreads[1] = GROUPSIZE_HMORY; + localThreads[0] = GROUPSIZE_HMORX; + localThreads[1] = GROUPSIZE_HMORY; - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, - nullptr, globalThreads, localThreads, 0, - nullptr, nullptr); + rEnv.mpkKernel = + clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_5x5", &status); + CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_5x5"); - //Swap source and dest buffers - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixtemp; + status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h); - //Vertical - gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; - globalThreads[0] = gsize; - gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; - globalThreads[1] = gsize; - localThreads[0] = GROUPSIZE_X; - localThreads[1] = GROUPSIZE_Y; + status = + clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, + globalThreads, localThreads, 0, nullptr, nullptr); - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status ); - CHECK_OPENCL(status, "clCreateKernel morphoDilateVer_5x5"); - - status = clSetKernelArg(rEnv.mpkKernel, - 0, - sizeof(cl_mem), - &pixsCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 1, - sizeof(cl_mem), - &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl); - status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, - nullptr, globalThreads, localThreads, 0, - nullptr, nullptr); + // Swap source and dest buffers + pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLBuffer; + pixdCLBuffer = pixtemp; - return status; + // Vertical + gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X; + globalThreads[0] = gsize; + gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y; + globalThreads[1] = gsize; + localThreads[0] = GROUPSIZE_X; + localThreads[1] = GROUPSIZE_Y; + + rEnv.mpkKernel = + clCreateKernel(rEnv.mpkProgram, "morphoDilateVer_5x5", &status); + CHECK_OPENCL(status, "clCreateKernel morphoDilateVer_5x5"); + + status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h); + status = + clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, + globalThreads, localThreads, 0, nullptr, nullptr); + + return status; } //Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels -static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) -{ - size_t globalThreads[2]; - cl_mem pixtemp; - cl_int status; - int gsize; - l_uint32 fwmask, lwmask; - size_t localThreads[2]; +static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) { + size_t globalThreads[2]; + cl_mem pixtemp; + cl_int status; + int gsize; + l_uint32 fwmask, lwmask; + size_t localThreads[2]; - lwmask = lmask32[31 - 2]; - fwmask = rmask32[31 - 2]; + lwmask = lmask32[31 - 2]; + fwmask = rmask32[31 - 2]; - //Horizontal pass - gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX; - globalThreads[0] = gsize; - globalThreads[1] = GROUPSIZE_HMORY; - localThreads[0] = GROUPSIZE_HMORX; - localThreads[1] = GROUPSIZE_HMORY; - - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status ); - CHECK_OPENCL(status, "clCreateKernel morphoErodeHor_5x5"); - - status = clSetKernelArg(rEnv.mpkKernel, - 0, - sizeof(cl_mem), - &pixsCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 1, - sizeof(cl_mem), - &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl); - status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h); + // Horizontal pass + gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX; + globalThreads[0] = gsize; + globalThreads[1] = GROUPSIZE_HMORY; + localThreads[0] = GROUPSIZE_HMORX; + localThreads[1] = GROUPSIZE_HMORY; - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, - nullptr, globalThreads, localThreads, 0, - nullptr, nullptr); + rEnv.mpkKernel = + clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_5x5", &status); + CHECK_OPENCL(status, "clCreateKernel morphoErodeHor_5x5"); - //Swap source and dest buffers - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixtemp; + status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h); - //Vertical - gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; - globalThreads[0] = gsize; - gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; - globalThreads[1] = gsize; - localThreads[0] = GROUPSIZE_X; - localThreads[1] = GROUPSIZE_Y; + status = + clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, + globalThreads, localThreads, 0, nullptr, nullptr); - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status ); - CHECK_OPENCL(status, "clCreateKernel morphoErodeVer_5x5"); - - status = clSetKernelArg(rEnv.mpkKernel, - 0, - sizeof(cl_mem), - &pixsCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 1, - sizeof(cl_mem), - &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl); - status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h); - status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), &fwmask); - status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), &lwmask); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, - nullptr, globalThreads, localThreads, 0, - nullptr, nullptr); + // Swap source and dest buffers + pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLBuffer; + pixdCLBuffer = pixtemp; - return status; + // Vertical + gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X; + globalThreads[0] = gsize; + gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y; + globalThreads[1] = gsize; + localThreads[0] = GROUPSIZE_X; + localThreads[1] = GROUPSIZE_Y; + + rEnv.mpkKernel = + clCreateKernel(rEnv.mpkProgram, "morphoErodeVer_5x5", &status); + CHECK_OPENCL(status, "clCreateKernel morphoErodeVer_5x5"); + + status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h); + status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), &fwmask); + status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), &lwmask); + status = + clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, + globalThreads, localThreads, 0, nullptr, nullptr); + + return status; } //Morphology Dilate operation. Invokes the relevant OpenCL kernels -static cl_int -pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) -{ - l_int32 xp, yp, xn, yn; - SEL* sel; - size_t globalThreads[2]; - cl_mem pixtemp; - cl_int status; - int gsize; - size_t localThreads[2]; - char isEven; +static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, + l_int32 h) { + l_int32 xp, yp, xn, yn; + SEL *sel; + size_t globalThreads[2]; + cl_mem pixtemp; + cl_int status; + int gsize; + size_t localThreads[2]; + char isEven; - OpenclDevice::SetKernelEnv( &rEnv ); + OpenclDevice::SetKernelEnv(&rEnv); - if (hsize == 5 && vsize == 5) - { - //Specific case for 5x5 - status = pixDilateCL_55(wpl, h); - return status; - } + if (hsize == 5 && vsize == 5) { + // Specific case for 5x5 + status = pixDilateCL_55(wpl, h); + return status; + } - sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT); + sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT); - selFindMaxTranslations(sel, &xp, &yp, &xn, &yn); - selDestroy(&sel); - //global and local work dimensions for Horizontal pass - gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; - globalThreads[0] = gsize; - gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; - globalThreads[1] = gsize; - localThreads[0] = GROUPSIZE_X; - localThreads[1] = GROUPSIZE_Y; + selFindMaxTranslations(sel, &xp, &yp, &xn, &yn); + selDestroy(&sel); + // global and local work dimensions for Horizontal pass + gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X; + globalThreads[0] = gsize; + gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y; + globalThreads[1] = gsize; + localThreads[0] = GROUPSIZE_X; + localThreads[1] = GROUPSIZE_Y; - if (xp > 31 || xn > 31) - { - // Generic case. - rEnv.mpkKernel = - clCreateKernel(rEnv.mpkProgram, "morphoDilateHor", &status); - CHECK_OPENCL(status, "clCreateKernel morphoDilateHor"); - - status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp); - status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn); - status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl); - status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, - nullptr, globalThreads, localThreads, 0, - nullptr, nullptr); - - if (yp > 0 || yn > 0) { - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixtemp; - } - } - else if (xp > 0 || xn > 0 ) - { - // Specific Horizontal pass kernel for half width < 32 - rEnv.mpkKernel = - clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_32word", &status); - CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_32word"); - isEven = (xp != xn); - - status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp); - status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl); - status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h); - status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), &isEven); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, - nullptr, globalThreads, localThreads, 0, - nullptr, nullptr); - - if (yp > 0 || yn > 0) { - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixtemp; - } + if (xp > 31 || xn > 31) { + // Generic case. + rEnv.mpkKernel = + clCreateKernel(rEnv.mpkProgram, "morphoDilateHor", &status); + CHECK_OPENCL(status, "clCreateKernel morphoDilateHor"); + + status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn); + status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h); + status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, globalThreads, localThreads, 0, + nullptr, nullptr); + + if (yp > 0 || yn > 0) { + pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLBuffer; + pixdCLBuffer = pixtemp; } + } else if (xp > 0 || xn > 0) { + // Specific Horizontal pass kernel for half width < 32 + rEnv.mpkKernel = + clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_32word", &status); + CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_32word"); + isEven = (xp != xn); - if (yp > 0 || yn > 0) - { - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status ); - CHECK_OPENCL(status, "clCreateKernel morphoDilateVer"); - - status = clSetKernelArg(rEnv.mpkKernel, - 0, - sizeof(cl_mem), - &pixsCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 1, - sizeof(cl_mem), - &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp); - status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl); - status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h); - status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), &yn); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, - nullptr, globalThreads, localThreads, 0, - nullptr, nullptr); + status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h); + status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), &isEven); + status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, globalThreads, localThreads, 0, + nullptr, nullptr); + + if (yp > 0 || yn > 0) { + pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLBuffer; + pixdCLBuffer = pixtemp; } + } - return status; + if (yp > 0 || yn > 0) { + rEnv.mpkKernel = + clCreateKernel(rEnv.mpkProgram, "morphoDilateVer", &status); + CHECK_OPENCL(status, "clCreateKernel morphoDilateVer"); + + status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h); + status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), &yn); + status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, globalThreads, localThreads, 0, + nullptr, nullptr); + } + + return status; } //Morphology Erode operation. Invokes the relevant OpenCL kernels -static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) { +static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, + l_uint32 h) { l_int32 xp, yp, xn, yn; SEL *sel; size_t globalThreads[2]; @@ -1609,45 +1576,42 @@ static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) } //Morphology Open operation. Invokes the relevant OpenCL kernels -static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) -{ - cl_int status; - cl_mem pixtemp; +static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) { + cl_int status; + cl_mem pixtemp; - //Erode followed by Dilate - status = pixErodeCL(hsize, vsize, wpl, h); + // Erode followed by Dilate + status = pixErodeCL(hsize, vsize, wpl, h); - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixtemp; + pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLBuffer; + pixdCLBuffer = pixtemp; - status = pixDilateCL(hsize, vsize, wpl, h); + status = pixDilateCL(hsize, vsize, wpl, h); - return status; + return status; } //Morphology Close operation. Invokes the relevant OpenCL kernels -static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) -{ - cl_int status; - cl_mem pixtemp; +static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) { + cl_int status; + cl_mem pixtemp; - //Dilate followed by Erode - status = pixDilateCL(hsize, vsize, wpl, h); + // Dilate followed by Erode + status = pixDilateCL(hsize, vsize, wpl, h); - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixtemp; + pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLBuffer; + pixdCLBuffer = pixtemp; - status = pixErodeCL(hsize, vsize, wpl, h); + status = pixErodeCL(hsize, vsize, wpl, h); - return status; + return status; } //output = buffer1 & ~(buffer2) -static -cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, - cl_mem buffer2, cl_mem outBuffer = nullptr) { +static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, + cl_mem buffer2, cl_mem outBuffer = nullptr) { cl_int status; size_t globalThreads[2]; int gsize; @@ -2055,83 +2019,91 @@ typedef struct _TessScoreEvaluationInputData { Pix *pix; } TessScoreEvaluationInputData; -static void populateTessScoreEvaluationInputData(TessScoreEvaluationInputData *input) { - srand(1); - // 8.5x11 inches @ 300dpi rounded to clean multiples - int height = 3328; // %256 - int width = 2560; // %512 - int numChannels = 4; - input->height = height; - input->width = width; - input->numChannels = numChannels; - unsigned char (*imageData4)[4] = (unsigned char (*)[4]) malloc(height*width*numChannels*sizeof(unsigned char)); // new unsigned char[4][height*width]; - input->imageData = (unsigned char *) &imageData4[0]; - - // zero out image - unsigned char pixelWhite[4] = { 0, 0, 0, 255}; - unsigned char pixelBlack[4] = {255, 255, 255, 255}; - for (int p = 0; p < height*width; p++) { - //unsigned char tmp[4] = imageData4[0]; - imageData4[p][0] = pixelWhite[0]; - imageData4[p][1] = pixelWhite[1]; - imageData4[p][2] = pixelWhite[2]; - imageData4[p][3] = pixelWhite[3]; - } - // random lines to be eliminated - int maxLineWidth = 64; // pixels wide - int numLines = 10; - // vertical lines - for (int i = 0; i < numLines; i++) { - int lineWidth = rand()%maxLineWidth; - int vertLinePos = lineWidth + rand()%(width-2*lineWidth); - //printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth); - for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) { - for (int col = 0; col < height; col++) { - //imageData4[row*width+col] = pixelBlack; - imageData4[row*width+col][0] = pixelBlack[0]; - imageData4[row*width+col][1] = pixelBlack[1]; - imageData4[row*width+col][2] = pixelBlack[2]; - imageData4[row*width+col][3] = pixelBlack[3]; - } - } +static void populateTessScoreEvaluationInputData( + TessScoreEvaluationInputData *input) { + srand(1); + // 8.5x11 inches @ 300dpi rounded to clean multiples + int height = 3328; // %256 + int width = 2560; // %512 + int numChannels = 4; + input->height = height; + input->width = width; + input->numChannels = numChannels; + unsigned char(*imageData4)[4] = (unsigned char(*)[4])malloc( + height * width * numChannels * + sizeof(unsigned char)); // new unsigned char[4][height*width]; + input->imageData = (unsigned char *)&imageData4[0]; + + // zero out image + unsigned char pixelWhite[4] = {0, 0, 0, 255}; + unsigned char pixelBlack[4] = {255, 255, 255, 255}; + for (int p = 0; p < height * width; p++) { + // unsigned char tmp[4] = imageData4[0]; + imageData4[p][0] = pixelWhite[0]; + imageData4[p][1] = pixelWhite[1]; + imageData4[p][2] = pixelWhite[2]; + imageData4[p][3] = pixelWhite[3]; + } + // random lines to be eliminated + int maxLineWidth = 64; // pixels wide + int numLines = 10; + // vertical lines + for (int i = 0; i < numLines; i++) { + int lineWidth = rand() % maxLineWidth; + int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth); + // printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth); + for (int row = vertLinePos - lineWidth / 2; + row < vertLinePos + lineWidth / 2; row++) { + for (int col = 0; col < height; col++) { + // imageData4[row*width+col] = pixelBlack; + imageData4[row * width + col][0] = pixelBlack[0]; + imageData4[row * width + col][1] = pixelBlack[1]; + imageData4[row * width + col][2] = pixelBlack[2]; + imageData4[row * width + col][3] = pixelBlack[3]; + } } - // horizontal lines - for (int i = 0; i < numLines; i++) { - int lineWidth = rand()%maxLineWidth; - int horLinePos = lineWidth + rand()%(height-2*lineWidth); - //printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth); - for (int row = 0; row < width; row++) { - for (int col = horLinePos-lineWidth/2; col < horLinePos+lineWidth/2; col++) { // for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) { - //printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col); - //imageData4[row*width+col] = pixelBlack; - imageData4[row*width+col][0] = pixelBlack[0]; - imageData4[row*width+col][1] = pixelBlack[1]; - imageData4[row*width+col][2] = pixelBlack[2]; - imageData4[row*width+col][3] = pixelBlack[3]; - } - } + } + // horizontal lines + for (int i = 0; i < numLines; i++) { + int lineWidth = rand() % maxLineWidth; + int horLinePos = lineWidth + rand() % (height - 2 * lineWidth); + // printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth); + for (int row = 0; row < width; row++) { + for (int col = horLinePos - lineWidth / 2; + col < horLinePos + lineWidth / 2; + col++) { // for (int row = vertLinePos-lineWidth/2; row < + // vertLinePos+lineWidth/2; row++) { + // printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col); + // imageData4[row*width+col] = pixelBlack; + imageData4[row * width + col][0] = pixelBlack[0]; + imageData4[row * width + col][1] = pixelBlack[1]; + imageData4[row * width + col][2] = pixelBlack[2]; + imageData4[row * width + col][3] = pixelBlack[3]; + } } - // spots (noise, squares) - float fractionBlack = 0.1; // how much of the image should be blackened - int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2); - for (int i = 0; i < numSpots; i++) { - int lineWidth = rand()%maxLineWidth; - int col = lineWidth + rand()%(width-2*lineWidth); - int row = lineWidth + rand()%(height-2*lineWidth); - //printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col ); - for (int r = row-lineWidth/2; r < row+lineWidth/2; r++) { - for (int c = col-lineWidth/2; c < col+lineWidth/2; c++) { - //printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c ); - //imageData4[row*width+col] = pixelBlack; - imageData4[r*width+c][0] = pixelBlack[0]; - imageData4[r*width+c][1] = pixelBlack[1]; - imageData4[r*width+c][2] = pixelBlack[2]; - imageData4[r*width+c][3] = pixelBlack[3]; - } - } + } + // spots (noise, squares) + float fractionBlack = 0.1; // how much of the image should be blackened + int numSpots = + (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2); + for (int i = 0; i < numSpots; i++) { + int lineWidth = rand() % maxLineWidth; + int col = lineWidth + rand() % (width - 2 * lineWidth); + int row = lineWidth + rand() % (height - 2 * lineWidth); + // printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col ); + for (int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) { + for (int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) { + // printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c ); + // imageData4[row*width+col] = pixelBlack; + imageData4[r * width + c][0] = pixelBlack[0]; + imageData4[r * width + c][1] = pixelBlack[1]; + imageData4[r * width + c][2] = pixelBlack[2]; + imageData4[r * width + c][3] = pixelBlack[3]; + } } + } - input->pix = pixCreate(input->width, input->height, 1); + input->pix = pixCreate(input->width, input->height, 1); } typedef struct _TessDeviceScore { @@ -2144,8 +2116,10 @@ typedef struct _TessDeviceScore { * Micro Benchmarks for Device Selection *****************************************************************************/ -static double composeRGBPixelMicroBench(GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type) { - double time = 0; +static double composeRGBPixelMicroBench(GPUEnv *env, + TessScoreEvaluationInputData input, + ds_device_type type) { + double time = 0; #if ON_WINDOWS LARGE_INTEGER freq, time_funct_start, time_funct_end; QueryPerformanceFrequency(&freq); @@ -2226,8 +2200,10 @@ static double composeRGBPixelMicroBench(GPUEnv *env, TessScoreEvaluationInputDat return time; } -static double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) { - double time; +static double histogramRectMicroBench(GPUEnv *env, + TessScoreEvaluationInputData input, + ds_device_type type) { + double time; #if ON_WINDOWS LARGE_INTEGER freq, time_funct_start, time_funct_end; QueryPerformanceFrequency(&freq); @@ -2305,16 +2281,14 @@ static double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData } //Reproducing the ThresholdRectToPix native version -static void ThresholdRectToPix_Native(const unsigned char* imagedata, - int bytes_per_pixel, - int bytes_per_line, - const int* thresholds, - const int* hi_values, - Pix** pix) { - int top = 0; - int left = 0; - int width = pixGetWidth(*pix); - int height = pixGetHeight(*pix); +static void ThresholdRectToPix_Native(const unsigned char *imagedata, + int bytes_per_pixel, int bytes_per_line, + const int *thresholds, + const int *hi_values, Pix **pix) { + int top = 0; + int left = 0; + int width = pixGetWidth(*pix); + int height = pixGetHeight(*pix); *pix = pixCreate(width, height, 1); uint32_t *pixdata = pixGetData(*pix); @@ -2342,8 +2316,10 @@ static void ThresholdRectToPix_Native(const unsigned char* imagedata, } } -static double thresholdRectToPixMicroBench(GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type) { - double time; +static double thresholdRectToPixMicroBench(GPUEnv *env, + TessScoreEvaluationInputData input, + ds_device_type type) { + double time; #if ON_WINDOWS LARGE_INTEGER freq, time_funct_start, time_funct_end; QueryPerformanceFrequency(&freq); @@ -2436,9 +2412,10 @@ static double thresholdRectToPixMicroBench(GPUEnv *env, TessScoreEvaluationInput return time; } -static double getLineMasksMorphMicroBench(GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type) { - - double time = 0; +static double getLineMasksMorphMicroBench(GPUEnv *env, + TessScoreEvaluationInputData input, + ds_device_type type) { + double time = 0; #if ON_WINDOWS LARGE_INTEGER freq, time_funct_start, time_funct_end; QueryPerformanceFrequency(&freq); @@ -2533,19 +2510,22 @@ static double getLineMasksMorphMicroBench(GPUEnv *env, TessScoreEvaluationInputD #include "stdlib.h" // encode score object as byte string -static ds_status serializeScore( ds_device* device, void **serializedScore, unsigned int* serializedScoreSize ) { - *serializedScoreSize = sizeof(TessDeviceScore); - *serializedScore = new unsigned char[*serializedScoreSize]; - memcpy(*serializedScore, device->score, *serializedScoreSize); - return DS_SUCCESS; +static ds_status serializeScore(ds_device *device, void **serializedScore, + unsigned int *serializedScoreSize) { + *serializedScoreSize = sizeof(TessDeviceScore); + *serializedScore = new unsigned char[*serializedScoreSize]; + memcpy(*serializedScore, device->score, *serializedScoreSize); + return DS_SUCCESS; } // parses byte string and stores in score object -static ds_status deserializeScore( ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize ) { - // check that serializedScoreSize == sizeof(TessDeviceScore); - device->score = new TessDeviceScore; - memcpy(device->score, serializedScore, serializedScoreSize); - return DS_SUCCESS; +static ds_status deserializeScore(ds_device *device, + const unsigned char *serializedScore, + unsigned int serializedScoreSize) { + // check that serializedScoreSize == sizeof(TessDeviceScore); + device->score = new TessDeviceScore; + memcpy(device->score, serializedScore, serializedScoreSize); + return DS_SUCCESS; } static ds_status releaseScore(void *score) { @@ -2554,58 +2534,68 @@ static ds_status releaseScore(void *score) { } // evaluate devices -static ds_status evaluateScoreForDevice( ds_device *device, void *inputData) { - // overwrite statuc gpuEnv w/ current device - // so native opencl calls can be used; they use static gpuEnv - printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" ); - GPUEnv *env = nullptr; - if (device->type == DS_DEVICE_OPENCL_DEVICE) { - env = new GPUEnv; - //printf("[DS] populating tmp GPUEnv from device\n"); - populateGPUEnvFromDevice( env, device->oclDeviceID); - env->mnFileCount = 0; //argc; - env->mnKernelCount = 0UL; - //printf("[DS] compiling kernels for tmp GPUEnv\n"); - OpenclDevice::gpuEnv = *env; - OpenclDevice::CompileKernelFile(env, ""); - } - - TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData; - - // pixReadTiff - double composeRGBPixelTime = composeRGBPixelMicroBench( env, *input, device->type ); - - // HistogramRect - double histogramRectTime = histogramRectMicroBench( env, *input, device->type ); - - // ThresholdRectToPix - double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type ); - - // getLineMasks - double getLineMasksMorphTime = getLineMasksMorphMicroBench( env, *input, device->type ); - - - // weigh times (% of cpu time) - // these weights should be the % execution time that the native cpu code took - float composeRGBPixelWeight = 1.2f; - float histogramRectWeight = 2.4f; - float thresholdRectToPixWeight = 4.5f; - float getLineMasksMorphWeight = 5.0f; - - float weightedTime = composeRGBPixelWeight * composeRGBPixelTime + - histogramRectWeight * histogramRectTime + - thresholdRectToPixWeight * thresholdRectToPixTime + - getLineMasksMorphWeight * getLineMasksMorphTime; - device->score = new TessDeviceScore; - ((TessDeviceScore *)device->score)->time = weightedTime; +static ds_status evaluateScoreForDevice(ds_device *device, void *inputData) { + // overwrite statuc gpuEnv w/ current device + // so native opencl calls can be used; they use static gpuEnv + printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, + device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native"); + GPUEnv *env = nullptr; + if (device->type == DS_DEVICE_OPENCL_DEVICE) { + env = new GPUEnv; + // printf("[DS] populating tmp GPUEnv from device\n"); + populateGPUEnvFromDevice(env, device->oclDeviceID); + env->mnFileCount = 0; // argc; + env->mnKernelCount = 0UL; + // printf("[DS] compiling kernels for tmp GPUEnv\n"); + OpenclDevice::gpuEnv = *env; + OpenclDevice::CompileKernelFile(env, ""); + } - printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" ); - printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight ); - printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight ); - printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime, thresholdRectToPixWeight ); - printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight ); - printf("[DS]%25s: %f\n", "Score", ((TessDeviceScore *)device->score)->time ); - return DS_SUCCESS; + TessScoreEvaluationInputData *input = + static_cast(inputData); + + // pixReadTiff + double composeRGBPixelTime = + composeRGBPixelMicroBench(env, *input, device->type); + + // HistogramRect + double histogramRectTime = histogramRectMicroBench(env, *input, device->type); + + // ThresholdRectToPix + double thresholdRectToPixTime = + thresholdRectToPixMicroBench(env, *input, device->type); + + // getLineMasks + double getLineMasksMorphTime = + getLineMasksMorphMicroBench(env, *input, device->type); + + // weigh times (% of cpu time) + // these weights should be the % execution time that the native cpu code took + float composeRGBPixelWeight = 1.2f; + float histogramRectWeight = 2.4f; + float thresholdRectToPixWeight = 4.5f; + float getLineMasksMorphWeight = 5.0f; + + float weightedTime = composeRGBPixelWeight * composeRGBPixelTime + + histogramRectWeight * histogramRectTime + + thresholdRectToPixWeight * thresholdRectToPixTime + + getLineMasksMorphWeight * getLineMasksMorphTime; + device->score = new TessDeviceScore; + ((TessDeviceScore *)device->score)->time = weightedTime; + + printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, + device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native"); + printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, + composeRGBPixelWeight); + printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, + histogramRectWeight); + printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", + thresholdRectToPixTime, thresholdRectToPixWeight); + printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, + getLineMasksMorphWeight); + printf("[DS]%25s: %f\n", "Score", + static_cast(device->score)->time); + return DS_SUCCESS; } // initial call to select device diff --git a/opencl/openclwrapper.h b/opencl/openclwrapper.h index f150e6b5a1..5fe1a50023 100644 --- a/opencl/openclwrapper.h +++ b/opencl/openclwrapper.h @@ -242,7 +242,7 @@ class OpenclDevice /* OpenCL implementations of Morphological operations*/ //Initialiation of OCL buffers used in Morph operations - static int initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix* pixs); + static int initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix *pixs); static void releaseMorphCLBuffers(); static void pixGetLinesCL(Pix *pixd, Pix *pixs, Pix **pix_vline, diff --git a/textord/bbgrid.h b/textord/bbgrid.h index ba3b9d28c2..dcc017d4cf 100644 --- a/textord/bbgrid.h +++ b/textord/bbgrid.h @@ -372,8 +372,8 @@ template class GridSearch { template int SortByBoxLeft(const void* void1, const void* void2) { // The void*s are actually doubly indirected, so get rid of one level. - const BBC* p1 = *static_cast(void1); - const BBC* p2 = *static_cast(void2); + const BBC* p1 = *static_cast(void1); + const BBC* p2 = *static_cast(void2); int result = p1->bounding_box().left() - p2->bounding_box().left(); if (result != 0) return result; @@ -390,8 +390,8 @@ int SortByBoxLeft(const void* void1, const void* void2) { template int SortRightToLeft(const void* void1, const void* void2) { // The void*s are actually doubly indirected, so get rid of one level. - const BBC* p1 = *static_cast(void1); - const BBC* p2 = *static_cast(void2); + const BBC* p1 = *static_cast(void1); + const BBC* p2 = *static_cast(void2); int result = p2->bounding_box().right() - p1->bounding_box().right(); if (result != 0) return result; @@ -408,8 +408,8 @@ int SortRightToLeft(const void* void1, const void* void2) { template int SortByBoxBottom(const void* void1, const void* void2) { // The void*s are actually doubly indirected, so get rid of one level. - const BBC* p1 = *static_cast(void1); - const BBC* p2 = *static_cast(void2); + const BBC* p1 = *static_cast(void1); + const BBC* p2 = *static_cast(void2); int result = p1->bounding_box().bottom() - p2->bounding_box().bottom(); if (result != 0) return result; diff --git a/textord/colpartition.h b/textord/colpartition.h index 811175076c..7d799a5cec 100644 --- a/textord/colpartition.h +++ b/textord/colpartition.h @@ -706,10 +706,8 @@ class ColPartition : public ELIST2_LINK { // Sort function to sort by bounding box. static int SortByBBox(const void* p1, const void* p2) { - const ColPartition* part1 = - *static_cast(p1); - const ColPartition* part2 = - *static_cast(p2); + const ColPartition* part1 = *static_cast(p1); + const ColPartition* part2 = *static_cast(p2); int mid_y1 = part1->bounding_box_.y_middle(); int mid_y2 = part2->bounding_box_.y_middle(); if ((part2->bounding_box_.bottom() <= mid_y1 && diff --git a/textord/drawedg.cpp b/textord/drawedg.cpp index 96c4b55472..0a429b5483 100644 --- a/textord/drawedg.cpp +++ b/textord/drawedg.cpp @@ -1,8 +1,9 @@ /********************************************************************** * File: drawedg.cpp (Formerly drawedge.c) - * Description: Collection of functions to draw things to do with edge detection. - * Author: Ray Smith - * Created: Thu Jun 06 13:29:20 BST 1991 + * Description: Collection of functions to draw things to do with edge + * detection. + * Author: Ray Smith + * Created: Thu Jun 06 13:29:20 BST 1991 * * (C) Copyright 1991, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/textord/makerow.cpp b/textord/makerow.cpp index c4aa55ba58..76b76818ac 100644 --- a/textord/makerow.cpp +++ b/textord/makerow.cpp @@ -813,7 +813,7 @@ void compute_line_occupation( //project blobs int32_t width = blob_box.right() - blob_box.left(); index = blob_box.bottom() - min_y; ASSERT_HOST(index >= 0 && index < line_count); - //count transitions + // count transitions deltas[index] += width; index = blob_box.top() - min_y; ASSERT_HOST(index >= 0 && index < line_count); diff --git a/textord/scanedg.cpp b/textord/scanedg.cpp index a9f27d55b5..84d7656a97 100644 --- a/textord/scanedg.cpp +++ b/textord/scanedg.cpp @@ -19,14 +19,14 @@ #include "scanedg.h" -#include // std::unique_ptr +#include // std::unique_ptr #include "allheaders.h" #include "edgloop.h" #define WHITE_PIX 1 /*thresholded colours */ #define BLACK_PIX 0 - /*W->B->W */ +// Flips between WHITE_PIX and BLACK_PIX. #define FLIP_COLOUR(pix) (1-(pix)) /********************************************************************** @@ -102,9 +102,10 @@ void make_margins( //get a line if (block->poly_block () != NULL) { lines = new PB_LINE_IT (block->poly_block ()); - const std::unique_ptr segments(lines->get_line (y)); + const std::unique_ptr segments( + lines->get_line(y)); if (!segments->empty ()) { - seg_it.set_to_list (segments.get()); + seg_it.set_to_list(segments.get()); seg_it.mark_cycle_pt (); start = seg_it.data ()->x (); xext = seg_it.data ()->y (); @@ -335,7 +336,7 @@ void join_edges(CRACKEDGE *edge1, // edges to join if (edge1->pos.x() + edge1->stepx != edge2->pos.x() || edge1->pos.y() + edge1->stepy != edge2->pos.y()) { CRACKEDGE *tempedge = edge1; - edge1 = edge2; // swap around + edge1 = edge2; // swap around edge2 = tempedge; } diff --git a/textord/tabvector.h b/textord/tabvector.h index b3e37ca601..949fdb1978 100644 --- a/textord/tabvector.h +++ b/textord/tabvector.h @@ -292,8 +292,8 @@ class TabVector : public ELIST2_LINK { // Sort function for E2LIST::sort to sort by sort_key_. static int SortVectorsByKey(const void* v1, const void* v2) { - const TabVector* tv1 = *static_cast(v1); - const TabVector* tv2 = *static_cast(v2); + const TabVector* tv1 = *static_cast(v1); + const TabVector* tv2 = *static_cast(v2); return tv1->sort_key_ - tv2->sort_key_; } diff --git a/training/boxchar.cpp b/training/boxchar.cpp index 0d51178ed1..a8e9c6c313 100644 --- a/training/boxchar.cpp +++ b/training/boxchar.cpp @@ -24,6 +24,7 @@ #include #include +#include #include "fileio.h" #include "genericvector.h" @@ -82,17 +83,16 @@ void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector* boxes) { int prev_i = -1; int max_shift = 0; - for (int i = 0; static_cast(i) < boxes->size(); ++i) { + for (size_t i = 0; i < boxes->size(); ++i) { Box* box = (*boxes)[i]->box_; if (box == nullptr) { - if (prev_i < 0 || prev_i < i - 1 || static_cast(i) + 1 == boxes->size()) { + if (prev_i < 0 || prev_i + 1 < i || i + 1 == boxes->size()) { // Erase null boxes at the start of a line and after another null box. do { delete (*boxes)[i]; boxes->erase(boxes->begin() + i); - --i; - } while (i >= 0 && static_cast(i) + 1 == boxes->size() && - (*boxes)[i]->box_ == nullptr); + if (i == 0) break; + } while (i-- == boxes->size() && (*boxes)[i]->box_ == nullptr); } continue; } @@ -120,7 +120,7 @@ void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules, x = 0; } } - if (prev_i == i - 1) { + if (prev_i + 1 == i) { // New character needed. BoxChar* new_box = new BoxChar("\t", 1); new_box->AddBox(x, y, width, height); @@ -146,7 +146,7 @@ void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector* boxes) { // After InsertNewlines, any remaining null boxes are not newlines, and are // singletons, so add a box to each remaining null box. - for (int i = 1; static_cast(i) + 1 < boxes->size(); ++i) { + for (size_t i = 1; i + 1 < boxes->size(); ++i) { Box* box = (*boxes)[i]->box_; if (box == nullptr) { Box* prev = (*boxes)[i - 1]->box_; @@ -178,8 +178,9 @@ void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules, } // Left becomes the max right of all next boxes forward to the first // space or newline. - for (size_t j = i + 2; j < boxes->size() && (*boxes)[j]->box_ != nullptr && - (*boxes)[j]->ch_ != "\t"; + for (size_t j = i + 2; + j < boxes->size() && (*boxes)[j]->box_ != nullptr && + (*boxes)[j]->ch_ != "\t"; ++j) { next = (*boxes)[j]->box_; if (next->x + next->w > left) { @@ -215,11 +216,12 @@ void BoxChar::ReorderRTLText(std::vector* boxes) { /* static */ bool BoxChar::ContainsMostlyRTL(const std::vector& boxes) { int num_rtl = 0, num_ltr = 0; - for (unsigned int i = 0; i < boxes.size(); ++i) { + for (size_t i = 0; i < boxes.size(); ++i) { // Convert the unichar to UTF32 representation - GenericVector uni_vector; - if (!UNICHAR::UTF8ToUnicode(boxes[i]->ch_.c_str(), &uni_vector)) { - tprintf("Illegal utf8 in boxchar %u string:%s = ", i, + std::vector uni_vector = + UNICHAR::UTF8ToUTF32(boxes[i]->ch_.c_str()); + if (uni_vector.empty()) { + tprintf("Illegal utf8 in boxchar %d string:%s = ", i, boxes[i]->ch_.c_str()); for (size_t c = 0; c < boxes[i]->ch_.size(); ++c) { tprintf(" 0x%x", boxes[i]->ch_[c]); @@ -227,8 +229,8 @@ bool BoxChar::ContainsMostlyRTL(const std::vector& boxes) { tprintf("\n"); continue; } - for (int j = 0; j < uni_vector.size(); ++j) { - UCharDirection dir = u_charDirection(uni_vector[j]); + for (char32 ch : uni_vector) { + UCharDirection dir = u_charDirection(ch); if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC || dir == U_ARABIC_NUMBER) { ++num_rtl; @@ -263,7 +265,8 @@ bool BoxChar::MostlyVertical(const std::vector& boxes) { /* static */ int BoxChar::TotalByteLength(const std::vector& boxes) { int total_length = 0; - for (size_t i = 0; i < boxes.size(); ++i) total_length += boxes[i]->ch_.size(); + for (size_t i = 0; i < boxes.size(); ++i) + total_length += boxes[i]->ch_.size(); return total_length; } diff --git a/training/commontraining.cpp b/training/commontraining.cpp index e080087909..36b79b3ff7 100644 --- a/training/commontraining.cpp +++ b/training/commontraining.cpp @@ -13,6 +13,9 @@ #include "commontraining.h" +#include +#include + #include "allheaders.h" #include "ccutil.h" #include "classify.h" @@ -35,9 +38,6 @@ #include "tprintf.h" #include "unicity_table.h" -#include -#include - using tesseract::CCUtil; using tesseract::IntFeatureSpace; using tesseract::ParamUtils; @@ -369,9 +369,8 @@ void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs, LABELEDLIST char_sample; FEATURE_SET feature_samples; CHAR_DESC char_desc; - int ShortNameToFeatureType_res = ShortNameToFeatureType(feature_defs, feature_name); - assert(0 <= ShortNameToFeatureType_res); - unsigned int feature_type = static_cast(ShortNameToFeatureType_res); + uint32_t feature_type = ShortNameToFeatureType(feature_defs, feature_name); + // Zero out the font_sample_count for all the classes. LIST it = *training_samples; iterate(it) { @@ -485,7 +484,8 @@ CLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LIST FeatureList = nullptr; FEATURE_SET FeatureSet = nullptr; - int desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type); + int32_t desc_index = + ShortNameToFeatureType(FeatureDefs, program_feature_type); N = FeatureDefs.FeatureDesc[desc_index]->NumParams; Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc); diff --git a/training/normstrngs.cpp b/training/normstrngs.cpp index 99f84b78e4..17fe5cf8b7 100644 --- a/training/normstrngs.cpp +++ b/training/normstrngs.cpp @@ -25,37 +25,12 @@ #include "unichar.h" #include "unicode/normalizer2.h" // From libicu #include "unicode/translit.h" // From libicu +#include "unicode/uchar.h" // From libicu #include "unicode/unorm2.h" // From libicu +#include "unicode/uscript.h" // From libicu namespace tesseract { -void UTF8ToUTF32(const char* utf8_str, GenericVector* str32) { - str32->clear(); - str32->reserve(strlen(utf8_str)); - int len = strlen(utf8_str); - int step = 0; - for (int ch = 0; ch < len; ch += step) { - step = UNICHAR::utf8_step(utf8_str + ch); - if (step > 0) { - UNICHAR uni_ch(utf8_str + ch, step); - (*str32) += uni_ch.first_uni(); - } - } -} - -void UTF32ToUTF8(const GenericVector& str32, STRING* utf8_str) { - utf8_str->ensure(str32.length()); - utf8_str->assign("", 0); - for (int i = 0; i < str32.length(); ++i) { - UNICHAR uni_ch(str32[i]); - char *utf8 = uni_ch.utf8_str(); - if (utf8 != nullptr) { - (*utf8_str) += utf8; - delete[] utf8; - } - } -} - bool is_hyphen_punc(const char32 ch) { static const int kNumHyphenPuncUnicodes = 13; static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = { @@ -171,42 +146,33 @@ bool IsOCREquivalent(char32 ch1, char32 ch2) { bool IsValidCodepoint(const char32 ch) { // In the range [0, 0xD800) or [0xE000, 0x10FFFF] - return (static_cast(ch) < 0xD800) - || (ch >= 0xE000 && ch <= 0x10FFFF); + return (static_cast(ch) < 0xD800) || (ch >= 0xE000 && ch <= 0x10FFFF); } bool IsWhitespace(const char32 ch) { - ASSERT_HOST_MSG(IsValidCodepoint(ch), - "Invalid Unicode codepoint: 0x%x\n", ch); + ASSERT_HOST_MSG(IsValidCodepoint(ch), "Invalid Unicode codepoint: 0x%x\n", + ch); return u_isUWhiteSpace(static_cast(ch)); } bool IsUTF8Whitespace(const char* text) { -#if 0 // intent return SpanUTF8Whitespace(text) == strlen(text); -#else // avoiding g++ -Wsign-compare warning - const int res = SpanUTF8Whitespace(text); - assert(0 <= res); - return static_cast(res) == strlen(text); -#endif } -int SpanUTF8Whitespace(const char* text) { +unsigned int SpanUTF8Whitespace(const char* text) { int n_white = 0; for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text)); - it != UNICHAR::end(text, strlen(text)); - ++it) { + it != UNICHAR::end(text, strlen(text)); ++it) { if (!IsWhitespace(*it)) break; n_white += it.utf8_len(); } return n_white; } -int SpanUTF8NotWhitespace(const char* text) { +unsigned int SpanUTF8NotWhitespace(const char* text) { int n_notwhite = 0; for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text)); - it != UNICHAR::end(text, strlen(text)); - ++it) { + it != UNICHAR::end(text, strlen(text)); ++it) { if (IsWhitespace(*it)) break; n_notwhite += it.utf8_len(); } @@ -215,33 +181,31 @@ int SpanUTF8NotWhitespace(const char* text) { bool IsInterchangeValid(const char32 ch) { return IsValidCodepoint(ch) && - !(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters. - !(ch >= 0xFFFE && ch <= 0xFFFF) && - !(ch >= 0x1FFFE && ch <= 0x1FFFF) && - !(ch >= 0x2FFFE && ch <= 0x2FFFF) && - !(ch >= 0x3FFFE && ch <= 0x3FFFF) && - !(ch >= 0x4FFFE && ch <= 0x4FFFF) && - !(ch >= 0x5FFFE && ch <= 0x5FFFF) && - !(ch >= 0x6FFFE && ch <= 0x6FFFF) && - !(ch >= 0x7FFFE && ch <= 0x7FFFF) && - !(ch >= 0x8FFFE && ch <= 0x8FFFF) && - !(ch >= 0x9FFFE && ch <= 0x9FFFF) && - !(ch >= 0xAFFFE && ch <= 0xAFFFF) && - !(ch >= 0xBFFFE && ch <= 0xBFFFF) && - !(ch >= 0xCFFFE && ch <= 0xCFFFF) && - !(ch >= 0xDFFFE && ch <= 0xDFFFF) && - !(ch >= 0xEFFFE && ch <= 0xEFFFF) && - !(ch >= 0xFFFFE && ch <= 0xFFFFF) && - !(ch >= 0x10FFFE && ch <= 0x10FFFF) && - (!u_isISOControl(static_cast(ch)) || - ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r'); + !(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters. + !(ch >= 0xFFFE && ch <= 0xFFFF) && !(ch >= 0x1FFFE && ch <= 0x1FFFF) && + !(ch >= 0x2FFFE && ch <= 0x2FFFF) && + !(ch >= 0x3FFFE && ch <= 0x3FFFF) && + !(ch >= 0x4FFFE && ch <= 0x4FFFF) && + !(ch >= 0x5FFFE && ch <= 0x5FFFF) && + !(ch >= 0x6FFFE && ch <= 0x6FFFF) && + !(ch >= 0x7FFFE && ch <= 0x7FFFF) && + !(ch >= 0x8FFFE && ch <= 0x8FFFF) && + !(ch >= 0x9FFFE && ch <= 0x9FFFF) && + !(ch >= 0xAFFFE && ch <= 0xAFFFF) && + !(ch >= 0xBFFFE && ch <= 0xBFFFF) && + !(ch >= 0xCFFFE && ch <= 0xCFFFF) && + !(ch >= 0xDFFFE && ch <= 0xDFFFF) && + !(ch >= 0xEFFFE && ch <= 0xEFFFF) && + !(ch >= 0xFFFFE && ch <= 0xFFFFF) && + !(ch >= 0x10FFFE && ch <= 0x10FFFF) && + (!u_isISOControl(static_cast(ch)) || ch == '\n' || + ch == '\f' || ch == '\t' || ch == '\r'); } bool IsInterchangeValid7BitAscii(const char32 ch) { - return IsValidCodepoint(ch) && - ch <= 128 && - (!u_isISOControl(static_cast(ch)) || - ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r'); + return IsValidCodepoint(ch) && ch <= 128 && + (!u_isISOControl(static_cast(ch)) || ch == '\n' || + ch == '\f' || ch == '\t' || ch == '\r'); } char32 FullwidthToHalfwidth(const char32 ch) { diff --git a/training/normstrngs.h b/training/normstrngs.h index 6fca3193ab..27f36e0981 100644 --- a/training/normstrngs.h +++ b/training/normstrngs.h @@ -50,7 +50,7 @@ inline STRING NormalizeUTF8String(const char* str8) { return NormalizeUTF8String(false, str8); } -// Apply just the OCR-specific normalizations and return the normalized char. +// Applies just the OCR-specific normalizations and return the normalized char. char32 OCRNormalize(char32 ch); // Returns true if the OCRNormalized ch1 and ch2 are the same. @@ -67,11 +67,11 @@ bool IsUTF8Whitespace(const char* text); // Returns the length of bytes of the prefix of 'text' that have the White_Space // unicode property. -int SpanUTF8Whitespace(const char* text); +unsigned int SpanUTF8Whitespace(const char* text); // Returns the length of bytes of the prefix of 'text' that DO NOT have the // White_Space unicode property. -int SpanUTF8NotWhitespace(const char* text); +unsigned int SpanUTF8NotWhitespace(const char* text); // Returns true if the char is interchange valid i.e. no C0 or C1 control codes // (other than CR LF HT FF) and no non-characters. diff --git a/training/pango_font_info.cpp b/training/pango_font_info.cpp index b0474575ad..07d9077717 100644 --- a/training/pango_font_info.cpp +++ b/training/pango_font_info.cpp @@ -88,6 +88,10 @@ PangoFontInfo::PangoFontInfo(const string& desc) void PangoFontInfo::Clear() { font_size_ = 0; + is_bold_ = false; + is_italic_ = false; + is_smallcaps_ = false; + is_monospace_ = false; family_name_.clear(); font_type_ = UNKNOWN; if (desc_) { @@ -168,6 +172,29 @@ static void ListFontFamilies(PangoFontFamily*** families, pango_font_map_list_families(font_map, families, n_families); } +// Inspects whether a given font family is monospace. If the font is not +// available, it cannot make a decision and returns false by default. +static bool IsMonospaceFontFamily(const char* family_name) { + PangoFontFamily** families = 0; + int n_families = 0; + bool is_monospace = false; + ListFontFamilies(&families, &n_families); + ASSERT_HOST(n_families > 0); + bool found = false; + for (int i = 0; i < n_families; ++i) { + if (!strcasecmp(family_name, pango_font_family_get_name(families[i]))) { + is_monospace = pango_font_family_is_monospace(families[i]); + found = true; + break; + } + } + if (!found) { + tlog(1, "Could not find monospace property of family %s\n", family_name); + } + g_free(families); + return is_monospace; +} + bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) { Clear(); const char* family = pango_font_description_get_family(desc); @@ -180,6 +207,7 @@ bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) { } family_name_ = string(family); desc_ = pango_font_description_copy(desc); + is_monospace_ = IsMonospaceFontFamily(family); // Set font size in points font_size_ = pango_font_description_get_size(desc); @@ -187,6 +215,17 @@ bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) { font_size_ /= PANGO_SCALE; } + PangoStyle style = pango_font_description_get_style(desc); + is_italic_ = (PANGO_STYLE_ITALIC == style || + PANGO_STYLE_OBLIQUE == style); + is_smallcaps_ = (pango_font_description_get_variant(desc) + == PANGO_VARIANT_SMALL_CAPS); + + is_bold_ = (pango_font_description_get_weight(desc) >= PANGO_WEIGHT_BOLD); + // We don't have a way to detect whether a font is of type Fraktur. The fonts + // we currently use all have "Fraktur" in their family name, so we do a + // fragile but functional check for that here. + is_fraktur_ = (strcasestr(family, "Fraktur") != nullptr); return true; } diff --git a/training/pango_font_info.h b/training/pango_font_info.h index f435d04af0..af6ee98512 100644 --- a/training/pango_font_info.h +++ b/training/pango_font_info.h @@ -105,6 +105,11 @@ class PangoFontInfo { const string& family_name() const { return family_name_; } // Size in points (1/72"), rounded to the nearest integer. int font_size() const { return font_size_; } + bool is_bold() const { return is_bold_; } + bool is_italic() const { return is_italic_; } + bool is_smallcaps() const { return is_smallcaps_; } + bool is_monospace() const { return is_monospace_; } + bool is_fraktur() const { return is_fraktur_; } FontTypeEnum font_type() const { return font_type_; } int resolution() const { return resolution_; } @@ -123,6 +128,11 @@ class PangoFontInfo { // Font properties set automatically from parsing the font description name. string family_name_; int font_size_; + bool is_bold_; + bool is_italic_; + bool is_smallcaps_; + bool is_monospace_; + bool is_fraktur_; FontTypeEnum font_type_; // The Pango description that was used to initialize the instance. PangoFontDescription* desc_; diff --git a/training/stringrenderer.cpp b/training/stringrenderer.cpp index 35aca8baee..382b292d81 100644 --- a/training/stringrenderer.cpp +++ b/training/stringrenderer.cpp @@ -141,11 +141,11 @@ void StringRenderer::set_resolution(const int resolution) { } void StringRenderer::set_underline_start_prob(const double frac) { - underline_start_prob_ = min(max(frac, 0.0), 1.0); + underline_start_prob_ = std::min(std::max(frac, 0.0), 1.0); } void StringRenderer::set_underline_continuation_prob(const double frac) { - underline_continuation_prob_ = min(max(frac, 0.0), 1.0); + underline_continuation_prob_ = std::min(std::max(frac, 0.0), 1.0); } StringRenderer::~StringRenderer() { @@ -191,6 +191,7 @@ void StringRenderer::SetLayoutProperties() { int max_height = page_height_ - 2 * v_margin_; tlog(3, "max_width = %d, max_height = %d\n", max_width, max_height); if (vertical_text_) { + using std::swap; swap(max_width, max_height); } pango_layout_set_width(layout_, max_width * PANGO_SCALE); @@ -340,8 +341,7 @@ void StringRenderer::RotatePageBoxes(float rotation) { void StringRenderer::ClearBoxes() { - for (size_t i = 0; i < boxchars_.size(); ++i) - delete boxchars_[i]; + for (size_t i = 0; i < boxchars_.size(); ++i) delete boxchars_[i]; boxchars_.clear(); boxaDestroy(&page_boxes_); } @@ -433,10 +433,10 @@ static void MergeBoxCharsToWords(std::vector* boxchars) { // Compute bounding box union const Box* box = boxchars->at(i)->box(); Box* last_box = last_boxchar->mutable_box(); - int left = min(last_box->x, box->x); - int right = max(last_box->x + last_box->w, box->x + box->w); - int top = min(last_box->y, box->y); - int bottom = max(last_box->y + last_box->h, box->y + box->h); + int left = std::min(last_box->x, box->x); + int right = std::max(last_box->x + last_box->w, box->x + box->w); + int top = std::min(last_box->y, box->y); + int bottom = std::max(last_box->y + last_box->h, box->y + box->h); // Conclude that the word was broken to span multiple lines based on the // size of the merged bounding box in relation to those of the individual // characters seen so far. @@ -523,9 +523,9 @@ void StringRenderer::ComputeClusterBoxes() { "cluster_text:%s start_byte_index:%d\n", cluster_text.c_str(), start_byte_index); if (box_padding_) { - cluster_rect.x = max(0, cluster_rect.x - box_padding_); + cluster_rect.x = std::max(0, cluster_rect.x - box_padding_); cluster_rect.width += 2 * box_padding_; - cluster_rect.y = max(0, cluster_rect.y - box_padding_); + cluster_rect.y = std::max(0, cluster_rect.y - box_padding_); cluster_rect.height += 2 * box_padding_; } if (add_ligatures_) { @@ -865,8 +865,8 @@ int StringRenderer::RenderAllFontsToImage(double min_coverage, tprintf("Total chars = %d\n", total_chars_); } const std::vector& all_fonts = FontUtils::ListAvailableFonts(); - assert(0 <= font_index_); - for (unsigned int i = static_cast(font_index_); i < all_fonts.size(); ++i) { + + for (size_t i = font_index_; i < all_fonts.size(); ++i) { ++font_index_; int raw_score = 0; int ok_chars = diff --git a/training/stringrenderer.h b/training/stringrenderer.h index e1144d4ee7..b6189ced6b 100644 --- a/training/stringrenderer.h +++ b/training/stringrenderer.h @@ -212,7 +212,7 @@ class StringRenderer { // Objects cached for subsequent calls to RenderAllFontsToImage() std::unordered_map char_map_; // Time-saving char histogram. int total_chars_; // Number in the string to be rendered. - int font_index_; // Index of next font to use in font list. + unsigned int font_index_; // Index of next font to use in font list. int last_offset_; // Offset returned from last successful rendering private: diff --git a/training/unicharset_extractor.cpp b/training/unicharset_extractor.cpp index 1e6c35afb3..e9954a4e87 100644 --- a/training/unicharset_extractor.cpp +++ b/training/unicharset_extractor.cpp @@ -38,6 +38,8 @@ #include "unichar.h" #include "unicharset.h" +using tesseract::UNICHAR; + static const char* const kUnicharsetFileName = "unicharset"; UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) { diff --git a/training/unicharset_training_utils.cpp b/training/unicharset_training_utils.cpp index 10582f027f..d16e919af8 100644 --- a/training/unicharset_training_utils.cpp +++ b/training/unicharset_training_utils.cpp @@ -22,13 +22,13 @@ #include #include #include +#include #include "fileio.h" -#include "genericvector.h" #include "icuerrorcode.h" #include "normstrngs.h" #include "statistc.h" -#include "strngs.h" +#include "unichar.h" #include "unicharset.h" #include "unicode/uchar.h" // from libicu #include "unicode/uscript.h" // from libicu @@ -50,8 +50,7 @@ void SetupBasicProperties(bool report_errors, bool decompose, } // Convert the unichar to UTF32 representation - GenericVector uni_vector; - tesseract::UTF8ToUTF32(unichar_str, &uni_vector); + std::vector uni_vector = UNICHAR::UTF8ToUTF32(unichar_str); // Assume that if the property is true for any character in the string, // then it holds for the whole "character". @@ -61,17 +60,12 @@ void SetupBasicProperties(bool report_errors, bool decompose, bool unichar_isdigit = false; bool unichar_ispunct = false; - for (int i = 0; i < uni_vector.size(); ++i) { - if (u_isalpha(uni_vector[i])) - unichar_isalpha = true; - if (u_islower(uni_vector[i])) - unichar_islower = true; - if (u_isupper(uni_vector[i])) - unichar_isupper = true; - if (u_isdigit(uni_vector[i])) - unichar_isdigit = true; - if (u_ispunct(uni_vector[i])) - unichar_ispunct = true; + for (char32 u_ch : uni_vector) { + if (u_isalpha(u_ch)) unichar_isalpha = true; + if (u_islower(u_ch)) unichar_islower = true; + if (u_isupper(u_ch)) unichar_isupper = true; + if (u_isdigit(u_ch)) unichar_isdigit = true; + if (u_ispunct(u_ch)) unichar_ispunct = true; } unicharset->set_isalpha(unichar_id, unichar_isalpha); @@ -88,7 +82,7 @@ void SetupBasicProperties(bool report_errors, bool decompose, // Obtain the lower/upper case if needed and record it in the properties. unicharset->set_other_case(unichar_id, unichar_id); if (unichar_islower || unichar_isupper) { - GenericVector other_case(num_code_points, 0); + std::vector other_case(num_code_points, 0); for (int i = 0; i < num_code_points; ++i) { // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used. // However since they deal with UChars (so need a conversion function @@ -97,8 +91,7 @@ void SetupBasicProperties(bool report_errors, bool decompose, other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) : u_tolower(uni_vector[i]); } - STRING other_case_uch; - tesseract::UTF32ToUTF8(other_case, &other_case_uch); + string other_case_uch = UNICHAR::UTF32ToUTF8(other_case); UNICHAR_ID other_case_id = unicharset->unichar_to_id(other_case_uch.c_str()); if (other_case_id != INVALID_UNICHAR_ID) { @@ -110,7 +103,7 @@ void SetupBasicProperties(bool report_errors, bool decompose, } // Set RTL property and obtain mirror unichar ID from ICU. - GenericVector mirrors(num_code_points, 0); + std::vector mirrors(num_code_points, 0); for (int i = 0; i < num_code_points; ++i) { mirrors[i] = u_charMirror(uni_vector[i]); if (i == 0) { // set directionality to that of the 1st code point @@ -119,8 +112,7 @@ void SetupBasicProperties(bool report_errors, bool decompose, u_charDirection(uni_vector[i]))); } } - STRING mirror_uch; - tesseract::UTF32ToUTF8(mirrors, &mirror_uch); + string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors); UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str()); if (mirror_uch_id != INVALID_UNICHAR_ID) { unicharset->set_mirror(unichar_id, mirror_uch_id); @@ -130,8 +122,8 @@ void SetupBasicProperties(bool report_errors, bool decompose, } // Record normalized version of this unichar. - STRING normed_str = tesseract::NormalizeUTF8String(decompose, unichar_str); - if (unichar_id != 0 && normed_str.length() > 0) { + string normed_str = tesseract::NormalizeUTF8String(decompose, unichar_str); + if (unichar_id != 0 && !normed_str.empty()) { unicharset->set_normed(unichar_id, normed_str.c_str()); } else { unicharset->set_normed(unichar_id, unichar_str); diff --git a/wordrec/language_model.cpp b/wordrec/language_model.cpp index 7075f3d783..17ce78b9df 100644 --- a/wordrec/language_model.cpp +++ b/wordrec/language_model.cpp @@ -43,91 +43,89 @@ const float LanguageModel::kMaxAvgNgramCost = 25.0f; LanguageModel::LanguageModel(const UnicityTable *fontinfo_table, Dict *dict) - : INT_MEMBER(language_model_debug_level, 0, "Language model debug level", - dict->getCCUtil()->params()), - BOOL_INIT_MEMBER(language_model_ngram_on, false, - "Turn on/off the use of character ngram model", - dict->getCCUtil()->params()), - INT_MEMBER(language_model_ngram_order, 8, - "Maximum order of the character ngram model", - dict->getCCUtil()->params()), - INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10, - "Maximum number of prunable (those for which" - " PrunablePath() is true) entries in each viterbi list" - " recorded in BLOB_CHOICEs", - dict->getCCUtil()->params()), - INT_MEMBER(language_model_viterbi_list_max_size, 500, - "Maximum size of viterbi lists recorded in BLOB_CHOICEs", - dict->getCCUtil()->params()), - double_MEMBER(language_model_ngram_small_prob, 0.000001, - "To avoid overly small denominators use this as the " - "floor of the probability returned by the ngram model.", + : INT_MEMBER(language_model_debug_level, 0, "Language model debug level", + dict->getCCUtil()->params()), + BOOL_INIT_MEMBER(language_model_ngram_on, false, + "Turn on/off the use of character ngram model", + dict->getCCUtil()->params()), + INT_MEMBER(language_model_ngram_order, 8, + "Maximum order of the character ngram model", + dict->getCCUtil()->params()), + INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10, + "Maximum number of prunable (those for which" + " PrunablePath() is true) entries in each viterbi list" + " recorded in BLOB_CHOICEs", + dict->getCCUtil()->params()), + INT_MEMBER(language_model_viterbi_list_max_size, 500, + "Maximum size of viterbi lists recorded in BLOB_CHOICEs", + dict->getCCUtil()->params()), + double_MEMBER(language_model_ngram_small_prob, 0.000001, + "To avoid overly small denominators use this as the " + "floor of the probability returned by the ngram model.", + dict->getCCUtil()->params()), + double_MEMBER(language_model_ngram_nonmatch_score, -40.0, + "Average classifier score of a non-matching unichar.", + dict->getCCUtil()->params()), + BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false, + "Use only the first UTF8 step of the given string" + " when computing log probabilities.", dict->getCCUtil()->params()), - double_MEMBER(language_model_ngram_nonmatch_score, -40.0, - "Average classifier score of a non-matching unichar.", - dict->getCCUtil()->params()), - BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false, - "Use only the first UTF8 step of the given string" - " when computing log probabilities.", - dict->getCCUtil()->params()), - double_MEMBER(language_model_ngram_scale_factor, 0.03, - "Strength of the character ngram model relative to the" - " character classifier ", - dict->getCCUtil()->params()), - double_MEMBER(language_model_ngram_rating_factor, 16.0, - "Factor to bring log-probs into the same range as ratings" - " when multiplied by outline length ", - dict->getCCUtil()->params()), - BOOL_MEMBER(language_model_ngram_space_delimited_language, true, - "Words are delimited by space", - dict->getCCUtil()->params()), - INT_MEMBER(language_model_min_compound_length, 3, - "Minimum length of compound words", - dict->getCCUtil()->params()), - double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1, - "Penalty for words not in the frequent word dictionary", - dict->getCCUtil()->params()), - double_MEMBER(language_model_penalty_non_dict_word, 0.15, - "Penalty for non-dictionary words", - dict->getCCUtil()->params()), - double_MEMBER(language_model_penalty_punc, 0.2, - "Penalty for inconsistent punctuation", - dict->getCCUtil()->params()), - double_MEMBER(language_model_penalty_case, 0.1, - "Penalty for inconsistent case", - dict->getCCUtil()->params()), - double_MEMBER(language_model_penalty_script, 0.5, - "Penalty for inconsistent script", - dict->getCCUtil()->params()), - double_MEMBER(language_model_penalty_chartype, 0.3, - "Penalty for inconsistent character type", - dict->getCCUtil()->params()), - // TODO(daria, rays): enable font consistency checking - // after improving font analysis. - double_MEMBER(language_model_penalty_font, 0.00, - "Penalty for inconsistent font", - dict->getCCUtil()->params()), - double_MEMBER(language_model_penalty_spacing, 0.05, - "Penalty for inconsistent spacing", - dict->getCCUtil()->params()), - double_MEMBER(language_model_penalty_increment, 0.01, - "Penalty increment", - dict->getCCUtil()->params()), - INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations", - dict->getCCUtil()->params()), - BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false, - "Use sigmoidal score for certainty", - dict->getCCUtil()->params()), - dawg_args_(nullptr, new DawgPositionVector(), NO_PERM), - fontinfo_table_(fontinfo_table), dict_(dict), - fixed_pitch_(false), max_char_wh_ratio_(0.0), - acceptable_choice_found_(false) { + double_MEMBER(language_model_ngram_scale_factor, 0.03, + "Strength of the character ngram model relative to the" + " character classifier ", + dict->getCCUtil()->params()), + double_MEMBER(language_model_ngram_rating_factor, 16.0, + "Factor to bring log-probs into the same range as ratings" + " when multiplied by outline length ", + dict->getCCUtil()->params()), + BOOL_MEMBER(language_model_ngram_space_delimited_language, true, + "Words are delimited by space", dict->getCCUtil()->params()), + INT_MEMBER(language_model_min_compound_length, 3, + "Minimum length of compound words", + dict->getCCUtil()->params()), + double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1, + "Penalty for words not in the frequent word dictionary", + dict->getCCUtil()->params()), + double_MEMBER(language_model_penalty_non_dict_word, 0.15, + "Penalty for non-dictionary words", + dict->getCCUtil()->params()), + double_MEMBER(language_model_penalty_punc, 0.2, + "Penalty for inconsistent punctuation", + dict->getCCUtil()->params()), + double_MEMBER(language_model_penalty_case, 0.1, + "Penalty for inconsistent case", + dict->getCCUtil()->params()), + double_MEMBER(language_model_penalty_script, 0.5, + "Penalty for inconsistent script", + dict->getCCUtil()->params()), + double_MEMBER(language_model_penalty_chartype, 0.3, + "Penalty for inconsistent character type", + dict->getCCUtil()->params()), + // TODO(daria, rays): enable font consistency checking + // after improving font analysis. + double_MEMBER(language_model_penalty_font, 0.00, + "Penalty for inconsistent font", + dict->getCCUtil()->params()), + double_MEMBER(language_model_penalty_spacing, 0.05, + "Penalty for inconsistent spacing", + dict->getCCUtil()->params()), + double_MEMBER(language_model_penalty_increment, 0.01, "Penalty increment", + dict->getCCUtil()->params()), + INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations", + dict->getCCUtil()->params()), + BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false, + "Use sigmoidal score for certainty", + dict->getCCUtil()->params()), + dawg_args_(nullptr, new DawgPositionVector(), NO_PERM), + fontinfo_table_(fontinfo_table), + dict_(dict), + fixed_pitch_(false), + max_char_wh_ratio_(0.0), + acceptable_choice_found_(false) { ASSERT_HOST(dict_ != NULL); } -LanguageModel::~LanguageModel() { - delete dawg_args_.updated_dawgs; -} +LanguageModel::~LanguageModel() { delete dawg_args_.updated_dawgs; } void LanguageModel::InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, @@ -797,8 +795,7 @@ LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo( // Deal with hyphenated words. if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) { if (language_model_debug_level > 0) tprintf("Hyphenated word found\n"); - return new LanguageModelDawgInfo(dawg_args_.active_dawgs, - COMPOUND_PERM); + return new LanguageModelDawgInfo(dawg_args_.active_dawgs, COMPOUND_PERM); } // Deal with compound words. @@ -811,7 +808,8 @@ LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo( // language_model_min_compound_length if (parent_vse == NULL || word_end || dawg_args_.permuter == COMPOUND_PERM || - parent_vse->length < language_model_min_compound_length) return NULL; + parent_vse->length < language_model_min_compound_length) + return NULL; int i; // Check a that the path terminated before the current character is a word. diff --git a/wordrec/lm_pain_points.cpp b/wordrec/lm_pain_points.cpp index 46a878ba1f..12c70ea712 100644 --- a/wordrec/lm_pain_points.cpp +++ b/wordrec/lm_pain_points.cpp @@ -1,5 +1,5 @@ /////////////////////////////////////////////////////////////////////// -// File: lm_pain_points.cpp +// File: pain_points.cpp // Description: Functions that utilize the knowledge about the properties // of the paths explored by the segmentation search in order // to "pain points" - the locations in the ratings matrix diff --git a/wordrec/lm_state.h b/wordrec/lm_state.h index 987b9818b3..9c41fb240b 100644 --- a/wordrec/lm_state.h +++ b/wordrec/lm_state.h @@ -60,7 +60,7 @@ typedef unsigned char LanguageModelFlagsType; /// letters on a path can be found. struct LanguageModelDawgInfo { LanguageModelDawgInfo(const DawgPositionVector *a, PermuterType pt) - : active_dawgs(*a), permuter(pt) {} + : active_dawgs(*a), permuter(pt) {} DawgPositionVector active_dawgs; PermuterType permuter; }; @@ -125,9 +125,9 @@ struct ViterbiStateEntry : public ELIST_LINK { /// non-increasing order of costs. static int Compare(const void *e1, const void *e2) { const ViterbiStateEntry *ve1 = - *static_cast(e1); + *static_cast(e1); const ViterbiStateEntry *ve2 = - *static_cast(e2); + *static_cast(e2); return (ve1->cost < ve2->cost) ? -1 : 1; } inline bool Consistent() const { diff --git a/wordrec/outlines.cpp b/wordrec/outlines.cpp index f4e6cc17d1..bcbd8adb93 100644 --- a/wordrec/outlines.cpp +++ b/wordrec/outlines.cpp @@ -24,7 +24,7 @@ ******************************************************************************** * Revision 1.2 89/09/15 09:24:41 09:24:41 marks (Mark Seaman) * First released version of Combinatorial splitter code -**/ + **/ /*---------------------------------------------------------------------- I n c l u d e s ----------------------------------------------------------------------*/ diff --git a/wordrec/pieces.cpp b/wordrec/pieces.cpp index 7e5770ce1f..16f7dd2e65 100644 --- a/wordrec/pieces.cpp +++ b/wordrec/pieces.cpp @@ -75,16 +75,16 @@ BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector& seams, template int SortByUnicharID(const void *void1, const void *void2) { - const BLOB_CHOICE *p1 = *static_cast(void1); - const BLOB_CHOICE *p2 = *static_cast(void2); + const BLOB_CHOICE *p1 = *static_cast(void1); + const BLOB_CHOICE *p2 = *static_cast(void2); return p1->unichar_id() - p2->unichar_id(); } template int SortByRating(const void *void1, const void *void2) { - const BLOB_CHOICE *p1 = *static_cast(void1); - const BLOB_CHOICE *p2 = *static_cast(void2); + const BLOB_CHOICE *p1 = *static_cast(void1); + const BLOB_CHOICE *p2 = *static_cast(void2); if (p1->rating() < p2->rating()) return 1;