Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code.

Probably breaks the build, due to some inclusion of changes in utf8/32 conversion.
tesseract-ocr · Jul 14, 2017 · da03e4e · da03e4e
1 parent f5c18f7
commit da03e4e
Show file tree

Hide file tree

Showing 80 changed files with 1,061 additions and 1,180 deletions.
diff --git a/api/baseapi.cpp b/api/baseapi.cpp
@@ -41,11 +41,11 @@
 #include <string.h>
 #endif  // _WIN32
 
+#include <fstream>
 #include <iostream>
-#include <string>
 #include <iterator>
-#include <fstream>
-#include <memory> // std::unique_ptr
+#include <memory>  // std::unique_ptr
+#include <string>
 
 #include "allheaders.h"
 
@@ -1540,7 +1540,8 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
     if (bold) hocr_str += "<strong>";
     if (italic) hocr_str += "<em>";
     do {
-      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
+      const std::unique_ptr<const char[]> grapheme(
+          res_it->GetUTF8Text(RIL_SYMBOL));
       if (grapheme && grapheme[0] != 0) {
         hocr_str += HOcrEscape(grapheme.get());
       }
@@ -1662,7 +1663,8 @@ char* TessBaseAPI::GetTSVText(int page_number) {
     if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++;
 
     do {
-      tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
+      tsv_str +=
+          std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
       res_it->Next(RIL_SYMBOL);
     } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
     tsv_str += "\n";  // end of row
@@ -1720,16 +1722,16 @@ char* TessBaseAPI::GetBoxText(int page_number) {
   do {
     int left, top, right, bottom;
     if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
-      const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL));
+      const std::unique_ptr</*non-const*/ char[]> text(
+          it->GetUTF8Text(RIL_SYMBOL));
       // Tesseract uses space for recognition failure. Fix to a reject
       // character, kTesseractReject so we don't create illegal box files.
       for (int i = 0; text[i] != '\0'; ++i) {
         if (text[i] == ' ')
           text[i] = kTesseractReject;
       }
       snprintf(result + output_length, total_length - output_length,
-               "%s %d %d %d %d %d\n",
-               text.get(), left, image_height_ - bottom,
+               "%s %d %d %d %d %d\n", text.get(), left, image_height_ - bottom,
                right, image_height_ - top, page_number);
       output_length += strlen(result + output_length);
       // Just in case...
@@ -2063,8 +2065,7 @@ void TessBaseAPI::End() {
     delete paragraph_models_;
     paragraph_models_ = NULL;
   }
-  if (osd_tesseract_ == tesseract_)
-    osd_tesseract_ = nullptr;
+  if (osd_tesseract_ == tesseract_) osd_tesseract_ = nullptr;
   delete tesseract_;
   tesseract_ = nullptr;
   delete osd_tesseract_;

diff --git a/api/pdfrenderer.cpp b/api/pdfrenderer.cpp
@@ -20,7 +20,7 @@
 #include "config_auto.h"
 #endif
 
-#include <memory> // std::unique_ptr
+#include <memory>  // std::unique_ptr
 #include "allheaders.h"
 #include "baseapi.h"
 #include "math.h"
@@ -457,13 +457,12 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
     STRING pdf_word("");
     int pdf_word_len = 0;
     do {
-      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
+      const std::unique_ptr<const char[]> grapheme(
+          res_it->GetUTF8Text(RIL_SYMBOL));
       if (grapheme && grapheme[0] != '\0') {
-        GenericVector<int> unicodes;
-        UNICHAR::UTF8ToUnicode(grapheme.get(), &unicodes);
+        std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(grapheme.get());
         char utf16[kMaxBytesPerCodepoint];
-        for (int i = 0; i < unicodes.length(); i++) {
-          int code = unicodes[i];
+        for (char32 code : unicodes) {
           if (CodepointToUtf16be(code, utf16)) {
             pdf_word += utf16;
             pdf_word_len++;
@@ -566,13 +565,13 @@ bool TessPDFRenderer::BeginDocumentHandler() {
 
   // CIDTOGIDMAP
   const int kCIDToGIDMapSize = 2 * (1 << 16);
-  const std::unique_ptr</*non-const*/ unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]);
+  const std::unique_ptr<unsigned char[]> cidtogidmap(
+      new unsigned char[kCIDToGIDMapSize]);
   for (int i = 0; i < kCIDToGIDMapSize; i++) {
     cidtogidmap[i] = (i % 2) ? 1 : 0;
   }
   size_t len;
-  unsigned char *comp =
-      zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
+  unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
   n = snprintf(buf, sizeof(buf),
                "5 0 obj\n"
                "<<\n"
@@ -665,8 +664,8 @@ bool TessPDFRenderer::BeginDocumentHandler() {
   fseek(fp, 0, SEEK_END);
   long int size = ftell(fp);
   fseek(fp, 0, SEEK_SET);
-  const std::unique_ptr</*non-const*/ char[]> buffer(new char[size]);
-  if (fread(buffer.get(), 1, size, fp) != static_cast<unsigned long>(size)) {
+  const std::unique_ptr<char[]> buffer(new char[size]);
+  if (fread(buffer.get(), 1, size, fp) != static_cast<size_t>(size)) {
     fclose(fp);
     return false;
   }
@@ -879,11 +878,11 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
   AppendPDFObject(buf);
 
   // CONTENTS
-  const std::unique_ptr</*non-const*/ char[]> pdftext(GetPDFTextObjects(api, width, height));
-  const long pdftext_len = strlen(pdftext.get());
+  const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
+  const size_t pdftext_len = strlen(pdftext.get());
   size_t len;
-  unsigned char *comp_pdftext =
-      zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
+  unsigned char *comp_pdftext = zlibCompress(
+      reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
   long comp_pdftext_len = len;
   n = snprintf(buf, sizeof(buf),
                "%ld 0 obj\n"
@@ -960,11 +959,9 @@ bool TessPDFRenderer::EndDocumentHandler() {
 
   // INFO
   STRING utf16_title = "FEFF";  // byte_order_marker
-  GenericVector<int> unicodes;
-  UNICHAR::UTF8ToUnicode(title(), &unicodes);
+  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
   char utf16[kMaxBytesPerCodepoint];
-  for (int i = 0; i < unicodes.length(); i++) {
-    int code = unicodes[i];
+  for (char32 code : unicodes) {
     if (CodepointToUtf16be(code, utf16)) {
       utf16_title += utf16;
     }

diff --git a/api/renderer.cpp b/api/renderer.cpp
@@ -19,8 +19,8 @@
 #include "config_auto.h"
 #endif
 
-#include <memory> // std::unique_ptr
 #include <string.h>
+#include <memory>  // std::unique_ptr
 #include "baseapi.h"
 #include "genericvector.h"
 #include "renderer.h"

diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp
@@ -1,21 +1,21 @@
 /**********************************************************************
-* File:        tesseractmain.cpp  (Formerly tessedit.c)
-* Description: Main program for merge of tess and editor.
-* Author:                  Ray Smith
-* Created:                 Tue Jan 07 15:21:46 GMT 1992
-*
-* (C) Copyright 1992, Hewlett-Packard Ltd.
-** Licensed under the Apache License, Version 2.0 (the "License");
-** you may not use this file except in compliance with the License.
-** You may obtain a copy of the License at
-** http://www.apache.org/licenses/LICENSE-2.0
-** Unless required by applicable law or agreed to in writing, software
-** distributed under the License is distributed on an "AS IS" BASIS,
-** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-** See the License for the specific language governing permissions and
-** limitations under the License.
-*
-**********************************************************************/
+ * File:        tesseractmain.cpp  (Formerly tessedit.c)
+ * Description: Main program for merge of tess and editor.
+ * Author:                  Ray Smith
+ * Created:                 Tue Jan 07 15:21:46 GMT 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
 
 // Include automatically generated configuration file if running autoconf
 #ifdef HAVE_CONFIG_H
@@ -404,7 +404,7 @@ int main(int argc, char** argv) {
   static GenericVector<STRING> vars_vec;
   static GenericVector<STRING> vars_values;
 
-#ifdef NDEBUG
+#if !defined(DEBUG)
   // Disable debugging and informational messages from Leptonica.
   setMsgSeverity(L_SEVERITY_ERROR);
 #endif
@@ -431,7 +431,7 @@ int main(int argc, char** argv) {
   // first TessBaseAPI must be destructed, DawgCache must be the last object.
   tesseract::Dict::GlobalDawgCache();
 
-  // Avoid memory leak caused by auto variable when exit() is called.
+  // Avoid memory leak caused by auto variable when return is called.
   static tesseract::TessBaseAPI api;
 
   api.SetOutputName(outputbase);

diff --git a/ccmain/control.cpp b/ccmain/control.cpp
@@ -1878,11 +1878,11 @@ BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) {
  *
  * Find the modal font and remove from the stats.
  */
-static void find_modal_font(           //good chars in word
-                     STATS *fonts,     //font stats
-                     inT16 *font_out,   //output font
-                     int8_t *font_count  //output count
-                    ) {
+static void find_modal_font(  // good chars in word
+    STATS* fonts,             // font stats
+    inT16* font_out,          // output font
+    int8_t* font_count        // output count
+) {
   inT16 font;                     //font index
   inT32 count;                   //pile couat
 
@@ -1999,7 +1999,7 @@ void Tesseract::font_recognition_pass(PAGE_RES* page_res) {
     }
   }
   inT16 doc_font;                 // modal font
-  int8_t doc_font_count;           // modal font
+  int8_t doc_font_count;          // modal font
   find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
   if (doc_font_count == 0)
     return;

diff --git a/ccmain/docqual.cpp b/ccmain/docqual.cpp
@@ -511,9 +511,9 @@ BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word,
   int adjusted_len;
   int crunch_mode = 0;
 
-  if ((word->best_choice->unichar_string().length () == 0) ||
-    (strspn (word->best_choice->unichar_string().string(), " ") ==
-    word->best_choice->unichar_string().unsigned_size ()))
+  if ((word->best_choice->unichar_string().length() == 0) ||
+      (strspn(word->best_choice->unichar_string().string(), " ") ==
+       word->best_choice->unichar_string().unsigned_size()))
     crunch_mode = 1;
   else {
     adjusted_len = word->reject_map.length ();

diff --git a/ccmain/equationdetect.cpp b/ccmain/equationdetect.cpp
@@ -116,9 +116,7 @@ EquationDetect::EquationDetect(const char* equ_datapath,
   cps_super_bbox_ = NULL;
 }
 
-EquationDetect::~EquationDetect() {
-  delete(cps_super_bbox_);
-}
+EquationDetect::~EquationDetect() { delete (cps_super_bbox_); }
 
 void EquationDetect::SetLangTesseract(Tesseract* lang_tesseract) {
   lang_tesseract_ = lang_tesseract;
@@ -258,8 +256,8 @@ BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(
 
 void EquationDetect::IdentifySpecialText() {
   // Set configuration for Tesseract::AdaptiveClassifier.
-  equ_tesseract_.tess_cn_matching.set_value(true);  // turn it on
-  equ_tesseract_.tess_bn_matching.set_value(false);
+  equ_tesseract_.tess_cn_matching.set_value(1);  // turn it on
+  equ_tesseract_.tess_bn_matching.set_value(0);
 
   // Set the multiplier to zero for lang_tesseract_ to improve the accuracy.
   int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier;

diff --git a/ccmain/paragraphs.cpp b/ccmain/paragraphs.cpp
@@ -21,7 +21,7 @@
 #endif
 
 #include <ctype.h>
-#include <memory> // std::unique_ptr
+#include <memory>  // std::unique_ptr
 
 #include "genericvector.h"
 #include "helpers.h"

diff --git a/ccmain/paramsd.cpp b/ccmain/paramsd.cpp
@@ -183,10 +183,8 @@ void ParamsEditor::GetPrefixes(const char* s, STRING* level_one,
 
 // Compare two VC objects by their name.
 int ParamContent::Compare(const void* v1, const void* v2) {
-  const ParamContent* one =
-    *static_cast<const ParamContent* const *>(v1);
-  const ParamContent* two =
-    *static_cast<const ParamContent* const *>(v2);
+  const ParamContent* one = *static_cast<const ParamContent* const*>(v1);
+  const ParamContent* two = *static_cast<const ParamContent* const*>(v2);
   return strcmp(one->GetName(), two->GetName());
 }
 

diff --git a/ccmain/pgedit.cpp b/ccmain/pgedit.cpp
@@ -544,7 +544,8 @@ BOOL8 Tesseract::process_cmd_win_event(                 // UI command semantics
       break;
 
     default:
-      sprintf(msg, "Unrecognised event %" PRId32 "(%s)", cmd_event, new_value);
+      snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)",
+               cmd_event, new_value);
       image_win->AddMessage(msg);
     break;
   }

diff --git a/ccmain/thresholder.cpp b/ccmain/thresholder.cpp
@@ -311,8 +311,8 @@ void ImageThresholder::ThresholdRectToPix(Pix* src_pix,
     for (int x = 0; x < rect_width_; ++x) {
       bool white_result = true;
       for (int ch = 0; ch < num_channels; ++ch) {
-        int pixel = GET_DATA_BYTE(linedata,
-                                  (x + rect_left_) * num_channels + ch);
+        int pixel =
+            GET_DATA_BYTE(linedata, (x + rect_left_) * num_channels + ch);
         if (hi_values[ch] >= 0 &&
             (pixel > thresholds[ch]) == (hi_values[ch] == 0)) {
           white_result = false;

diff --git a/ccstruct/boxread.cpp b/ccstruct/boxread.cpp
@@ -206,7 +206,7 @@ bool ParseBoxFileStr(const char* boxfile_str, int* page_number,
   // Validate UTF8 by making unichars with it.
   int used = 0;
   while (used < uch_len) {
-    UNICHAR ch(uch + used, uch_len - used);
+    tesseract::UNICHAR ch(uch + used, uch_len - used);
     int new_used = ch.utf8_len();
     if (new_used == 0) {
       tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n",

diff --git a/ccstruct/coutln.cpp b/ccstruct/coutln.cpp
@@ -652,22 +652,10 @@ static void ComputeGradient(const l_uint32* data, int wpl,
                             int x, int y, int width, int height,
                             ICOORD* gradient) {
   const l_uint32* line = data + y * wpl;
-  int pix_x_y =
-      x < width && y < height
-          ? GET_DATA_BYTE(line, x)
-          : 255;
-  int pix_x_prevy =
-      x < width && y > 0
-          ? GET_DATA_BYTE(line - wpl, x)
-          : 255;
-  int pix_prevx_prevy =
-      x > 0 && y > 0
-          ? GET_DATA_BYTE(line - wpl, x - 1)
-          : 255;
-  int pix_prevx_y =
-      x > 0 && y < height
-          ? GET_DATA_BYTE(line, x - 1)
-          : 255;
+  int pix_x_y = x < width && y < height ? GET_DATA_BYTE(line, x) : 255;
+  int pix_x_prevy = x < width && y > 0 ? GET_DATA_BYTE(line - wpl, x) : 255;
+  int pix_prevx_prevy = x > 0 && y > 0 ? GET_DATA_BYTE(line - wpl, x - 1) : 255;
+  int pix_prevx_y = x > 0 && y < height ? GET_DATA_BYTE(line, x - 1) : 255;
   gradient->set_x(pix_x_y + pix_x_prevy - (pix_prevx_y + pix_prevx_prevy));
   gradient->set_y(pix_x_prevy + pix_prevx_prevy - (pix_x_y + pix_prevx_y));
 }

diff --git a/ccstruct/coutln.h b/ccstruct/coutln.h
@@ -1,7 +1,7 @@
 /**********************************************************************
- * File:					coutln.h      (Formerly:  coutline.c)
- * Description: Code for the C_OUTLINE class.
- * Author:					Ray Smith
+ * File:					coutln.h      (Formerly:
+ *coutline.c) Description: Code for the C_OUTLINE class. Author:
+ *Ray Smith
  * Created:					Mon Oct 07 16:01:57 BST 1991
  *
  * (C) Copyright 1991, Hewlett-Packard Ltd.