Skip to content

Commit

Permalink
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, …
Browse files Browse the repository at this point in the history
…undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion
  • Loading branch information
theraysmith committed Jul 14, 2017
1 parent f5c18f7 commit da03e4e
Show file tree
Hide file tree
Showing 80 changed files with 1,061 additions and 1,180 deletions.
21 changes: 11 additions & 10 deletions api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@
#include <string.h>
#endif // _WIN32

#include <fstream>
#include <iostream>
#include <string>
#include <iterator>
#include <fstream>
#include <memory> // std::unique_ptr
#include <memory> // std::unique_ptr
#include <string>

#include "allheaders.h"

Expand Down Expand Up @@ -1540,7 +1540,8 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
if (bold) hocr_str += "<strong>";
if (italic) hocr_str += "<em>";
do {
const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
const std::unique_ptr<const char[]> grapheme(
res_it->GetUTF8Text(RIL_SYMBOL));
if (grapheme && grapheme[0] != 0) {
hocr_str += HOcrEscape(grapheme.get());
}
Expand Down Expand Up @@ -1662,7 +1663,8 @@ char* TessBaseAPI::GetTSVText(int page_number) {
if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++;

do {
tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
tsv_str +=
std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
tsv_str += "\n"; // end of row
Expand Down Expand Up @@ -1720,16 +1722,16 @@ char* TessBaseAPI::GetBoxText(int page_number) {
do {
int left, top, right, bottom;
if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL));
const std::unique_ptr</*non-const*/ char[]> text(
it->GetUTF8Text(RIL_SYMBOL));
// Tesseract uses space for recognition failure. Fix to a reject
// character, kTesseractReject so we don't create illegal box files.
for (int i = 0; text[i] != '\0'; ++i) {
if (text[i] == ' ')
text[i] = kTesseractReject;
}
snprintf(result + output_length, total_length - output_length,
"%s %d %d %d %d %d\n",
text.get(), left, image_height_ - bottom,
"%s %d %d %d %d %d\n", text.get(), left, image_height_ - bottom,
right, image_height_ - top, page_number);
output_length += strlen(result + output_length);
// Just in case...
Expand Down Expand Up @@ -2063,8 +2065,7 @@ void TessBaseAPI::End() {
delete paragraph_models_;
paragraph_models_ = NULL;
}
if (osd_tesseract_ == tesseract_)
osd_tesseract_ = nullptr;
if (osd_tesseract_ == tesseract_) osd_tesseract_ = nullptr;
delete tesseract_;
tesseract_ = nullptr;
delete osd_tesseract_;
Expand Down
35 changes: 16 additions & 19 deletions api/pdfrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include "config_auto.h"
#endif

#include <memory> // std::unique_ptr
#include <memory> // std::unique_ptr
#include "allheaders.h"
#include "baseapi.h"
#include "math.h"
Expand Down Expand Up @@ -457,13 +457,12 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
STRING pdf_word("");
int pdf_word_len = 0;
do {
const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
const std::unique_ptr<const char[]> grapheme(
res_it->GetUTF8Text(RIL_SYMBOL));
if (grapheme && grapheme[0] != '\0') {
GenericVector<int> unicodes;
UNICHAR::UTF8ToUnicode(grapheme.get(), &unicodes);
std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(grapheme.get());
char utf16[kMaxBytesPerCodepoint];
for (int i = 0; i < unicodes.length(); i++) {
int code = unicodes[i];
for (char32 code : unicodes) {
if (CodepointToUtf16be(code, utf16)) {
pdf_word += utf16;
pdf_word_len++;
Expand Down Expand Up @@ -566,13 +565,13 @@ bool TessPDFRenderer::BeginDocumentHandler() {

// CIDTOGIDMAP
const int kCIDToGIDMapSize = 2 * (1 << 16);
const std::unique_ptr</*non-const*/ unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]);
const std::unique_ptr<unsigned char[]> cidtogidmap(
new unsigned char[kCIDToGIDMapSize]);
for (int i = 0; i < kCIDToGIDMapSize; i++) {
cidtogidmap[i] = (i % 2) ? 1 : 0;
}
size_t len;
unsigned char *comp =
zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
n = snprintf(buf, sizeof(buf),
"5 0 obj\n"
"<<\n"
Expand Down Expand Up @@ -665,8 +664,8 @@ bool TessPDFRenderer::BeginDocumentHandler() {
fseek(fp, 0, SEEK_END);
long int size = ftell(fp);
fseek(fp, 0, SEEK_SET);
const std::unique_ptr</*non-const*/ char[]> buffer(new char[size]);
if (fread(buffer.get(), 1, size, fp) != static_cast<unsigned long>(size)) {
const std::unique_ptr<char[]> buffer(new char[size]);
if (fread(buffer.get(), 1, size, fp) != static_cast<size_t>(size)) {
fclose(fp);
return false;
}
Expand Down Expand Up @@ -879,11 +878,11 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
AppendPDFObject(buf);

// CONTENTS
const std::unique_ptr</*non-const*/ char[]> pdftext(GetPDFTextObjects(api, width, height));
const long pdftext_len = strlen(pdftext.get());
const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
const size_t pdftext_len = strlen(pdftext.get());
size_t len;
unsigned char *comp_pdftext =
zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
unsigned char *comp_pdftext = zlibCompress(
reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
long comp_pdftext_len = len;
n = snprintf(buf, sizeof(buf),
"%ld 0 obj\n"
Expand Down Expand Up @@ -960,11 +959,9 @@ bool TessPDFRenderer::EndDocumentHandler() {

// INFO
STRING utf16_title = "FEFF"; // byte_order_marker
GenericVector<int> unicodes;
UNICHAR::UTF8ToUnicode(title(), &unicodes);
std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
char utf16[kMaxBytesPerCodepoint];
for (int i = 0; i < unicodes.length(); i++) {
int code = unicodes[i];
for (char32 code : unicodes) {
if (CodepointToUtf16be(code, utf16)) {
utf16_title += utf16;
}
Expand Down
2 changes: 1 addition & 1 deletion api/renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
#include "config_auto.h"
#endif

#include <memory> // std::unique_ptr
#include <string.h>
#include <memory> // std::unique_ptr
#include "baseapi.h"
#include "genericvector.h"
#include "renderer.h"
Expand Down
38 changes: 19 additions & 19 deletions api/tesseractmain.cpp
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
/**********************************************************************
* File: tesseractmain.cpp (Formerly tessedit.c)
* Description: Main program for merge of tess and editor.
* Author: Ray Smith
* Created: Tue Jan 07 15:21:46 GMT 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
* File: tesseractmain.cpp (Formerly tessedit.c)
* Description: Main program for merge of tess and editor.
* Author: Ray Smith
* Created: Tue Jan 07 15:21:46 GMT 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/

// Include automatically generated configuration file if running autoconf
#ifdef HAVE_CONFIG_H
Expand Down Expand Up @@ -404,7 +404,7 @@ int main(int argc, char** argv) {
static GenericVector<STRING> vars_vec;
static GenericVector<STRING> vars_values;

#ifdef NDEBUG
#if !defined(DEBUG)
// Disable debugging and informational messages from Leptonica.
setMsgSeverity(L_SEVERITY_ERROR);
#endif
Expand All @@ -431,7 +431,7 @@ int main(int argc, char** argv) {
// first TessBaseAPI must be destructed, DawgCache must be the last object.
tesseract::Dict::GlobalDawgCache();

// Avoid memory leak caused by auto variable when exit() is called.
// Avoid memory leak caused by auto variable when return is called.
static tesseract::TessBaseAPI api;

api.SetOutputName(outputbase);
Expand Down
12 changes: 6 additions & 6 deletions ccmain/control.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1878,11 +1878,11 @@ BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) {
*
* Find the modal font and remove from the stats.
*/
static void find_modal_font( //good chars in word
STATS *fonts, //font stats
inT16 *font_out, //output font
int8_t *font_count //output count
) {
static void find_modal_font( // good chars in word
STATS* fonts, // font stats
inT16* font_out, // output font
int8_t* font_count // output count
) {
inT16 font; //font index
inT32 count; //pile count

Expand Down Expand Up @@ -1999,7 +1999,7 @@ void Tesseract::font_recognition_pass(PAGE_RES* page_res) {
}
}
inT16 doc_font; // modal font
int8_t doc_font_count; // modal font
int8_t doc_font_count; // modal font
find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
if (doc_font_count == 0)
return;
Expand Down
6 changes: 3 additions & 3 deletions ccmain/docqual.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,9 +511,9 @@ BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word,
int adjusted_len;
int crunch_mode = 0;

if ((word->best_choice->unichar_string().length () == 0) ||
(strspn (word->best_choice->unichar_string().string(), " ") ==
word->best_choice->unichar_string().unsigned_size ()))
if ((word->best_choice->unichar_string().length() == 0) ||
(strspn(word->best_choice->unichar_string().string(), " ") ==
word->best_choice->unichar_string().unsigned_size()))
crunch_mode = 1;
else {
adjusted_len = word->reject_map.length ();
Expand Down
8 changes: 3 additions & 5 deletions ccmain/equationdetect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,7 @@ EquationDetect::EquationDetect(const char* equ_datapath,
cps_super_bbox_ = NULL;
}

EquationDetect::~EquationDetect() {
delete(cps_super_bbox_);
}
EquationDetect::~EquationDetect() { delete (cps_super_bbox_); }

void EquationDetect::SetLangTesseract(Tesseract* lang_tesseract) {
lang_tesseract_ = lang_tesseract;
Expand Down Expand Up @@ -258,8 +256,8 @@ BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(

void EquationDetect::IdentifySpecialText() {
// Set configuration for Tesseract::AdaptiveClassifier.
equ_tesseract_.tess_cn_matching.set_value(true); // turn it on
equ_tesseract_.tess_bn_matching.set_value(false);
equ_tesseract_.tess_cn_matching.set_value(1); // turn it on
equ_tesseract_.tess_bn_matching.set_value(0);

// Set the multiplier to zero for lang_tesseract_ to improve the accuracy.
int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier;
Expand Down
2 changes: 1 addition & 1 deletion ccmain/paragraphs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#endif

#include <ctype.h>
#include <memory> // std::unique_ptr
#include <memory> // std::unique_ptr

#include "genericvector.h"
#include "helpers.h"
Expand Down
6 changes: 2 additions & 4 deletions ccmain/paramsd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,8 @@ void ParamsEditor::GetPrefixes(const char* s, STRING* level_one,

// Compare two VC objects by their name.
int ParamContent::Compare(const void* v1, const void* v2) {
const ParamContent* one =
*static_cast<const ParamContent* const *>(v1);
const ParamContent* two =
*static_cast<const ParamContent* const *>(v2);
const ParamContent* one = *static_cast<const ParamContent* const*>(v1);
const ParamContent* two = *static_cast<const ParamContent* const*>(v2);
return strcmp(one->GetName(), two->GetName());
}

Expand Down
3 changes: 2 additions & 1 deletion ccmain/pgedit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,8 @@ BOOL8 Tesseract::process_cmd_win_event( // UI command semantics
break;

default:
sprintf(msg, "Unrecognised event %" PRId32 "(%s)", cmd_event, new_value);
snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)",
cmd_event, new_value);
image_win->AddMessage(msg);
break;
}
Expand Down
4 changes: 2 additions & 2 deletions ccmain/thresholder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,8 +311,8 @@ void ImageThresholder::ThresholdRectToPix(Pix* src_pix,
for (int x = 0; x < rect_width_; ++x) {
bool white_result = true;
for (int ch = 0; ch < num_channels; ++ch) {
int pixel = GET_DATA_BYTE(linedata,
(x + rect_left_) * num_channels + ch);
int pixel =
GET_DATA_BYTE(linedata, (x + rect_left_) * num_channels + ch);
if (hi_values[ch] >= 0 &&
(pixel > thresholds[ch]) == (hi_values[ch] == 0)) {
white_result = false;
Expand Down
2 changes: 1 addition & 1 deletion ccstruct/boxread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ bool ParseBoxFileStr(const char* boxfile_str, int* page_number,
// Validate UTF8 by making unichars with it.
int used = 0;
while (used < uch_len) {
UNICHAR ch(uch + used, uch_len - used);
tesseract::UNICHAR ch(uch + used, uch_len - used);
int new_used = ch.utf8_len();
if (new_used == 0) {
tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n",
Expand Down
20 changes: 4 additions & 16 deletions ccstruct/coutln.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -652,22 +652,10 @@ static void ComputeGradient(const l_uint32* data, int wpl,
int x, int y, int width, int height,
ICOORD* gradient) {
const l_uint32* line = data + y * wpl;
int pix_x_y =
x < width && y < height
? GET_DATA_BYTE(line, x)
: 255;
int pix_x_prevy =
x < width && y > 0
? GET_DATA_BYTE(line - wpl, x)
: 255;
int pix_prevx_prevy =
x > 0 && y > 0
? GET_DATA_BYTE(line - wpl, x - 1)
: 255;
int pix_prevx_y =
x > 0 && y < height
? GET_DATA_BYTE(line, x - 1)
: 255;
int pix_x_y = x < width && y < height ? GET_DATA_BYTE(line, x) : 255;
int pix_x_prevy = x < width && y > 0 ? GET_DATA_BYTE(line - wpl, x) : 255;
int pix_prevx_prevy = x > 0 && y > 0 ? GET_DATA_BYTE(line - wpl, x - 1) : 255;
int pix_prevx_y = x > 0 && y < height ? GET_DATA_BYTE(line, x - 1) : 255;
gradient->set_x(pix_x_y + pix_x_prevy - (pix_prevx_y + pix_prevx_prevy));
gradient->set_y(pix_x_prevy + pix_prevx_prevy - (pix_x_y + pix_prevx_y));
}
Expand Down
6 changes: 3 additions & 3 deletions ccstruct/coutln.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**********************************************************************
* File: coutln.h (Formerly: coutline.c)
* Description: Code for the C_OUTLINE class.
* Author: Ray Smith
* File: coutln.h (Formerly:
*coutline.c) Description: Code for the C_OUTLINE class. Author:
*Ray Smith
* Created: Mon Oct 07 16:01:57 BST 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
Expand Down
Loading

0 comments on commit da03e4e

Please sign in to comment.