diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp index ad835a2fdd..4d1f179c1b 100644 --- a/ccstruct/pageres.cpp +++ b/ccstruct/pageres.cpp @@ -1,7 +1,12 @@ /********************************************************************** * File: pageres.cpp (Formerly page_res.c) - * Description: Results classes used by control.c - * Author: Phil Cheatle + * Description: Hierarchy of results classes from PAGE_RES to WERD_RES + * and an iterator class to iterate over the words. + * Main purposes: + * Easy way to iterate over the words without a 3-nested loop. + * Holds data used during word recognition. + * Holds information about alternative spacing paths. + * Author: Phil Cheatle * Created: Tue Sep 22 08:42:49 BST 1992 * * (C) Copyright 1992, Hewlett-Packard Ltd. @@ -1478,8 +1483,6 @@ void PAGE_RES_IT::MakeCurrentWordFuzzy() { WERD* real_word = word_res->word; if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) { real_word->set_flag(W_FUZZY_SP, true); - tprintf("Made word fuzzy at:"); - real_word->bounding_box().print(); if (word_res->combination) { // The next word should be the corresponding part of combo, but we have // already stepped past it, so find it by search. 
@@ -1493,8 +1496,6 @@ void PAGE_RES_IT::MakeCurrentWordFuzzy() { ASSERT_HOST(!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)); real_word->set_flag(W_FUZZY_SP, true); - tprintf("Made part of combo word fuzzy at:"); - real_word->bounding_box().print(); } } } diff --git a/ccutil/tessdatamanager.cpp b/ccutil/tessdatamanager.cpp index 01a048334e..032d5fee61 100644 --- a/ccutil/tessdatamanager.cpp +++ b/ccutil/tessdatamanager.cpp @@ -50,7 +50,10 @@ bool TessdataManager::Init(const char *data_file_name, int debug_level) { ReverseN(&actual_tessdata_num_entries_, sizeof(actual_tessdata_num_entries_)); } - ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES); + if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) { + // For forward compatibility, truncate to the number we can handle. + actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES; + } fread(offset_table_, sizeof(inT64), actual_tessdata_num_entries_, data_file_); if (swap_) { diff --git a/ccutil/unicharset.cpp b/ccutil/unicharset.cpp index 340085b28e..b2b1fae98c 100644 --- a/ccutil/unicharset.cpp +++ b/ccutil/unicharset.cpp @@ -215,34 +215,6 @@ int UNICHARSET::step(const char* str) const { if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0; return lengths[0]; } -// As step except constraining the search to unichar-ids that are -// self-normalized. Unlike step, does not encode the whole string, therefore -// should be used on short strings (like those obtained from -// get_normed_unichar.) -int UNICHARSET::normed_step(const char* str) const { - // Find the length of the first matching unicharset member. - int length = ids.minmatch(str); - if (length == 0) - return 0; // Empty string or illegal char. 
- - while (length <= UNICHAR_LEN) { - if (ids.contains(str, length)) { - int matched_id = unichar_to_id(str, length); - const GenericVector& matched_norms = normed_ids(matched_id); - bool good_start = matched_norms.size() == 1 && - matched_norms[0] == matched_id; - if (str[length] == '\0') { - return good_start ? length : 0; - } - if (normed_step(str + length) > 0) - return length; // This length works! - } else if (str[length] == '\0') { - return 0; // Ran out of string. - } - ++length; - } - return 0; -} // Return whether the given UTF-8 string is encodable with this UNICHARSET. // If not encodable, write the first byte offset which cannot be converted @@ -375,19 +347,13 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const { // stored in the file, and needs to be set when the UNICHARSET is loaded. void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) { unichars[unichar_id].properties.normed_ids.truncate(0); - int length = unichars[unichar_id].properties.normed.length(); - const char* normed_str = unichars[unichar_id].properties.normed.string(); - int step = 0; - for (int offset = 0; offset < length; offset+= step) { - step = normed_step(normed_str + offset); - if (step == 0) { - unichars[unichar_id].properties.normed_ids.truncate(0); - unichars[unichar_id].properties.normed_ids.push_back(unichar_id); - break; - } - int normed_id = unichar_to_id(normed_str + offset, step); - ASSERT_HOST(normed_id >= 0); - unichars[unichar_id].properties.normed_ids.push_back(normed_id); + if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') { + unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE); + } else if (!encode_string(unichars[unichar_id].properties.normed.string(), + true, &unichars[unichar_id].properties.normed_ids, + NULL, NULL)) { + unichars[unichar_id].properties.normed_ids.truncate(0); + unichars[unichar_id].properties.normed_ids.push_back(unichar_id); } } @@ -1015,6 +981,24 @@ void UNICHARSET::set_black_and_whitelist(const char* 
blacklist, } } +// Returns true if there are any repeated unicodes in the normalized +// text of any unichar-id in the unicharset. +bool UNICHARSET::AnyRepeatedUnicodes() const { + int start_id = 0; + if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT; + for (int id = start_id; id < size_used; ++id) { + // Convert to unicodes. + GenericVector unicodes; + if (UNICHAR::UTF8ToUnicode(get_normed_unichar(id), &unicodes) && + unicodes.size() > 1) { + for (int u = 1; u < unicodes.size(); ++u) { + if (unicodes[u - 1] == unicodes[u]) return true; + } + } + } + return false; +} + int UNICHARSET::add_script(const char* script) { for (int i = 0; i < script_table_size_used; ++i) { if (strcmp(script, script_table[i]) == 0) diff --git a/ccutil/unicharset.h b/ccutil/unicharset.h index fcc139cb93..e03544842c 100644 --- a/ccutil/unicharset.h +++ b/ccutil/unicharset.h @@ -190,11 +190,6 @@ class UNICHARSET { // WARNING: this function now encodes the whole string for precision. // Use encode_string in preference to repeatedly calling step. int step(const char* str) const; - // As step except constraining the search to unichar-ids that are - // self-normalized. Unlike step, does not encode the whole string, therefore - // should be used on short strings (like those obtained from - // get_normed_unichar.) - int normed_step(const char* str) const; // Return whether the given UTF-8 string is encodable with this UNICHARSET. // If not encodable, write the first byte offset which cannot be converted @@ -678,6 +673,10 @@ class UNICHARSET { kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0; } + // Returns true if there are any repeated unicodes in the normalized + // text of any unichar-id in the unicharset. + bool AnyRepeatedUnicodes() const; + // Return a pointer to the CHAR_FRAGMENT class if the given // unichar id represents a character fragment. 
const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const { @@ -775,6 +774,7 @@ class UNICHARSET { // Returns normalized version of unichar with the given unichar_id. const char *get_normed_unichar(UNICHAR_ID unichar_id) const { + if (unichar_id == UNICHAR_SPACE && has_special_codes()) return " "; return unichars[unichar_id].properties.normed.string(); } // Returns a vector of UNICHAR_IDs that represent the ids of the normalized