diff --git a/ccutil/hashfn.h b/ccutil/hashfn.h index be211b0731..ec96932107 100644 --- a/ccutil/hashfn.h +++ b/ccutil/hashfn.h @@ -20,16 +20,15 @@ #ifndef HASHFN_H #define HASHFN_H -#ifdef USE_STD_NAMESPACE #if (__cplusplus >= 201103L) || defined(_MSC_VER) // Visual Studio #include #include -#define hash_map std::unordered_map -#if (_MSC_VER >= 1500 && _MSC_VER < 1600) // Visual Studio 2008 -using namespace std::tr1; +#if defined(_MSC_VER) && (_MSC_VER >= 1500 && _MSC_VER < 1600) // VS 2008 +#define TessHashMap std::tr1::unordered_map +#define TessHashSet std::tr1::unordered_set #else // _MSC_VER -using std::unordered_map; -using std::unordered_set; +#define TessHashMap std::unordered_map +#define TessHashSet std::unordered_set #include #define SmartPtr std::unique_ptr #define HAVE_UNIQUE_PTR @@ -41,23 +40,14 @@ using std::unordered_set; #include using __gnu_cxx::hash_map; using __gnu_cxx::hash_set; -#define unordered_map hash_map -#define unordered_set hash_set +#define TessHashMap __gnu_cxx::hash_map +#define TessHashSet __gnu_cxx::hash_set #else #include #include +#define TessHashMap hash_map +#define TessHashSet :hash_set #endif // gcc -#elif (__clang__) -#include -#include -#define hash_map std::unordered_map -#define unordered_set std::unordered_set -#else // USE_STD_NAMESPACE -#include -#include -#define unordered_map hash_map -#define unordered_set hash_set -#endif // USE_STD_NAMESPACE #ifndef HAVE_UNIQUE_PTR // Trivial smart ptr. Expand to add features of std::unique_ptr as required. diff --git a/textord/bbgrid.h b/textord/bbgrid.h index cde4b93a6d..066b5bae1e 100644 --- a/textord/bbgrid.h +++ b/textord/bbgrid.h @@ -364,7 +364,7 @@ template class GridSearch { // An iterator over the list at (x_, y_) in the grid_. BBC_C_IT it_; // Set of unique returned elements used when unique_mode_ is true. - unordered_set > returns_; + TessHashSet > returns_; }; // Sort function to sort a BBC by bounding_box().left(). diff --git a/training/ligature_table.h b/training/ligature_table.h index ecae7943dd..83e7dc3c4d 100644 --- a/training/ligature_table.h +++ b/training/ligature_table.h @@ -32,7 +32,7 @@ namespace tesseract { class PangoFontInfo; // defined in pango_font_info.h // Map to substitute strings for ligatures. -typedef hash_map LigHash; +typedef TessHashMap LigHash; class LigatureTable { public: diff --git a/training/pango_font_info.cpp b/training/pango_font_info.cpp index 6a009f7b85..9a4d44ff2c 100644 --- a/training/pango_font_info.cpp +++ b/training/pango_font_info.cpp @@ -60,15 +60,6 @@ STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", "Overrides fontconfig default temporary dir"); -BOOL_PARAM_FLAG(fontconfig_refresh_cache, false, - "Does a one-time deletion of cache files from the " - "fontconfig_tmpdir before initializing fontconfig."); -BOOL_PARAM_FLAG(fontconfig_refresh_config_file, true, - "Does a one-time reset of the fontconfig config file to point" - " to fonts_dir before initializing fontconfig. Set to true" - " if fontconfig_refresh_cache is true. Set it to false to use" - " multiple instances in separate processes without having to" - " rescan the fonts_dir, using a previously setup font cache"); #ifndef USE_STD_NAMESPACE #include "ocr/trainingdata/typesetting/legacy_fonts.h" @@ -91,7 +82,8 @@ namespace tesseract { // in pixels. const int kDefaultResolution = 300; -bool PangoFontInfo::fontconfig_initialized_ = false; +string PangoFontInfo::fonts_dir_; +string PangoFontInfo::cache_dir_; PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) { Clear(); @@ -119,6 +111,8 @@ void PangoFontInfo::Clear() { } } +PangoFontInfo::~PangoFontInfo() { pango_font_description_free(desc_); } + string PangoFontInfo::DescriptionName() const { if (!desc_) return ""; char* desc_str = pango_font_description_to_string(desc_); @@ -127,59 +121,62 @@ string PangoFontInfo::DescriptionName() const { return desc_name; } -// Initializes Fontconfig for use by writing a fake fonts.conf file into the -// FLAGS_fontconfigs_tmpdir directory, that points to the supplied -// fonts_dir, and then overrides the FONTCONFIG_PATH environment variable -// to point to this fonts.conf file. If force_clear, the cache is refreshed -// even if it has already been initialized. -void PangoFontInfo::InitFontConfig(bool force_clear, const string& fonts_dir) { - if ((fontconfig_initialized_ && !force_clear) || fonts_dir.empty()) { - fontconfig_initialized_ = true; - return; - } - if (FLAGS_fontconfig_refresh_cache || force_clear) { - File::DeleteMatchingFiles(File::JoinPath( - FLAGS_fontconfig_tmpdir.c_str(), "*cache-?").c_str()); - } - if (FLAGS_fontconfig_refresh_config_file || FLAGS_fontconfig_refresh_cache || - force_clear) { - const int MAX_FONTCONF_FILESIZE = 1024; - char fonts_conf_template[MAX_FONTCONF_FILESIZE]; - snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE, - "\n" - "\n" - "\n" - "%s\n" - "%s\n" - "\n" - "", fonts_dir.c_str(), - FLAGS_fontconfig_tmpdir.c_str()); - string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(), - "fonts.conf"); - File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file); +// If not already initialized, initializes FontConfig by setting its +// environment variable and creating a fonts.conf file that points to the +// FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir. +/* static */ +void PangoFontInfo::SoftInitFontConfig() { + if (fonts_dir_.empty()) { + HardInitFontConfig(FLAGS_fonts_dir, FLAGS_fontconfig_tmpdir); } +} + +// Re-initializes font config, whether or not already initialized. +// If already initialized, any existing cache is deleted, just to be sure. +/* static */ +void PangoFontInfo::HardInitFontConfig(const string& fonts_dir, + const string& cache_dir) { + if (!cache_dir_.empty()) { + File::DeleteMatchingFiles( + File::JoinPath(cache_dir_.c_str(), "*cache-?").c_str()); + } + const int MAX_FONTCONF_FILESIZE = 1024; + char fonts_conf_template[MAX_FONTCONF_FILESIZE]; + cache_dir_ = cache_dir; + fonts_dir_ = fonts_dir; + snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE, + "\n" + "\n" + "\n" + "%s\n" + "%s\n" + "\n" + "", + fonts_dir.c_str(), cache_dir_.c_str()); + string fonts_conf_file = File::JoinPath(cache_dir_.c_str(), "fonts.conf"); + File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file); #ifdef _WIN32 std::string env("FONTCONFIG_PATH="); - env.append(FLAGS_fontconfig_tmpdir.c_str()); + env.append(cache_dir_.c_str()); putenv(env.c_str()); putenv("LANG=en_US.utf8"); #else - setenv("FONTCONFIG_PATH", FLAGS_fontconfig_tmpdir.c_str(), true); + setenv("FONTCONFIG_PATH", cache_dir_.c_str(), true); // Fix the locale so that the reported font names are consistent. setenv("LANG", "en_US.utf8", true); #endif // _WIN32 - if (!fontconfig_initialized_ || force_clear) { - if (FcInitReinitialize() != FcTrue) { - tprintf("FcInitiReinitialize failed!!\n"); - } + + if (FcInitReinitialize() != FcTrue) { + tprintf("FcInitiReinitialize failed!!\n"); } - fontconfig_initialized_ = true; FontUtils::ReInit(); + // Clear Pango's font cache too. + pango_cairo_font_map_set_default(NULL); } static void ListFontFamilies(PangoFontFamily*** families, int* n_families) { - PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str()); + PangoFontInfo::SoftInitFontConfig(); PangoFontMap* font_map = pango_cairo_font_map_get_default(); DISABLE_HEAP_LEAK_CHECK; pango_font_map_list_families(font_map, families, n_families); @@ -253,7 +250,7 @@ bool PangoFontInfo::ParseFontDescriptionName(const string& name) { // in the font map. Note that if the font is wholly missing, this could // correspond to a completely different font family and face. PangoFont* PangoFontInfo::ToPangoFont() const { - InitFontConfig(false, FLAGS_fonts_dir.c_str()); + SoftInitFontConfig(); PangoFontMap* font_map = pango_cairo_font_map_get_default(); PangoContext* context = pango_context_new(); pango_cairo_context_set_resolution(context, resolution_); @@ -538,7 +535,7 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc, query_desc.c_str()); PangoFont* selected_font = NULL; { - PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str()); + PangoFontInfo::SoftInitFontConfig(); PangoFontMap* font_map = pango_cairo_font_map_get_default(); PangoContext* context = pango_context_new(); pango_context_set_font_map(context, font_map); @@ -690,9 +687,8 @@ void FontUtils::GetAllRenderableCharacters(const vector& fonts, // Utilities written to be backward compatible with StringRender /* static */ -int FontUtils::FontScore(const unordered_map& ch_map, - const string& fontname, - int* raw_score, +int FontUtils::FontScore(const TessHashMap& ch_map, + const string& fontname, int* raw_score, vector* ch_flags) { PangoFontInfo font_info; if (!font_info.ParseFontDescriptionName(fontname)) { @@ -707,7 +703,7 @@ int FontUtils::FontScore(const unordered_map& ch_map, } *raw_score = 0; int ok_chars = 0; - for (unordered_map::const_iterator it = ch_map.begin(); + for (TessHashMap::const_iterator it = ch_map.begin(); it != ch_map.end(); ++it) { bool covered = (IsWhitespace(it->first) || (pango_coverage_get(coverage, it->first) @@ -725,7 +721,7 @@ int FontUtils::FontScore(const unordered_map& ch_map, /* static */ -string FontUtils::BestFonts(const unordered_map& ch_map, +string FontUtils::BestFonts(const TessHashMap& ch_map, vector > >* fonts) { const double kMinOKFraction = 0.99; // Weighted fraction of characters that must be renderable in a font to make diff --git a/training/pango_font_info.h b/training/pango_font_info.h index f07d712f11..09a43fab14 100644 --- a/training/pango_font_info.h +++ b/training/pango_font_info.h @@ -24,10 +24,16 @@ #include #include +#include "commandlineflags.h" #include "hashfn.h" #include "host.h" -#include "util.h" #include "pango/pango-font.h" +#include "pango/pango.h" +#include "pango/pangocairo.h" +#include "util.h" + +DECLARE_STRING_PARAM_FLAG(fonts_dir); +DECLARE_STRING_PARAM_FLAG(fontconfig_tmpdir); typedef signed int char32; @@ -44,6 +50,7 @@ class PangoFontInfo { DECORATIVE, }; PangoFontInfo(); + ~PangoFontInfo(); // Initialize from parsing a font description name, defined as a string of the // format: // "FamilyName [FaceName] [PointSize]" @@ -83,10 +90,14 @@ class PangoFontInfo { bool GetSpacingProperties(const string& utf8_char, int* x_bearing, int* x_advance) const; - // Initializes FontConfig by setting its environment variable and creating - // a fonts.conf file that points to the given fonts_dir. Once initialized, - // it is not re-initialized unless force_clear is true. - static void InitFontConfig(bool force_clear, const string& fonts_dir); + // If not already initialized, initializes FontConfig by setting its + // environment variable and creating a fonts.conf file that points to the + // FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir. + static void SoftInitFontConfig(); + // Re-initializes font config, whether or not already initialized. + // If already initialized, any existing cache is deleted, just to be sure. + static void HardInitFontConfig(const string& fonts_dir, + const string& cache_dir); // Accessors string DescriptionName() const; @@ -130,8 +141,14 @@ class PangoFontInfo { int resolution_; // Fontconfig operates through an environment variable, so it intrinsically // cannot be thread-friendly, but you can serialize multiple independent - // font configurations by calling InitFontConfig(true, path). - static bool fontconfig_initialized_; + // font configurations by calling HardInitFontConfig(fonts_dir, cache_dir). + // These hold the last initialized values set by HardInitFontConfig or + // the first call to SoftInitFontConfig. + // Directory to be scanned for font files. + static string fonts_dir_; + // Directory to store the cache of font information. (Can be the same as + // fonts_dir_) + static string cache_dir_; private: PangoFontInfo(const PangoFontInfo&); @@ -185,7 +202,8 @@ class FontUtils { // In the flags vector, each flag is set according to whether the // corresponding character (in order of iterating ch_map) can be rendered. // The return string is a list of the acceptable fonts that were used. - static string BestFonts(const unordered_map& ch_map, + static string BestFonts( + const TessHashMap& ch_map, vector > >* font_flag); // FontScore returns the weighted renderability score of the given @@ -193,7 +211,7 @@ class FontUtils { // is also returned in raw_score. // The values in the bool vector ch_flags correspond to whether the // corresponding character (in order of iterating ch_map) can be rendered. - static int FontScore(const unordered_map& ch_map, + static int FontScore(const TessHashMap& ch_map, const string& fontname, int* raw_score, vector* ch_flags); diff --git a/training/stringrenderer.cpp b/training/stringrenderer.cpp index 225f5e44a3..e7f9699f18 100644 --- a/training/stringrenderer.cpp +++ b/training/stringrenderer.cpp @@ -108,6 +108,7 @@ StringRenderer::StringRenderer(const string& font_desc, int page_width, underline_start_prob_(0), underline_continuation_prob_(0), underline_style_(PANGO_UNDERLINE_SINGLE), + features_(NULL), drop_uncovered_chars_(true), strip_unrenderable_words_(false), add_ligatures_(false), @@ -120,7 +121,6 @@ StringRenderer::StringRenderer(const string& font_desc, int page_width, box_padding_(0), total_chars_(0), font_index_(0), - features_(NULL), last_offset_(0) { pen_color_[0] = 0.0; pen_color_[1] = 0.0; @@ -347,6 +347,11 @@ void StringRenderer::ClearBoxes() { boxaDestroy(&page_boxes_); } +string StringRenderer::GetBoxesStr() { + BoxChar::PrepareToWrite(&boxchars_); + return BoxChar::GetTesseractBoxStr(page_height_, boxchars_); +} + void StringRenderer::WriteAllBoxes(const string& filename) { BoxChar::PrepareToWrite(&boxchars_); BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_); diff --git a/training/stringrenderer.h b/training/stringrenderer.h index 942b7fddce..f0ba0c0b00 100644 --- a/training/stringrenderer.h +++ b/training/stringrenderer.h @@ -90,7 +90,7 @@ class StringRenderer { void set_underline_style(const PangoUnderline style) { underline_style_ = style; } - void set_features(const char *features) { + void set_features(const char* features) { free(features_); features_ = strdup(features); } @@ -130,12 +130,8 @@ class StringRenderer { const PangoFontInfo& font() const { return font_; } - int h_margin() const { - return h_margin_; - } - int v_margin() const { - return v_margin_; - } + int h_margin() const { return h_margin_; } + int v_margin() const { return v_margin_; } // Get the boxchars of all clusters rendered thus far (or since the last call // to ClearBoxes()). @@ -148,6 +144,9 @@ class StringRenderer { void RotatePageBoxes(float rotation); // Delete all boxes. void ClearBoxes(); + // Returns the boxes in a boxfile string. + string GetBoxesStr(); + // Writes the boxes to a boxfile. void WriteAllBoxes(const string& filename); // Removes space-delimited words from the string that are not renderable by // the current font and returns the count of such words. @@ -189,7 +188,7 @@ class StringRenderer { double underline_start_prob_; double underline_continuation_prob_; PangoUnderline underline_style_; - char *features_; + char* features_; // Text filtering options bool drop_uncovered_chars_; bool strip_unrenderable_words_; @@ -211,7 +210,7 @@ class StringRenderer { Boxa* page_boxes_; // Objects cached for subsequent calls to RenderAllFontsToImage() - hash_map char_map_; // Time-saving char histogram. + TessHashMap char_map_; // Time-saving char histogram. int total_chars_; // Number in the string to be rendered. int font_index_; // Index of next font to use in font list. int last_offset_; // Offset returned from last successful rendering diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh index ae6a97b99a..906a20ac4f 100755 --- a/training/tesstrain_utils.sh +++ b/training/tesstrain_utils.sh @@ -199,7 +199,7 @@ generate_font_image() { local common_args="--fontconfig_tmpdir=${FONT_CONFIG_CACHE}" common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words" - common_args+=" --fontconfig_refresh_config_file=false --leading=${LEADING}" + common_args+=" --leading=${LEADING}" common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}" common_args+=" --outputbase=${outbase}"