From c61f8f2111eae3d4ad3c652029d6883634feb6c6 Mon Sep 17 00:00:00 2001 From: zdenop Date: Wed, 18 Dec 2024 11:04:48 +0100 Subject: [PATCH 1/5] Change datadir to std::filesystem::path and modernize detection of tessdata directory --- src/api/baseapi.cpp | 4 +- src/ccmain/paramsd.cpp | 2 +- src/ccmain/tessedit.cpp | 48 ++++++++--------- src/ccutil/ccutil.cpp | 114 +++++++++++++++++++++------------------- src/ccutil/ccutil.h | 7 +-- 5 files changed, 89 insertions(+), 86 deletions(-) diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index bae30ab8bb..c6409a9a65 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -347,7 +347,7 @@ int TessBaseAPI::Init(const char *data, int data_size, const char *language, Ocr // Update datapath and language requested for the last valid initialization. datapath_ = std::move(datapath); if (datapath_.empty() && !tesseract_->datadir.empty()) { - datapath_ = tesseract_->datadir; + datapath_ = tesseract_->datadir.string(); } language_ = language; @@ -396,7 +396,7 @@ void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector *langs) co void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector *langs) const { langs->clear(); if (tesseract_ != nullptr) { - addAvailableLanguages(tesseract_->datadir, langs); + addAvailableLanguages(tesseract_->datadir.string().c_str(), langs); std::sort(langs->begin(), langs->end()); } } diff --git a/src/ccmain/paramsd.cpp b/src/ccmain/paramsd.cpp index 14f220f8e6..44093c3a9a 100644 --- a/src/ccmain/paramsd.cpp +++ b/src/ccmain/paramsd.cpp @@ -298,7 +298,7 @@ ParamsEditor::ParamsEditor(tesseract::Tesseract *tess, ScrollView *sv) { SVMenuNode *svMenuRoot = BuildListOfAllLeaves(tess); std::string paramfile; - paramfile = tess->datadir; + paramfile = tess->datadir.string(); paramfile += VARDIR; // parameters dir paramfile += "edited"; // actual name diff --git a/src/ccmain/tessedit.cpp b/src/ccmain/tessedit.cpp index c751888359..910f29184b 100644 --- a/src/ccmain/tessedit.cpp +++ 
b/src/ccmain/tessedit.cpp @@ -43,24 +43,25 @@ namespace tesseract { // Read a "config" file containing a set of variable, value pairs. // Searches the standard places: tessdata/configs, tessdata/tessconfigs // and also accepts a relative or absolute path name. -void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) { - std::string path = datadir; - path += "configs/"; - path += filename; - FILE *fp; - if ((fp = fopen(path.c_str(), "rb")) != nullptr) { - fclose(fp); - } else { - path = datadir; - path += "tessconfigs/"; - path += filename; - if ((fp = fopen(path.c_str(), "rb")) != nullptr) { - fclose(fp); - } else { - path = filename; - } - } - ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params()); +void Tesseract::read_config_file(const char *filename, + SetParamConstraint constraint) { + // Construct potential config file paths + std::vector<std::filesystem::path> config_paths = { + datadir / "configs" / filename, + datadir / "tessconfigs" / filename, + std::filesystem::path(filename)}; + + // Use the first existing file or fallback to the last (filename) + auto config_file = std::find_if(config_paths.begin(), config_paths.end(), + [](const std::filesystem::path &path) { + std::error_code ec; + return std::filesystem::exists(path, ec); + }); + const std::filesystem::path &selected_path = + (config_file != config_paths.end()) ? *config_file : config_paths.back(); + + ParamUtils::ReadParamsFile(selected_path.string().c_str(), constraint, + this->params()); } // Returns false if a unicharset file for the specified language was not found @@ -81,14 +82,11 @@ bool Tesseract::init_tesseract_lang_data(const std::string &arg0, bool set_only_non_debug_params, TessdataManager *mgr) { // Set the language data path prefix lang = !language.empty() ? language : "eng"; - language_data_path_prefix = datadir; - language_data_path_prefix += lang; - language_data_path_prefix += "."; + std::filesystem::path tessdata_path = datadir / (lang + "."
+ kTrainedDataSuffix); // Initialize TessdataManager. - std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix; - if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) { - tprintf("Error opening data file %s\n", tessdata_path.c_str()); + if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string().c_str())) { + tprintf("Error opening data file %s\n", tessdata_path.string().c_str()); tprintf( "Please make sure the TESSDATA_PREFIX environment variable is set" " to your \"tessdata\" directory.\n"); @@ -187,7 +185,7 @@ bool Tesseract::init_tesseract_lang_data(const std::string &arg0, tprintf( "Error: Tesseract (legacy) engine requested, but components are " "not present in %s!!\n", - tessdata_path.c_str()); + tessdata_path.string().c_str()); return false; } #endif // ndef DISABLED_LEGACY_ENGINE diff --git a/src/ccutil/ccutil.cpp b/src/ccutil/ccutil.cpp index 930aa2636e..2e3af5f714 100644 --- a/src/ccutil/ccutil.cpp +++ b/src/ccutil/ccutil.cpp @@ -14,8 +14,6 @@ #include "tprintf.h" // for tprintf #include -#include // for std::strrchrA -#include // for std::filesystem namespace tesseract { @@ -33,68 +31,74 @@ CCUtil::CCUtil() CCUtil::~CCUtil() = default; /** - * @brief CCUtil::main_setup - set location of tessdata and name of image + * @brief Finds the path to the tessdata directory. * - * @param argv0 - paths to the directory with language files and config files. - * An actual value of argv0 is used if not nullptr, otherwise TESSDATA_PREFIX is - * used if not nullptr, next try to use compiled in -DTESSDATA_PREFIX. If - * previous is not successful - use current directory. - * @param basename - name of image + * This function determines the location of the tessdata directory based on the + * following order of precedence: + * 1. If `argv0` is provided, use it. + * 2. If `TESSDATA_PREFIX` environment variable is set and the path exists, use + * it. + * 3. On Windows, check for a "tessdata" directory in the executable's directory + * and use it. 
+ * 4. If `TESSDATA_PREFIX` is defined at compile time, use it. + * 5. Otherwise, use the current working directory. + * + * @param argv0 argument to be considered as the data directory path. + * @return The path to the tessdata directory or current directory. */ -void CCUtil::main_setup(const std::string &argv0, const std::string &basename) { - imagebasename = basename; /**< name of image */ - - const char *tessdata_prefix = getenv("TESSDATA_PREFIX"); - - // Ignore TESSDATA_PREFIX if there is no matching filesystem entry. - if (tessdata_prefix != nullptr && !std::filesystem::exists(tessdata_prefix)) { - tprintf("Warning: TESSDATA_PREFIX %s does not exist, ignore it\n", tessdata_prefix); - tessdata_prefix = nullptr; - } - +std::filesystem::path find_data_path(const std::string &argv0) { + // If argv0 is set, always use it even if it is not a valid directory if (!argv0.empty()) { - /* Use tessdata prefix from the command line. */ - datadir = argv0; - } else if (tessdata_prefix) { - /* Use tessdata prefix from the environment. */ - datadir = tessdata_prefix; -#if defined(_WIN32) - } else if (datadir.empty() || !std::filesystem::exists(datadir)) { - /* Look for tessdata in directory of executable. */ - char path[_MAX_PATH]; - DWORD length = GetModuleFileName(nullptr, path, sizeof(path)); - if (length > 0 && length < sizeof(path)) { - char *separator = std::strrchr(path, '\\'); - if (separator != nullptr) { - *separator = '\0'; - std::string subdir = path; - subdir += "/tessdata"; - if (std::filesystem::exists(subdir)) { - datadir = subdir; - } - } + std::filesystem::path path(argv0); + if (!std::filesystem::exists(path) || + !std::filesystem::is_directory(path)) { + tprintf("Warning (tessdata): '%s' is not a valid directory.\n", + argv0.c_str()); } -#endif /* _WIN32 */ + return path; } - // datadir may still be empty: - if (datadir.empty()) { -#if defined(TESSDATA_PREFIX) - // Use tessdata prefix which was compiled in. 
- datadir = TESSDATA_PREFIX "/tessdata/"; - // Note that some software (for example conda) patches TESSDATA_PREFIX - // in the binary, so it might be shorter. Recalculate its length. - datadir.resize(std::strlen(datadir.c_str())); -#else - datadir = "./"; -#endif /* TESSDATA_PREFIX */ + // Check environment variable if argv0 is not specified + if (const char *tessdata_prefix = std::getenv("TESSDATA_PREFIX")) { + std::filesystem::path path(tessdata_prefix); + if (std::filesystem::exists(path)) { + return path; + } else { + tprintf("Warning: TESSDATA_PREFIX %s does not exist, ignoring.\n", + tessdata_prefix); + } } - // check for missing directory separator - const char lastchar = datadir.back(); - if (lastchar != '/' && lastchar != '\\') { - datadir += '/'; +#ifdef _WIN32 + // Windows-specific: look for a 'tessdata' directory next to the executable + // and use it if it exists + wchar_t path[MAX_PATH]; + if (DWORD length = GetModuleFileNameW(nullptr, path, MAX_PATH); + length > 0 && length < MAX_PATH) { + std::filesystem::path exe_path(path); + auto tessdata_subdir = exe_path.parent_path() / "tessdata"; + if (std::filesystem::exists(tessdata_subdir)) { + return tessdata_subdir; + } } +#endif + + // Fallback to compile-time or current directory +#ifdef TESSDATA_PREFIX + return std::filesystem::path(TESSDATA_PREFIX) / "tessdata"; +#else + return std::filesystem::current_path(); +#endif } + +/** + * @brief CCUtil::main_setup - set location of tessdata and name of image + * + * @param argv0 - path to the directory with language files and config files.
+ */ +void CCUtil::main_setup(const std::string &argv0, const std::string &basename) { + imagebasename = basename; /**< name of image */ + datadir = find_data_path(argv0); +} } // namespace tesseract diff --git a/src/ccutil/ccutil.h b/src/ccutil/ccutil.h index e64199315f..2ffd18fec2 100644 --- a/src/ccutil/ccutil.h +++ b/src/ccutil/ccutil.h @@ -19,6 +19,8 @@ #ifndef TESSERACT_CCUTIL_CCUTIL_H_ #define TESSERACT_CCUTIL_CCUTIL_H_ +#include <filesystem> // for std::filesystem + #ifndef _WIN32 # include # include @@ -53,9 +55,8 @@ class TESS_API CCUtil { ParamsVectors *params() { return &params_; } - - std::string datadir; // dir for data files - std::string imagebasename; // name of image + std::filesystem::path datadir; // dir for data files + std::string imagebasename; // name of image std::string lang; std::string language_data_path_prefix; UNICHARSET unicharset; From 9251830af775e23c038c83b7432be549ff487a09 Mon Sep 17 00:00:00 2001 From: zdenop Date: Wed, 18 Dec 2024 11:06:33 +0100 Subject: [PATCH 2/5] Solve empty return for 'api.GetDatapath()' / (`The pointer is dangling because it points at a temporary instance that was destroyed.`) --- src/api/baseapi.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index c6409a9a65..9d8fc3958f 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -858,7 +858,7 @@ const char *TessBaseAPI::GetInputName() { } const char *TessBaseAPI::GetDatapath() { - return tesseract_->datadir.c_str(); + return datapath_.c_str(); } int TessBaseAPI::GetSourceYResolution() { From ddc32a42d0fa765ab62f3af600c14bac6c78447b Mon Sep 17 00:00:00 2001 From: zdenop Date: Wed, 18 Dec 2024 11:06:53 +0100 Subject: [PATCH 3/5] check if path exists and is directory.
Fixes #4364 --- src/api/baseapi.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index 9d8fc3958f..89fb38e724 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -146,6 +146,10 @@ static void ExtractFontName(const char* filename, std::string* fontname) { */ static void addAvailableLanguages(const std::string &datadir, std::vector *langs) { + if (!std::filesystem::exists(datadir) || + !std::filesystem::is_directory(datadir)) { + return; + } for (const auto& entry : std::filesystem::recursive_directory_iterator(datadir, std::filesystem::directory_options::follow_directory_symlink | From 1ea0ef29a43cdf9efdcb0b14acf5c2412f2a1e91 Mon Sep 17 00:00:00 2001 From: zdenop Date: Thu, 19 Dec 2024 07:48:56 +0100 Subject: [PATCH 4/5] simplify datadir tests and use stream output --- src/api/baseapi.cpp | 7 +++---- src/ccmain/tessedit.cpp | 5 ++--- src/ccutil/ccutil.cpp | 5 ++--- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index 89fb38e724..7db0c612d7 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -146,10 +146,9 @@ static void ExtractFontName(const char* filename, std::string* fontname) { */ static void addAvailableLanguages(const std::string &datadir, std::vector *langs) { - if (!std::filesystem::exists(datadir) || - !std::filesystem::is_directory(datadir)) { + if (!std::filesystem::is_directory(datadir)) return; - } + for (const auto& entry : std::filesystem::recursive_directory_iterator(datadir, std::filesystem::directory_options::follow_directory_symlink | @@ -400,7 +399,7 @@ void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector *langs) co void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector *langs) const { langs->clear(); if (tesseract_ != nullptr) { - addAvailableLanguages(tesseract_->datadir.string().c_str(), langs); + addAvailableLanguages(tesseract_->datadir.string(), langs); std::sort(langs->begin(), langs->end()); } } diff 
--git a/src/ccmain/tessedit.cpp b/src/ccmain/tessedit.cpp index 910f29184b..8577380e10 100644 --- a/src/ccmain/tessedit.cpp +++ b/src/ccmain/tessedit.cpp @@ -86,7 +86,7 @@ bool Tesseract::init_tesseract_lang_data(const std::string &arg0, // Initialize TessdataManager. if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string().c_str())) { - tprintf("Error opening data file %s\n", tessdata_path.string().c_str()); + tprintf("Error opening data file %s\n", tessdata_path.string()); tprintf( "Please make sure the TESSDATA_PREFIX environment variable is set" " to your \"tessdata\" directory.\n"); @@ -184,8 +184,7 @@ bool Tesseract::init_tesseract_lang_data(const std::string &arg0, else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) { tprintf( "Error: Tesseract (legacy) engine requested, but components are " - "not present in %s!!\n", - tessdata_path.string().c_str()); + "not present in %s!!\n", tessdata_path.string()); return false; } #endif // ndef DISABLED_LEGACY_ENGINE diff --git a/src/ccutil/ccutil.cpp b/src/ccutil/ccutil.cpp index 2e3af5f714..e67a3d9086 100644 --- a/src/ccutil/ccutil.cpp +++ b/src/ccutil/ccutil.cpp @@ -50,10 +50,9 @@ std::filesystem::path find_data_path(const std::string &argv0) { // If argv0 is set, always use it even if it is not a valid directory if (!argv0.empty()) { std::filesystem::path path(argv0); - if (!std::filesystem::exists(path) || - !std::filesystem::is_directory(path)) { + if (!std::filesystem::is_directory(path)) { tprintf("Warning (tessdata): '%s' is not a valid directory.\n", - argv0.c_str()); + argv0); } return path; } From bb9604fbf0a44fcc6a0dd47882fe190ab0d305ad Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Thu, 19 Dec 2024 13:29:13 +0100 Subject: [PATCH 5/5] Fix usage of tesserr stream Signed-off-by: Stefan Weil --- src/ccmain/tessedit.cpp | 11 +++++------ src/ccutil/ccutil.cpp | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/ccmain/tessedit.cpp 
b/src/ccmain/tessedit.cpp index 8577380e10..73c4e83514 100644 --- a/src/ccmain/tessedit.cpp +++ b/src/ccmain/tessedit.cpp @@ -29,6 +29,7 @@ #include "params.h" #include "stopper.h" #include "tesseractclass.h" +#include "tesserrstream.h" // for tesserr #include "tessvars.h" #include "tprintf.h" #ifndef DISABLED_LEGACY_ENGINE @@ -86,10 +87,9 @@ bool Tesseract::init_tesseract_lang_data(const std::string &arg0, // Initialize TessdataManager. if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string().c_str())) { - tprintf("Error opening data file %s\n", tessdata_path.string()); - tprintf( + tesserr << "Error opening data file " << tessdata_path.string() << '\n' << "Please make sure the TESSDATA_PREFIX environment variable is set" - " to your \"tessdata\" directory.\n"); + " to your \"tessdata\" directory.\n"; return false; } #ifdef DISABLED_LEGACY_ENGINE @@ -182,9 +182,8 @@ bool Tesseract::init_tesseract_lang_data(const std::string &arg0, } #ifndef DISABLED_LEGACY_ENGINE else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) { - tprintf( - "Error: Tesseract (legacy) engine requested, but components are " - "not present in %s!!\n", tessdata_path.string()); + tesserr << "Error: Tesseract (legacy) engine requested, but components are " + "not present in " << tessdata_path.string() << "!!\n"; return false; } #endif // ndef DISABLED_LEGACY_ENGINE diff --git a/src/ccutil/ccutil.cpp b/src/ccutil/ccutil.cpp index e67a3d9086..5e4f2a8166 100644 --- a/src/ccutil/ccutil.cpp +++ b/src/ccutil/ccutil.cpp @@ -11,6 +11,7 @@ // limitations under the License. 
#include "ccutil.h" +#include "tesserrstream.h" // for tesserr #include "tprintf.h" // for tprintf #include @@ -51,8 +52,7 @@ std::filesystem::path find_data_path(const std::string &argv0) { if (!argv0.empty()) { std::filesystem::path path(argv0); if (!std::filesystem::is_directory(path)) { - tprintf("Warning (tessdata): '%s' is not a valid directory.\n", - argv0); + tesserr << "Warning (tessdata): '" << argv0 << "' is not a valid directory.\n"; } return path; }