From 33bb2eaa8d5f1a5b702157ac847a2fc83170daa2 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Fri, 12 May 2017 14:52:29 +0200 Subject: [PATCH] Add initial support for traineddata files in zip format This requires libzip-dev or libminizip-dev. Up to now, little endian tesseract works with the new format. More work is needed for training tools and big endian support. Signed-off-by: Stefan Weil --- api/Makefile.am | 7 ++ ccutil/Makefile.am | 8 ++ ccutil/tessdatamanager.cpp | 152 +++++++++++++++++++++++++++++++++++++ ccutil/tessdatamanager.h | 2 + configure.ac | 25 ++++++ training/Makefile.am | 32 ++++++-- 6 files changed, 221 insertions(+), 5 deletions(-) diff --git a/api/Makefile.am b/api/Makefile.am index 7209c45cd3..0883bb7bb2 100644 --- a/api/Makefile.am +++ b/api/Makefile.am @@ -84,6 +84,13 @@ tesseract_LDFLAGS = $(OPENCL_LDFLAGS) tesseract_LDADD += $(LEPTONICA_LIBS) tesseract_LDADD += $(OPENMP_CXXFLAGS) +if HAVE_LIBZIP +tesseract_LDADD += $(libzip_LIBS) +else +if HAVE_MINIZIP +tesseract_LDADD += $(minizip_LIBS) +endif +endif if T_WIN tesseract_LDADD += -ltiff diff --git a/ccutil/Makefile.am b/ccutil/Makefile.am index 9d3d83b22b..d8e1ffc979 100644 --- a/ccutil/Makefile.am +++ b/ccutil/Makefile.am @@ -41,6 +41,14 @@ libtesseract_ccutil_la_SOURCES = \ unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \ params.cpp universalambigs.cpp +if HAVE_LIBZIP +AM_CPPFLAGS += $(libzip_CFLAGS) +else +if HAVE_MINIZIP +AM_CPPFLAGS += $(minizip_CFLAGS) +endif +endif + if T_WIN AM_CPPFLAGS += -I$(top_srcdir)/vs2010/port -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\" noinst_HEADERS += ../vs2010/port/strtok_r.h diff --git a/ccutil/tessdatamanager.cpp b/ccutil/tessdatamanager.cpp index 4fb7f28ad5..6d70342a64 100644 --- a/ccutil/tessdatamanager.cpp +++ b/ccutil/tessdatamanager.cpp @@ -21,9 +21,21 @@ #pragma warning(disable:4244) // Conversion warnings #endif +// Include automatically generated configuration file if running autoconf. 
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
 #include "tessdatamanager.h"
 
+#include <string>
+
 #include <stdio.h>
+#if defined(HAVE_LIBZIP)
+#include <zip.h>
+#elif defined(HAVE_MINIZIP)
+#include <unzip.h>
+#endif // ZIP supported
 
 #include "helpers.h"
 #include "serialis.h"
@@ -33,9 +45,144 @@
 
 namespace tesseract {
 
+#if defined(HAVE_LIBZIP)  // LoadZipFile implemented with libzip.
+bool TessdataManager::LoadZipFile(const char *filename) {  // true if the archive could be opened
+  bool result = false;
+  fprintf(stderr, "TessdataManager::%s(%s)\n", __func__, filename);
+  std::string zipfile(filename);
+  int err;
+  zip_t *uf = zip_open(zipfile.c_str(), ZIP_RDONLY, &err);
+  if (uf == nullptr) {
+    zipfile += ".zip";  // second attempt: same name with ".zip" appended
+    uf = zip_open(zipfile.c_str(), ZIP_RDONLY, &err);
+  }
+  if (uf != nullptr) {
+    fprintf(stderr, "zip_open(%s) passed\n", zipfile.c_str());
+    int64_t nEntries = zip_get_num_entries(uf, ZIP_FL_UNCHANGED);
+    for (int64_t i = 0; i < nEntries; i++) {  // index has the same width as nEntries
+      zip_stat_t zipStat;
+      if (zip_stat_index(uf, i, ZIP_FL_UNCHANGED, &zipStat) == 0 &&
+          (zipStat.valid & ZIP_STAT_NAME) && (zipStat.valid & ZIP_STAT_SIZE)) {
+        //~ fprintf(stderr,
+          //~ "zip_get_name(...) passed, file %s\n", zipStat.name);
+
+        TessdataType type;
+        if (TessdataTypeFromFileName(zipStat.name, &type)) {
+          fprintf(stderr,
+                  "TessdataTypeFromFileName(%s, ...) passed, type %d\n",
+                  zipStat.name, type);
+          zip_file_t *zipFile = zip_fopen_index(uf, i, ZIP_FL_UNCHANGED);
+          if (zipFile == nullptr) {
+            fprintf(stderr, "zip_fopen_index(...) failed\n");
+          } else {
+            entries_[type].resize_no_init(zipStat.size);
+            if (zip_fread(zipFile, &entries_[type][0], zipStat.size) !=
+                static_cast<zip_int64_t>(zipStat.size)) {
+              fprintf(stderr, "zip_fread(...) failed\n");
+            }
+            zip_fclose(zipFile);
+          }
+        }
+      }
+    }
+    is_loaded_ = true;
+    err = zip_close(uf);
+    if (err != 0) {
+      fprintf(stderr, "zip_close(...) failed\n");
+    }
+    result = true;
+  } else {
+    fprintf(stderr, "zip_open(%s) failed\n", zipfile.c_str());
+
+  }
+  return result;
+}
+#elif defined(HAVE_MINIZIP)  // LoadZipFile implemented with minizip.
+bool TessdataManager::LoadZipFile(const char *filename) {  // true if the archive could be opened
+  bool result = false;
+  fprintf(stderr, "TessdataManager::%s(%s)\n", __func__, filename);
+  std::string zipfile(filename);
+  unzFile uf = unzOpen(zipfile.c_str());
+  if (uf == nullptr) {
+    zipfile += ".zip";  // second attempt: same name with ".zip" appended
+    uf = unzOpen(zipfile.c_str());
+  }
+  if (uf != nullptr) {
+    fprintf(stderr, "unzOpen(%s) passed\n", zipfile.c_str());
+    unz_global_info global_info = {};  // zero entries if the query below fails
+    int err;
+    err = unzGetGlobalInfo(uf, &global_info);
+    if (err == UNZ_OK) {
+      fprintf(stderr,
+              "unzGetGlobalInfo(...) passed, zip file with %lu entries\n",
+              global_info.number_entry);
+    }
+    unz_file_info file_info;
+    char fileName[256] = {0};  // minizip does not terminate truncated names
+    char extraField[32];
+    char comment[32];
+    //~ $1 = {version = 798, version_needed = 20, flag = 0, compression_method = 8, dosDate = 1252768343, crc = 2481269679, compressed_size = 7131663, uncompressed_size = 16109842,
+    //~ size_filename = 15, size_file_extra = 24, size_file_comment = 0, disk_num_start = 0, internal_fa = 0, external_fa = 2175008768, tmu_date = {tm_sec = 46, tm_min = 18,
+    //~ tm_hour = 23, tm_mday = 11, tm_mon = 4, tm_year = 2017}}
+    for (unsigned i = 0; i < global_info.number_entry; i++) {
+      err = unzGetCurrentFileInfo(uf, &file_info,
+                                  fileName, sizeof(fileName) - 1,  // keep the final NUL
+                                  extraField, sizeof(extraField),
+                                  comment, sizeof(comment));
+      if (err == UNZ_OK) {
+        //~ fprintf(stderr,
+          //~ "unzGetCurrentFileInfo(...) passed, file %s, %lu byte\n",
+          //~ fileName, file_info.uncompressed_size);
+
+        TessdataType type;
+        if (TessdataTypeFromFileName(fileName, &type)) {
+          fprintf(stderr,
+                  "TessdataTypeFromFileName(%s, ...) passed, type %d\n",
+                  fileName, type);
+          err = unzOpenCurrentFilePassword(uf, nullptr);
+          if (err != UNZ_OK) {
+            fprintf(stderr, "unzOpenCurrentFilePassword(...) failed, err %d\n", err);
+          } else {
+            entries_[type].resize_no_init(file_info.uncompressed_size);
+            err = unzReadCurrentFile(uf, &entries_[type][0], file_info.uncompressed_size);
+            if (err < UNZ_OK) {
+              fprintf(stderr, "unzReadCurrentFile(...) failed, err %d\n", err);
+            }
+            err = unzCloseCurrentFile(uf);
+            if (err != UNZ_OK) {
+              fprintf(stderr, "unzCloseCurrentFile(...) failed\n");
+            }
+          }
+        }
+      }
+      //~ err = unzGoToFirstFile(uf);
+
+      err = unzGoToNextFile(uf);  // end-of-list after the last entry is expected
+      if (err != UNZ_OK && i + 1 < global_info.number_entry) {
+        fprintf(stderr, "unzGoToNextFile(...) failed\n");
+      }
+    }
+    is_loaded_ = true;
+    err = unzClose(uf);
+    if (err != UNZ_OK) {
+      fprintf(stderr, "unzClose(...) failed\n");
+    }
+    result = true;
+  } else {
+    fprintf(stderr, "unzOpen(%s) failed\n", zipfile.c_str());
+    perror(zipfile.c_str());
+  }
+  return result;
+}
+#endif // ZIP supported
+
 bool TessdataManager::Init(const char *data_file_name) {
+  fprintf(stderr, "TessdataManager::%s(%s)\n", __func__, data_file_name);
   GenericVector<char> data;
   if (reader_ == nullptr) {
+#if defined(HAVE_LIBZIP) || defined(HAVE_MINIZIP)  // either backend provides LoadZipFile
+    if (LoadZipFile(data_file_name)) return true;
+#endif // ZIP supported
     if (!LoadDataFromFile(data_file_name, &data)) return false;
   } else {
     if (!(*reader_)(data_file_name, &data)) return false;
@@ -46,6 +193,7 @@ bool TessdataManager::Init(const char *data_file_name) {
 // Loads from the given memory buffer as if a file.
 bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
                                     int size) {
+  // TODO: This method supports only the proprietary file format.
   data_file_name_ = name;
   TFile fp;
   fp.Open(data, size);
@@ -76,6 +224,7 @@ bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
 // Saves to the given filename.
 bool TessdataManager::SaveFile(const STRING &filename,
                                FileWriter writer) const {
+  // TODO: This method supports only the proprietary file format.
ASSERT_HOST(is_loaded_); GenericVector data; Serialize(&data); @@ -87,6 +236,7 @@ bool TessdataManager::SaveFile(const STRING &filename, // Serializes to the given vector. void TessdataManager::Serialize(GenericVector *data) const { + // TODO: This method supports only the proprietary file format. ASSERT_HOST(is_loaded_); // Compute the offset_table and total size. inT64 offset_table[TESSDATA_NUM_ENTRIES]; @@ -146,6 +296,7 @@ bool TessdataManager::CombineDataFiles( const char *language_data_path_prefix, const char *output_filename) { // Load individual tessdata components from files. + // TODO: This method supports only the proprietary file format. for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { TessdataType type; ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type)); @@ -178,6 +329,7 @@ bool TessdataManager::OverwriteComponents( char **component_filenames, int num_new_components) { // Open the files with the new components. + // TODO: This method supports only the proprietary file format. for (int i = 0; i < num_new_components; ++i) { TessdataType type; if (TessdataTypeFromFileName(component_filenames[i], &type)) { diff --git a/ccutil/tessdatamanager.h b/ccutil/tessdatamanager.h index 6d1e89c922..191465d18b 100644 --- a/ccutil/tessdatamanager.h +++ b/ccutil/tessdatamanager.h @@ -195,6 +195,8 @@ class TessdataManager { private: + bool LoadZipFile(const char *filename); + // Saves to the given filename. bool SaveFile(const STRING &filename, FileWriter writer) const; diff --git a/configure.ac b/configure.ac index ab4e30ae5d..b0b3fe57bc 100644 --- a/configure.ac +++ b/configure.ac @@ -441,6 +441,31 @@ else AC_MSG_ERROR([Leptonica 1.74 or higher is required. 
Try to install libleptonica-dev package.]) fi +AM_CONDITIONAL([HAVE_LIBARCHIVE], false) +AM_CONDITIONAL([HAVE_ZZIPLIB], false) + +PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false]) +if $have_libarchive; then + AC_DEFINE([HAVE_LIBARCHIVE], [], [Enable libarchive]) +fi + +PKG_CHECK_MODULES([libzip], [libzip], [have_libzip=true], [have_libzip=false]) +AM_CONDITIONAL([HAVE_LIBZIP], [$have_libzip]) +if $have_libzip; then + AC_DEFINE([HAVE_LIBZIP], [], [Enable libzip]) +fi + +PKG_CHECK_MODULES([minizip], [minizip], [have_minizip=true], [have_minizip=false]) +AM_CONDITIONAL([HAVE_MINIZIP], [$have_minizip]) +if $have_minizip; then + AC_DEFINE([HAVE_MINIZIP], [], [Enable minizip]) +fi + +PKG_CHECK_MODULES([zziplib], [zziplib], [have_zziplib=true], [have_zziplib=false]) +if $have_zziplib; then + AC_DEFINE([HAVE_ZZIPLIB], [], [Enable zziplib]) +fi + AM_CONDITIONAL([ENABLE_TRAINING], true) # Check location of icu headers diff --git a/training/Makefile.am b/training/Makefile.am index defc550a5a..e091dcd8fd 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -11,8 +11,8 @@ AM_CPPFLAGS += \ EXTRA_DIST = language-specific.sh tesstrain.sh tesstrain_utils.sh -# TODO: training programs can not be linked to shared library created -# with -fvisibility +# TODO: training programs can not be linked to shared library created +# with -fvisibility if VISIBILITY AM_LDFLAGS += -all-static endif @@ -26,9 +26,9 @@ noinst_HEADERS = \ noinst_LTLIBRARIES = libtesseract_training.la libtesseract_tessopt.la libtesseract_training_la_LIBADD = \ - ../cutil/libtesseract_cutil.la + ../cutil/libtesseract_cutil.la # ../api/libtesseract.la - + libtesseract_training_la_SOURCES = \ boxchar.cpp commandlineflags.cpp commontraining.cpp degradeimage.cpp \ fileio.cpp ligature_table.cpp lstmtester.cpp normstrngs.cpp pango_font_info.cpp \ @@ -374,5 +374,27 @@ mftraining_LDADD += $(LEPTONICA_LIBS) set_unicharset_properties_LDADD += $(LEPTONICA_LIBS) 
shapeclustering_LDADD += $(LEPTONICA_LIBS) text2image_LDADD += $(LEPTONICA_LIBS) -unicharset_extractor_LDFLAGS += $(LEPTONICA_LIBS) +unicharset_extractor_LDFLAGS += $(LEPTONICA_LIBS) wordlist2dawg_LDADD += $(LEPTONICA_LIBS) + +ambiguous_words_LDADD += $(libzip_LIBS) +classifier_tester_LDADD += $(libzip_LIBS) +cntraining_LDADD += $(libzip_LIBS) +combine_tessdata_LDADD += $(libzip_LIBS) +lstmeval_LDADD += $(libzip_LIBS) +lstmtraining_LDADD += $(libzip_LIBS) +mftraining_LDADD += $(libzip_LIBS) +set_unicharset_properties_LDADD += $(libzip_LIBS) +shapeclustering_LDADD += $(libzip_LIBS) +wordlist2dawg_LDADD += $(libzip_LIBS) + +ambiguous_words_LDADD += $(minizip_LIBS) +classifier_tester_LDADD += $(minizip_LIBS) +cntraining_LDADD += $(minizip_LIBS) +combine_tessdata_LDADD += $(minizip_LIBS) +lstmeval_LDADD += $(minizip_LIBS) +lstmtraining_LDADD += $(minizip_LIBS) +mftraining_LDADD += $(minizip_LIBS) +set_unicharset_properties_LDADD += $(minizip_LIBS) +shapeclustering_LDADD += $(minizip_LIBS) +wordlist2dawg_LDADD += $(minizip_LIBS)