Skip to content

Commit

Permalink
Add initial support for traineddata files in standard archive formats
Browse files Browse the repository at this point in the history
This requires libarchive-dev.

Tesseract can now load traineddata files in any of the archive formats
which are supported by libarchive. Example of a zipped BagIt archive:

    $ unzip -l /usr/local/share/tessdata/zip.traineddata
    Archive:  /usr/local/share/tessdata/zip.traineddata
      Length      Date    Time    Name
    ---------  ---------- -----   ----
           55  2019-03-05 15:27   bagit.txt
            0  2019-03-05 15:25   data/
         1557  2019-03-05 15:28   manifest-sha256.txt
      1082890  2019-03-05 15:25   data/eng.word-dawg
      1487588  2019-03-05 15:25   data/eng.lstm
         7477  2019-03-05 15:25   data/eng.unicharset
        63346  2019-03-05 15:25   data/eng.shapetable
       976552  2019-03-05 15:25   data/eng.inttemp
        13408  2019-03-05 15:25   data/eng.normproto
         4322  2019-03-05 15:25   data/eng.punc-dawg
         4738  2019-03-05 15:25   data/eng.lstm-number-dawg
         1410  2019-03-05 15:25   data/eng.freq-dawg
          844  2019-03-05 15:25   data/eng.pffmtable
         6360  2019-03-05 15:25   data/eng.lstm-unicharset
         1012  2019-03-05 15:25   data/eng.lstm-recoder
         1047  2019-03-05 15:25   data/eng.unicharambigs
         4322  2019-03-05 15:25   data/eng.lstm-punc-dawg
     16109842  2019-03-05 15:25   data/eng.bigram-dawg
           80  2019-03-05 15:25   data/eng.version
         6426  2019-03-05 15:25   data/eng.number-dawg
      3694794  2019-03-05 15:25   data/eng.lstm-word-dawg
    ---------                     -------
     23468070                     21 files

`combine_tessdata -d` and `combine_tessdata -u` also work.

The traineddata files in the new format can be generated with
standard tools like zip or tar.

More work is needed for other training tools and big endian support.

Signed-off-by: Stefan Weil <[email protected]>
  • Loading branch information
stweil committed Mar 5, 2019
1 parent 7fbde96 commit 1c7e006
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 10 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ addons:
sources:
#- ubuntu-toolchain-r-test
packages:
- libarchive-dev
#- g++-6

#matrix:
Expand Down
6 changes: 6 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,12 @@ else
AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.])
fi
PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false])
AM_CONDITIONAL([HAVE_LIBARCHIVE], [$have_libarchive])
if $have_libarchive; then
AC_DEFINE([HAVE_LIBARCHIVE], [], [Enable libarchive])
fi
AM_CONDITIONAL([ENABLE_TRAINING], true)
# Check availability of ICU packages.
Expand Down
1 change: 1 addition & 0 deletions src/api/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ tesseract_LDFLAGS = $(OPENCL_LDFLAGS)

tesseract_LDADD += $(LEPTONICA_LIBS)
tesseract_LDADD += $(OPENMP_CXXFLAGS)
tesseract_LDADD += $(libarchive_LIBS)

if T_WIN
tesseract_LDADD += -ltiff
Expand Down
2 changes: 2 additions & 0 deletions src/ccutil/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ libtesseract_ccutil_la_SOURCES = \
unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
params.cpp universalambigs.cpp

AM_CPPFLAGS += $(libarchive_CFLAGS)

if T_WIN
AM_CPPFLAGS += -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
endif
69 changes: 60 additions & 9 deletions src/ccutil/tessdatamanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
// File: tessdatamanager.cpp
// Description: Functions to handle loading/combining tesseract data files.
// Author: Daria Antonova
// Created: Wed Jun 03 11:26:43 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -24,6 +23,12 @@
#include "tessdatamanager.h"

#include <cstdio>
#include <string>

#if defined(HAVE_LIBARCHIVE)
#include <archive.h>
#include <archive_entry.h>
#endif

#include "errcode.h"
#include "helpers.h"
Expand Down Expand Up @@ -52,9 +57,49 @@ void TessdataManager::LoadFileLater(const char *data_file_name) {
data_file_name_ = data_file_name;
}

#if defined(HAVE_LIBARCHIVE)
// Loads tessdata components from an archive file using libarchive.
//
// Every archive entry whose path name is recognized by
// TessdataTypeFromFileName and whose size is positive is read into the
// matching slot of entries_. Entries that cannot be read completely are
// discarded again, so that no partially filled component is left behind.
//
// filename: path of the archive file to open.
// Returns false if the archive reader cannot be created or the file cannot
// be opened as an archive; otherwise returns is_loaded_, which is set to
// true as soon as one component was read completely.
bool TessdataManager::LoadArchiveFile(const char *filename) {
  archive *a = archive_read_new();
  if (a == nullptr) {
    return false;
  }
  // Accept any compression filter and any container format libarchive knows.
  archive_read_support_filter_all(a);
  archive_read_support_format_all(a);

  if (archive_read_open_filename(a, filename, 8192) != ARCHIVE_OK) {
#if defined(DEBUG)
    // archive_errno() may return libarchive specific codes which are not
    // valid errno values, so use libarchive's own message text.
    const char *message = archive_error_string(a);
    tprintf("archive_read_open_filename(...,%s,...) failed, %s\n",
            filename, message != nullptr ? message : "unknown error");
#endif
    archive_read_free(a);
    return false;
  }

  archive_entry *ae;
  while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
    const char *component = archive_entry_pathname(ae);
    if (component == nullptr) {
      continue;
    }
    TessdataType type;
    if (!TessdataTypeFromFileName(component, &type)) {
      // Not a tessdata component (for example directory or manifest entries).
      continue;
    }
    const int64_t size = archive_entry_size(ae);
    if (size <= 0) {
      continue;
    }
    entries_[type].resize_no_init(size);
    if (archive_read_data(a, &entries_[type][0], size) == size) {
      is_loaded_ = true;
    } else {
      // Short read or read error: the buffer was resized without
      // initialization, so drop it instead of keeping undefined bytes.
      entries_[type].clear();
    }
  }

  const bool result = is_loaded_;
  archive_read_free(a);
  return result;
}
#endif

bool TessdataManager::Init(const char *data_file_name) {
GenericVector<char> data;
if (reader_ == nullptr) {
#if defined(HAVE_LIBARCHIVE)
if (LoadArchiveFile(data_file_name)) return true;
#endif
if (!LoadDataFromFile(data_file_name, &data)) return false;
} else {
if (!(*reader_)(data_file_name, &data)) return false;
Expand All @@ -65,6 +110,7 @@ bool TessdataManager::Init(const char *data_file_name) {
// Loads from the given memory buffer as if a file.
bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
int size) {
// TODO: This method supports only the proprietary file format.
Clear();
data_file_name_ = name;
TFile fp;
Expand All @@ -78,10 +124,10 @@ bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
GenericVector<int64_t> offset_table;
offset_table.resize_no_init(num_entries);
if (!fp.DeSerialize(&offset_table[0], num_entries)) return false;
for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
if (offset_table[i] >= 0) {
int64_t entry_size = size - offset_table[i];
int j = i + 1;
unsigned j = i + 1;
while (j < num_entries && offset_table[j] == -1) ++j;
if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
entries_[i].resize_no_init(entry_size);
Expand All @@ -106,6 +152,7 @@ void TessdataManager::OverwriteEntry(TessdataType type, const char *data,
// Saves to the given filename.
bool TessdataManager::SaveFile(const STRING &filename,
FileWriter writer) const {
// TODO: This method supports only the proprietary file format.
ASSERT_HOST(is_loaded_);
GenericVector<char> data;
Serialize(&data);
Expand All @@ -117,11 +164,12 @@ bool TessdataManager::SaveFile(const STRING &filename,

// Serializes to the given vector.
void TessdataManager::Serialize(GenericVector<char> *data) const {
// TODO: This method supports only the proprietary file format.
ASSERT_HOST(is_loaded_);
// Compute the offset_table and total size.
int64_t offset_table[TESSDATA_NUM_ENTRIES];
int64_t offset = sizeof(int32_t) + sizeof(offset_table);
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (entries_[i].empty()) {
offset_table[i] = -1;
} else {
Expand All @@ -135,7 +183,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {
fp.OpenWrite(data);
fp.Serialize(&num_entries);
fp.Serialize(&offset_table[0], countof(offset_table));
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (!entries_[i].empty()) {
fp.Serialize(&entries_[i][0], entries_[i].size());
}
Expand All @@ -144,7 +192,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {

// Resets to the initial state, keeping the reader.
void TessdataManager::Clear() {
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
entries_[i].clear();
}
is_loaded_ = false;
Expand All @@ -154,7 +202,7 @@ void TessdataManager::Clear() {
void TessdataManager::Directory() const {
tprintf("Version string:%s\n", VersionString().c_str());
int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (!entries_[i].empty()) {
tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
entries_[i].size(), offset);
Expand Down Expand Up @@ -197,7 +245,7 @@ bool TessdataManager::CombineDataFiles(
const char *language_data_path_prefix,
const char *output_filename) {
// Load individual tessdata components from files.
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
TessdataType type;
ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
STRING filename = language_data_path_prefix;
Expand Down Expand Up @@ -229,6 +277,7 @@ bool TessdataManager::OverwriteComponents(
char **component_filenames,
int num_new_components) {
// Open the files with the new components.
// TODO: This method supports only the proprietary file format.
for (int i = 0; i < num_new_components; ++i) {
TessdataType type;
if (TessdataTypeFromFileName(component_filenames[i], &type)) {
Expand All @@ -253,14 +302,16 @@ bool TessdataManager::ExtractToFile(const char *filename) {

bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix,
TessdataType *type) {
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
*type = static_cast<TessdataType>(i);
return true;
}
}
#if defined(DEBUG)
tprintf("TessdataManager can't determine which tessdata"
" component is represented by %s\n", suffix);
#endif
return false;
}

Expand Down
6 changes: 5 additions & 1 deletion src/ccutil/tessdatamanager.h
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,11 @@ class TessdataManager {
*/
bool ExtractToFile(const char *filename);

private:

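// Loads the tessdata components from an archive file (any format supported
// by libarchive). Only defined when built with HAVE_LIBARCHIVE.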
bool LoadArchiveFile(const char *filename);

/**
* Fills type with TessdataType of the tessdata component represented by the
* given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET.
Expand All @@ -230,7 +235,6 @@ class TessdataManager {
static bool TessdataTypeFromFileName(const char *filename,
TessdataType *type);

private:
// Name of file it came from.
STRING data_file_name_;
// Function to load the file when we need it.
Expand Down
20 changes: 20 additions & 0 deletions src/training/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -280,3 +280,23 @@ set_unicharset_properties_LDADD += $(LEPTONICA_LIBS)
text2image_LDADD += $(LEPTONICA_LIBS)
unicharset_extractor_LDADD += $(LEPTONICA_LIBS)
wordlist2dawg_LDADD += $(LEPTONICA_LIBS)

extralib = $(libarchive_LIBS)

if !DISABLED_LEGACY_ENGINE
ambiguous_words_LDADD += $(extralib)
classifier_tester_LDADD += $(extralib)
cntraining_LDADD += $(extralib)
mftraining_LDADD += $(extralib)
shapeclustering_LDADD += $(extralib)
endif
combine_lang_model_LDADD += $(extralib)
combine_tessdata_LDADD += $(extralib)
dawg2wordlist_LDADD += $(extralib)
lstmeval_LDADD += $(extralib)
lstmtraining_LDADD += $(extralib)
merge_unicharsets_LDADD += $(extralib)
set_unicharset_properties_LDADD += $(extralib)
text2image_LDADD += $(extralib)
unicharset_extractor_LDADD += $(extralib)
wordlist2dawg_LDADD += $(extralib)

0 comments on commit 1c7e006

Please sign in to comment.