Skip to content

Commit

Permalink
unittest: Fix and enable pango_font_info_test
Browse files Browse the repository at this point in the history
Signed-off-by: Stefan Weil <[email protected]>
  • Loading branch information
stweil committed Jun 28, 2019
1 parent 04d85b4 commit 40c1cf6
Showing 2 changed files with 85 additions and 54 deletions.
17 changes: 11 additions & 6 deletions unittest/Makefile.am
Original file line number Diff line number Diff line change
@@ -132,7 +132,6 @@ check_PROGRAMS += matrix_test
check_PROGRAMS += nthitem_test
check_PROGRAMS += osd_test
# check_PROGRAMS += pagesegmode_test
# check_PROGRAMS += pango_font_info_test
check_PROGRAMS += paragraphs_test
check_PROGRAMS += params_model_test
check_PROGRAMS += progress_test
@@ -159,6 +158,7 @@ check_PROGRAMS += lstm_squashed_test
check_PROGRAMS += lstm_test
check_PROGRAMS += lstmtrainer_test
check_PROGRAMS += normstrngs_test
check_PROGRAMS += pango_font_info_test
check_PROGRAMS += unichar_test
check_PROGRAMS += unicharcompress_test
check_PROGRAMS += unicharset_test
@@ -279,18 +279,23 @@ normstrngs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(ICU_I18N
nthitem_test_SOURCES = nthitem_test.cc
nthitem_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

#pango_font_info_test_SOURCES = pango_font_info_test.cc
#pango_font_info_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
osd_test_SOURCES = osd_test.cc
osd_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)

pango_font_info_test_SOURCES = pango_font_info_test.cc
pango_font_info_test_SOURCES += third_party/utf/rune.c
pango_font_info_test_SOURCES += util/utf8/unicodetext.cc util/utf8/unilib.cc
pango_font_info_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(LEPTONICA_LIBS)
pango_font_info_test_LDADD += $(ICU_I18N_LIBS) -lfontconfig
pango_font_info_test_LDADD += -lpangocairo-1.0 -lpangoft2-1.0
pango_font_info_test_LDADD += $(cairo_LIBS) $(pango_LIBS)

paragraphs_test_SOURCES = paragraphs_test.cc
paragraphs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)

params_model_test_SOURCES = params_model_test.cc
params_model_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

osd_test_SOURCES = osd_test.cc
osd_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)

progress_test_SOURCES = progress_test.cc
progress_test_LDFLAGS = $(OPENCL_LDFLAGS) $(LEPTONICA_LIBS)
progress_test_LDADD = $(GTEST_LIBS) $(GMOCK_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
122 changes: 74 additions & 48 deletions unittest/pango_font_info_test.cc
Original file line number Diff line number Diff line change
@@ -1,12 +1,24 @@

#include "tesseract/training/pango_font_info.h"

#include <stdio.h>
#include <string.h>

#include "pango/pango.h"
#include "tesseract/training/commandlineflags.h"
#include "tesseract/training/fileio.h"
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cstdio>
#include <string>
#include <pango/pango.h>
#include "include_gunit.h"
#include "commandlineflags.h"
#include "fileio.h"
#include "pango_font_info.h"
#include "absl/strings/str_cat.h" // for absl::StrCat
#include "gmock/gmock-matchers.h" // for EXPECT_THAT
#include "util/utf8/unicodetext.h" // for UnicodeText

DECLARE_STRING_PARAM_FLAG(fonts_dir);
DECLARE_STRING_PARAM_FLAG(fontconfig_tmpdir);
@@ -19,19 +31,19 @@ using tesseract::FontUtils;
using tesseract::PangoFontInfo;

// Fonts in testdata directory
const char* kExpectedFontNames[] = {"Arab",
"Arial Bold Italic",
"DejaVu Sans Ultra-Light",
"Lohit Hindi",
const char* kExpectedFontNames[] = {
"Arab",
"Arial Bold Italic",
"DejaVu Sans Ultra-Light",
"Lohit Hindi",
#if PANGO_VERSION <= 12005
"Times New Roman",
"Times New Roman",
#else
"Times New Roman,", // Pango v1.36.2
// requires a trailing
// ','
"Times New Roman,", // Pango v1.36.2 requires a trailing ','
#endif
"UnBatang",
"Verdana"};
"UnBatang",
"Verdana"
};

// Sample text used in tests.
const char kArabicText[] = "والفكر والصراع 1234,\nوالفكر والصراع";
@@ -41,23 +53,27 @@ const char kKorText[] = "이는 것으로";
// Hindi words containing illegal vowel sequences.
const char* kBadlyFormedHinWords[] = {
#if PANGO_VERSION <= 12005
"उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस",
"उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस",
#endif
// Pango v1.36.2 will render the above words even though they are invalid.
"प्रंात", nullptr};
// Pango v1.36.2 will render the above words even though they are invalid.
"प्रंात", nullptr
};

class PangoFontInfoTest : public ::testing::Test {
protected:
void SetUp() override {
std::locale::global(std::locale(""));
static std::locale system_locale("");
std::locale::global(system_locale);
}

// Creates a fake fonts.conf file that points to the testdata fonts for
// fontconfig to initialize with.
static void SetUpTestCase() {
FLAGS_fonts_dir = File::JoinPath(FLAGS_test_srcdir, "testdata");
FLAGS_fonts_dir = TESTING_DIR;
FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
#ifdef GOOGLE_TESSERACT
FLAGS_use_only_legacy_fonts = false;
#endif
}

PangoFontInfo font_info_;
@@ -120,7 +136,7 @@ TEST_F(PangoFontInfoTest, CanRenderLigature) {
font_info_.ParseFontDescriptionName("Arab 12");
const char kArabicLigature[] = "لا";
EXPECT_TRUE(
font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));
font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));

printf("Next word\n");
EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText)));
@@ -143,17 +159,17 @@ TEST_F(PangoFontInfoTest, CannotRenderInvalidString) {
TEST_F(PangoFontInfoTest, CanDropUncoveredChars) {
font_info_.ParseFontDescriptionName("Verdana 12");
// Verdana cannot render the "ﬀ" ligature (U+FB00)
string word = "oﬀice";
std::string word = "oﬀice";
EXPECT_EQ(1, font_info_.DropUncoveredChars(&word));
EXPECT_EQ("oice", word);

// Don't drop non-letter characters like word joiners.
const char* kJoiners[] = {
"\u2060", // U+2060 (WJ)
"\u200C", // U+200C (ZWNJ)
"\u200D" // U+200D (ZWJ)
"\u2060", // U+2060 (WJ)
"\u200C", // U+200C (ZWNJ)
"\u200D" // U+200D (ZWJ)
};
for (int i = 0; i < ARRAYSIZE(kJoiners); ++i) {
for (size_t i = 0; i < ARRAYSIZE(kJoiners); ++i) {
word = kJoiners[i];
EXPECT_EQ(0, font_info_.DropUncoveredChars(&word));
EXPECT_STREQ(kJoiners[i], word.c_str());
@@ -167,17 +183,21 @@ class FontUtilsTest : public ::testing::Test {
// Creates a fake fonts.conf file that points to the testdata fonts for
// fontconfig to initialize with.
static void SetUpTestCase() {
FLAGS_fonts_dir = File::JoinPath(FLAGS_test_srcdir, "testdata");
FLAGS_fonts_dir = TESTING_DIR;
FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
}

void CountUnicodeChars(const char* utf8_text,
std::unordered_map<char32, inT64>* ch_map) {
std::unordered_map<char32, int64_t>* ch_map) {
ch_map->clear();
UnicodeText ut;
ut.PointToUTF8(utf8_text, strlen(utf8_text));
for (UnicodeText::const_iterator it = ut.begin(); it != ut.end(); ++it) {
#if 0
if (UnicodeProps::IsWhitespace(*it)) continue;
#else
if (std::isspace(*it)) continue;
#endif
++(*ch_map)[*it];
}
}
@@ -206,21 +226,21 @@ TEST_F(FontUtilsTest, DoesDetectMissingFonts) {
}

TEST_F(FontUtilsTest, DoesListAvailableFonts) {
const std::vector<string>& fonts = FontUtils::ListAvailableFonts();
const std::vector<std::string>& fonts = FontUtils::ListAvailableFonts();
EXPECT_THAT(fonts, ::testing::ElementsAreArray(kExpectedFontNames));
for (int i = 0; i < fonts.size(); ++i) {
for (auto& font : fonts) {
PangoFontInfo font_info;
EXPECT_TRUE(font_info.ParseFontDescriptionName(fonts[i]));
EXPECT_TRUE(font_info.ParseFontDescriptionName(font));
}
}

TEST_F(FontUtilsTest, DoesFindBestFonts) {
string fonts_list;
std::unordered_map<char32, inT64> ch_map;
std::string fonts_list;
std::unordered_map<char32, int64_t> ch_map;
CountUnicodeChars(kEngText, &ch_map);
EXPECT_EQ(26, ch_map.size()); // 26 letters
std::vector<std::pair<const char*, std::vector<bool> > > font_flags;
string best_list = FontUtils::BestFonts(ch_map, &font_flags);
std::string best_list = FontUtils::BestFonts(ch_map, &font_flags);
EXPECT_TRUE(best_list.size());
// All fonts except Lohit Hindi should render English text.
EXPECT_EQ(ARRAYSIZE(kExpectedFontNames) - 1, font_flags.size());
@@ -238,8 +258,8 @@ TEST_F(FontUtilsTest, DoesSelectFont) {
const char* kLangNames[] = {"Arabic", "English", "Hindi", "Korean", nullptr};
for (int i = 0; kLangText[i] != nullptr; ++i) {
SCOPED_TRACE(kLangNames[i]);
std::vector<string> graphemes;
string selected_font;
std::vector<std::string> graphemes;
std::string selected_font;
EXPECT_TRUE(FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]),
&selected_font, &graphemes));
EXPECT_TRUE(selected_font.size());
@@ -249,28 +269,30 @@ TEST_F(FontUtilsTest, DoesSelectFont) {

TEST_F(FontUtilsTest, DoesFailToSelectFont) {
const char kMixedScriptText[] = "पिताने विवाह की | والفكر والصراع";
std::vector<string> graphemes;
string selected_font;
std::vector<std::string> graphemes;
std::string selected_font;
EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText),
&selected_font, &graphemes));
}

TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
const int32 kHindiChar = 0x0905;
const int32 kArabicChar = 0x0623;
const int32 kMongolianChar = 0x180E; // Mongolian vowel separator
const int32 kOghamChar = 0x1680; // Ogham space mark
const int32_t kHindiChar = 0x0905;
const int32_t kArabicChar = 0x0623;
const int32_t kMongolianChar = 0x180E; // Mongolian vowel separator
const int32_t kOghamChar = 0x1680; // Ogham space mark
std::vector<bool> unicode_mask;
FontUtils::GetAllRenderableCharacters(&unicode_mask);
EXPECT_TRUE(unicode_mask['A']);
EXPECT_TRUE(unicode_mask['1']);
EXPECT_TRUE(unicode_mask[kHindiChar]);
EXPECT_TRUE(unicode_mask[kArabicChar]);
EXPECT_FALSE(unicode_mask[kMongolianChar]); // no font for Mongolian.
#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports Ogham
EXPECT_FALSE(unicode_mask[kOghamChar]); // no font for Ogham.
#endif
unicode_mask.clear();

std::vector<string> selected_fonts;
std::vector<std::string> selected_fonts;
selected_fonts.push_back("Lohit Hindi");
FontUtils::GetAllRenderableCharacters(selected_fonts, &unicode_mask);
EXPECT_TRUE(unicode_mask['1']);
@@ -279,14 +301,18 @@ TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
EXPECT_FALSE(unicode_mask[kArabicChar]); // or Arabic,
EXPECT_FALSE(unicode_mask[kMongolianChar]); // or Mongolian,
EXPECT_FALSE(unicode_mask[kOghamChar]); // or Ogham.
unicode_mask.clear();

// Check that none of the included fonts cover the Mongolian or Ogham space
// characters.
for (int f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) {
for (size_t f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) {
SCOPED_TRACE(absl::StrCat("Testing ", kExpectedFontNames[f]));
FontUtils::GetAllRenderableCharacters(kExpectedFontNames[f], &unicode_mask);
#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports Ogham
EXPECT_FALSE(unicode_mask[kOghamChar]);
#endif
EXPECT_FALSE(unicode_mask[kMongolianChar]);
unicode_mask.clear();
}
}
} // namespace

0 comments on commit 40c1cf6

Please sign in to comment.