Skip to content

Commit

Permalink
unittest: Fix and enable normstrngs_test
Browse files Browse the repository at this point in the history
Signed-off-by: Stefan Weil <[email protected]>
  • Loading branch information
stweil committed Jun 16, 2019
1 parent 73e5241 commit bbd3626
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 51 deletions.
5 changes: 4 additions & 1 deletion unittest/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@ check_PROGRAMS += loadlang_test
check_PROGRAMS += mastertrainer_test
check_PROGRAMS += matrix_test
# check_PROGRAMS += networkio_test
# check_PROGRAMS += normstrngs_test
check_PROGRAMS += nthitem_test
check_PROGRAMS += osd_test
# check_PROGRAMS += pagesegmode_test
Expand Down Expand Up @@ -158,6 +157,7 @@ check_PROGRAMS += lstm_recode_test
check_PROGRAMS += lstm_squashed_test
check_PROGRAMS += lstm_test
check_PROGRAMS += lstmtrainer_test
check_PROGRAMS += normstrngs_test
check_PROGRAMS += unichar_test
check_PROGRAMS += unicharcompress_test
check_PROGRAMS += unicharset_test
Expand Down Expand Up @@ -271,6 +271,9 @@ mastertrainer_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_
matrix_test_SOURCES = matrix_test.cc
matrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

normstrngs_test_SOURCES = normstrngs_test.cc
normstrngs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)

nthitem_test_SOURCES = nthitem_test.cc
nthitem_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

Expand Down
124 changes: 74 additions & 50 deletions unittest/normstrngs_test.cc
Original file line number Diff line number Diff line change
@@ -1,21 +1,39 @@
#include "tesseract/training/normstrngs.h"

#include "tesseract/ccutil/strngs.h"
#include "tesseract/ccutil/unichar.h"
#include "tesseract/unittest/normstrngs_test.h"
#include "util/utf8/public/unilib.h"
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "absl/strings/str_format.h" // for absl::StrFormat
#include "include_gunit.h"
#include "normstrngs.h"
#include "normstrngs_test.h"
#include "strngs.h"
#include "unichar.h"
#if defined(HAS_UNILIB_H)
#include "unilib.h"
#endif

#include "include_gunit.h"

namespace tesseract {
namespace {

static string EncodeAsUTF8(const char32 ch32) {
#if defined(HAS_UNILIB_H)
static std::string EncodeAsUTF8(const char32 ch32) {
UNICHAR uni_ch(ch32);
return string(uni_ch.utf8(), uni_ch.utf8_len());
return std::string(uni_ch.utf8(), uni_ch.utf8_len());
}
#endif

TEST(NormstrngsTest, BasicText) {
const char* kBasicText = "AbCd Ef";
string result;
std::string result;
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
GraphemeNorm::kNormalize, kBasicText,
&result));
Expand All @@ -24,7 +42,7 @@ TEST(NormstrngsTest, BasicText) {

TEST(NormstrngsTest, LigatureText) {
const char* kTwoByteLigText = "ĳ"; // U+0133 (ĳ) -> ij
string result;
std::string result;
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
GraphemeNorm::kNormalize, kTwoByteLigText,
&result));
Expand All @@ -39,7 +57,7 @@ TEST(NormstrngsTest, LigatureText) {

TEST(NormstrngsTest, OcrSpecificNormalization) {
const char* kSingleQuoteText = "‘Hi"; // U+2018 (‘) -> U+0027 (')
string result;
std::string result;
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
GraphemeNorm::kNormalize, kSingleQuoteText,
&result));
Expand Down Expand Up @@ -80,7 +98,7 @@ const char* kBadlyFormedHinWords[] = {"उपयोक्ताो", "नही
const char* kBadlyFormedThaiWords[] = {"ฤิ", "กา้ํ", "กิำ", "นำ้", "เเก"};

TEST(NormstrngsTest, DetectsCorrectText) {
string chars;
std::string chars;
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
GraphemeNorm::kNormalize, kEngText, &chars));
EXPECT_STREQ(kEngText, chars.c_str());
Expand All @@ -96,13 +114,13 @@ TEST(NormstrngsTest, DetectsCorrectText) {
}

TEST(NormstrngsTest, DetectsIncorrectText) {
for (int i = 0; i < ARRAYSIZE(kBadlyFormedHinWords); ++i) {
for (size_t i = 0; i < ARRAYSIZE(kBadlyFormedHinWords); ++i) {
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
GraphemeNorm::kNormalize,
kBadlyFormedHinWords[i], nullptr))
<< kBadlyFormedHinWords[i];
}
for (int i = 0; i < ARRAYSIZE(kBadlyFormedThaiWords); ++i) {
for (size_t i = 0; i < ARRAYSIZE(kBadlyFormedThaiWords); ++i) {
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
GraphemeNorm::kNormalize,
kBadlyFormedThaiWords[i], nullptr))
Expand All @@ -111,8 +129,8 @@ TEST(NormstrngsTest, DetectsIncorrectText) {
}

TEST(NormstrngsTest, NonIndicTextDoesntBreakIndicRules) {
string nonindic = "Here's some latin text.";
string dest;
std::string nonindic = "Here's some latin text.";
std::string dest;
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
GraphemeNorm::kNormalize, nonindic.c_str(),
&dest))
Expand All @@ -121,59 +139,59 @@ TEST(NormstrngsTest, NonIndicTextDoesntBreakIndicRules) {
}

TEST(NormstrngsTest, NoLonelyJoiners) {
string str = "x\u200d\u0d06\u0d34\u0d02";
std::vector<string> glyphs;
std::string str = "x\u200d\u0d06\u0d34\u0d02";
std::vector<std::string> glyphs;
// Returns true, but the joiner is gone.
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
str.c_str(), &glyphs))
<< PrintString32WithUnicodes(str);
EXPECT_EQ(glyphs.size(), 3);
EXPECT_EQ(glyphs[0], string("x"));
EXPECT_EQ(glyphs[1], string("\u0d06"));
EXPECT_EQ(glyphs[2], string("\u0d34\u0d02"));
EXPECT_EQ(glyphs[0], std::string("x"));
EXPECT_EQ(glyphs[1], std::string("\u0d06"));
EXPECT_EQ(glyphs[2], std::string("\u0d34\u0d02"));
}

TEST(NormstrngsTest, NoLonelyJoinersPlus) {
string str = "\u0d2a\u200d+\u0d2a\u0d4b";
std::vector<string> glyphs;
std::string str = "\u0d2a\u200d+\u0d2a\u0d4b";
std::vector<std::string> glyphs;
// Returns true, but the joiner is gone.
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
str.c_str(), &glyphs))
<< PrintString32WithUnicodes(str);
EXPECT_EQ(glyphs.size(), 3);
EXPECT_EQ(glyphs[0], string("\u0d2a"));
EXPECT_EQ(glyphs[1], string("+"));
EXPECT_EQ(glyphs[2], string("\u0d2a\u0d4b"));
EXPECT_EQ(glyphs[0], std::string("\u0d2a"));
EXPECT_EQ(glyphs[1], std::string("+"));
EXPECT_EQ(glyphs[2], std::string("\u0d2a\u0d4b"));
}

TEST(NormstrngsTest, NoLonelyJoinersNonAlpha) {
string str = "\u200d+\u200c\u200d";
std::string str = "\u200d+\u200c\u200d";
// Returns true, but the joiners are gone.
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, string("+"));
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, std::string("+"));
str = "\u200d\u200c\u200d";
// Without the plus, the string is invalid.
string result;
std::string result;
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
GraphemeNorm::kNormalize, str.c_str(),
&result))
<< PrintString32WithUnicodes(result);
}

TEST(NormstrngsTest, JoinersStayInArabic) {
string str = "\u0628\u200c\u0628\u200d\u0628";
std::string str = "\u0628\u200c\u0628\u200d\u0628";
// Returns true, string untouched.
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 5, 5, 2, str);
}

TEST(NormstrngsTest, DigitOK) {
string str = "\u0cea"; // Digit 4.
std::string str = "\u0cea"; // Digit 4.
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str);
}

TEST(NormstrngsTest, DandaOK) {
string str = "\u0964"; // Single danda.
std::string str = "\u0964"; // Single danda.
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str);
str = "\u0965"; // Double danda.
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str);
Expand All @@ -182,7 +200,7 @@ TEST(NormstrngsTest, DandaOK) {
TEST(NormstrngsTest, AllScriptsRegtest) {
// Tests some valid text in a large number of scripts, some of which were
// found to be rejected by an earlier version.
const std::vector<std::pair<string, string>> kScriptText(
const std::vector<std::pair<std::string, std::string>> kScriptText(
{{"Arabic",
" فكان منهم علقمة بن قيس ، و إبراهيم النخعي ، و الأسود بن"
"توفي بالمدينة في هذه السنة وهو ابن مائة وعشرين سنة "
Expand Down Expand Up @@ -297,7 +315,7 @@ TEST(NormstrngsTest, AllScriptsRegtest) {
"Cặp câu đói súc tích mà sâu sắc, là lời chúc lời"}});

for (const auto& p : kScriptText) {
string normalized;
std::string normalized;
EXPECT_TRUE(tesseract::NormalizeUTF8String(
tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize,
tesseract::GraphemeNorm::kNormalize, p.second.c_str(), &normalized))
Expand All @@ -313,7 +331,7 @@ TEST(NormstrngsTest, IsWhitespace) {
EXPECT_TRUE(IsWhitespace('\n'));
// U+2000 through U+200A
for (char32 ch = 0x2000; ch <= 0x200A; ++ch) {
SCOPED_TRACE(StringPrintf("Failed at U+%x", ch));
SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
EXPECT_TRUE(IsWhitespace(ch));
}
// U+3000 is whitespace
Expand Down Expand Up @@ -345,29 +363,33 @@ TEST(NormstrngsTest, SpanUTF8NotWhitespace) {
EXPECT_EQ(12, SpanUTF8NotWhitespace(kMixedText));
}

#if defined(HAS_UNILIB_H)
// Test that the method clones the util/utf8/public/unilib definition of
// interchange validity.
TEST(NormstrngsTest, IsInterchangeValid) {
const int32 kMinUnicodeValue = 33;
const int32 kMaxUnicodeValue = 0x10FFFF;
for (int32 ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
SCOPED_TRACE(StringPrintf("Failed at U+%x", ch));
const int32_t kMinUnicodeValue = 33;
const int32_t kMaxUnicodeValue = 0x10FFFF;
for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
EXPECT_EQ(UniLib::IsInterchangeValid(ch), IsInterchangeValid(ch));
}
}
#endif

#if defined(HAS_UNILIB_H)
// Test that the method clones the util/utf8/public/unilib definition of
// 7-bit ASCII interchange validity.
TEST(NormstrngsTest, IsInterchangeValid7BitAscii) {
const int32 kMinUnicodeValue = 33;
const int32 kMaxUnicodeValue = 0x10FFFF;
for (int32 ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
SCOPED_TRACE(StringPrintf("Failed at U+%x", ch));
string str = EncodeAsUTF8(ch);
const int32_t kMinUnicodeValue = 33;
const int32_t kMaxUnicodeValue = 0x10FFFF;
for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
std::string str = EncodeAsUTF8(ch);
EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str),
IsInterchangeValid7BitAscii(ch));
}
}
#endif

// Test that the method clones the util/utf8/public/unilib definition of
// fullwidth-to-halfwidth conversion.
Expand All @@ -379,16 +401,18 @@ TEST(NormstrngsTest, FullwidthToHalfwidth) {
// U+FFE6 -> U+20A9 (won sign)
EXPECT_EQ(0x20A9, FullwidthToHalfwidth(0xFFE6));

const int32 kMinUnicodeValue = 33;
const int32 kMaxUnicodeValue = 0x10FFFF;
for (int32 ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
#if defined(HAS_UNILIB_H)
const int32_t kMinUnicodeValue = 33;
const int32_t kMaxUnicodeValue = 0x10FFFF;
for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
if (!IsValidCodepoint(ch)) continue;
SCOPED_TRACE(StringPrintf("Failed at U+%x", ch));
string str = EncodeAsUTF8(ch);
const string expected_half_str =
SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
std::string str = EncodeAsUTF8(ch);
const std::string expected_half_str =
UniLib::FullwidthToHalfwidth(str.c_str(), str.length(), true);
EXPECT_EQ(expected_half_str, EncodeAsUTF8(FullwidthToHalfwidth(ch)));
}
#endif
}

} // namespace
Expand Down

0 comments on commit bbd3626

Please sign in to comment.