From 70daecf26700e455b2cc2ec2b422269c2a946666 Mon Sep 17 00:00:00 2001 From: Shree Devi Kumar Date: Mon, 27 Aug 2018 21:00:35 +0000 Subject: [PATCH] Javanese Validation works now - for the most part --- src/training/validate_javanese.cpp | 57 +++++++++++++++++------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/src/training/validate_javanese.cpp b/src/training/validate_javanese.cpp index ab36d50ab2..435c362888 100644 --- a/src/training/validate_javanese.cpp +++ b/src/training/validate_javanese.cpp @@ -38,7 +38,7 @@ namespace tesseract { // + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)* bool ValidateJavanese::ConsumeGraphemeIfValid() { - switch (codes_[codes_used_].first) { + switch (codes_[codes_used_].first) { case CharClass::kConsonant: return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid(); case CharClass::kVowel: @@ -63,25 +63,6 @@ bool ValidateJavanese::ConsumeGraphemeIfValid() { } } -Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const { - if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner; - if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner; - // Offset from the start of the relevant unicode code block aka code page. - int off = ch - static_cast<char32>(script_); - // Anything in another code block is other. 
- if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther; - if (off < 0x4) return CharClass::kVowelModifier; - if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels - if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU - if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels - if (off <= 0x39) return CharClass::kMatra; - if (off <= 0x3a) return CharClass::kConsonant; // A9BA TALING - pre base vowel - if (off <= 0x3d) return CharClass::kMatra; - if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants - if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON - return CharClass::kOther; -} - // Helper consumes/copies a virama and any associated post-virama joiners. // A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or // no joiner at all) must be followed by a consonant. @@ -117,11 +98,13 @@ bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) { if (output_used_ == output_.size() || output_[output_used_] != kCakra) { if (report_errors_) { - tprintf("Virama ZWJ ZWNJ : base=0x%x!\n", + tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n", static_cast<int>(script_)); } return false; } + // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z] + if (UseMultiCode(4)) return true; } } else if (codes_used_ == num_codes || codes_[codes_used_].first != CharClass::kConsonant || @@ -130,7 +113,7 @@ bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) { codes_[codes_used_].second != kZeroWidthNonJoiner) { // It is valid to have an unterminated virama at the end of a word, but // for consistency, we will always add ZWNJ if not present. 
- output_.push_back(kZeroWidthNonJoiner); + CodeOnlyToOutput(); } else { CodeOnlyToOutput(); } @@ -164,7 +147,7 @@ bool ValidateJavanese::ConsumeConsonantHeadIfValid() { // Consonant aksara do { CodeOnlyToOutput(); - // Special case of medial consonants [H Z Pengkal/Cakra]. + // Special Sinhala case of [H Z Yayana/Rayana]. int index = output_.size() - 3; if (output_used_ <= index && (output_.back() == kPengkal || output_.back() == kCakra) && @@ -186,7 +169,9 @@ bool ValidateJavanese::ConsumeConsonantHeadIfValid() { } IndicPair joiner(CharClass::kOther, 0); if (codes_used_ < num_codes && - (codes_[codes_used_].second == kZeroWidthJoiner )) { + (codes_[codes_used_].second == kZeroWidthJoiner || + (codes_[codes_used_].second == kZeroWidthNonJoiner && + script_ == ViramaScript::kMalayalam))) { joiner = codes_[codes_used_]; if (++codes_used_ == num_codes) { if (report_errors_) { @@ -236,6 +221,8 @@ bool ValidateJavanese::ConsumeConsonantTailIfValid() { } while (codes_[codes_used_].first == CharClass::kVowelModifier) { if (UseMultiCode(1)) return true; + // Only Malayalam allows only repeated 0xd02. + if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break; } while (codes_[codes_used_].first == CharClass::kVedicMark) { if (UseMultiCode(1)) return true; @@ -256,6 +243,8 @@ bool ValidateJavanese::ConsumeVowelIfValid() { if (UseMultiCode(1)) return true; while (codes_[codes_used_].first == CharClass::kVowelModifier) { if (UseMultiCode(1)) return true; + // Only Malayalam allows repeated modifiers? + if (script_ != ViramaScript::kMalayalam) break; } while (codes_[codes_used_].first == CharClass::kVedicMark) { if (UseMultiCode(1)) return true; @@ -263,5 +252,25 @@ bool ValidateJavanese::ConsumeVowelIfValid() { // What we have consumed so far is a valid vowel cluster. 
return true; } + + +Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const { + if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner; + if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner; + // Offset from the start of the relevant unicode code block aka code page. + int off = ch - static_cast<char32>(script_); + // Anything in another code block is other. + if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther; + if (off < 0x4) return CharClass::kVowelModifier; + if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels + if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU + if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels + if (off <= 0x39) return CharClass::kMatra; + if (off <= 0x3a) return CharClass::kConsonant; // A9BA TALING - pre base vowel + if (off <= 0x3d) return CharClass::kMatra; + if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants + if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON + return CharClass::kOther; +} } // namespace tesseract