Skip to content

Commit

Permalink
change validate javanese similar to indic
Browse files Browse the repository at this point in the history
  • Loading branch information
Shreeshrii committed Aug 4, 2018
1 parent f93f9e8 commit 7957288
Show file tree
Hide file tree
Showing 2 changed files with 231 additions and 66 deletions.
273 changes: 210 additions & 63 deletions src/training/validate_javanese.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**********************************************************************
* File: validate_javanese.cpp
* File: validate_javanese.cpp
* Description: Text validator for Javanese Script - aksara jawa.
* Author: Shree Devi Kumar
* Created: August 03, 2018
Expand All @@ -16,7 +16,7 @@
*
**********************************************************************/

#include "validate_javanese.h"
#include "validate_javanese.h"
#include "errcode.h"
#include "tprintf.h"

Expand All @@ -26,73 +26,40 @@ namespace tesseract {
// Taken from unicode standard:
// http://www.unicode.org/charts/PDF/UA980.pdf
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
// The order of components in an orthographic syllable as expressed in BNF is:
// {C F} C {{R}Y} {V{A}} {Z}
// Translated to the codes used by the CharClass enum:
// [(V|C[N])(H)] (V|C[N]) [[R]Y] [M[D]] [D]
// Also the Consonant class here includes independent vowels, as they are
// treated the same anyway.
// Indic - for reference
// + vowel Grapheme: V[D](v)*
// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*

bool ValidateJavanese::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
if (codes_used_ == num_codes) return false;
if (codes_[codes_used_].first == CharClass::kOther) {
UseMultiCode(1);
return true;
}
if (codes_[codes_used_].first != CharClass::kConsonant) {
if (report_errors_) {
tprintf("Invalid start of Javanese syllable:0x%x\n",
codes_[codes_used_].second);
}
return false;
}
if (UseMultiCode(1)) return true;
if ( codes_[codes_used_].first == CharClass::kNukta) {
if (UseMultiCode(1)) return true;
}
while (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
if (codes_[codes_used_].first == CharClass::kRobat) {
if (UseMultiCode(1)) return true;
}
}
int num_matra_parts = 0;
if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (CodeOnlyToOutput()) {
switch (codes_[codes_used_].first) {
case CharClass::kConsonant:
return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
case CharClass::kVowel:
case CharClass::kVedicMark:
return ConsumeVowelIfValid();
case CharClass::kZeroWidthJoiner:
case CharClass::kZeroWidthNonJoiner:
// Apart from within an aksara, joiners are silently dropped.
if (report_errors_)
tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
++codes_used_;
return true;
case CharClass::kOther:
UseMultiCode(1);
return true;
default:
if (report_errors_) {
tprintf("Unterminated joiner: 0x%x\n", output_.back());
tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
codes_[codes_used_].first, codes_[codes_used_].second);
}
return false;
}
++num_matra_parts;
}
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its
// own or as an addition to other matras.
if (codes_[codes_used_].first == CharClass::kMatra) {
++num_matra_parts;
if (UseMultiCode(num_matra_parts)) return true;
} else if (num_matra_parts) {
if (report_errors_) {
tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
output_.back(), codes_[codes_used_].second);
}
return false;
}
if (codes_[codes_used_].first == CharClass::kMatraPiece &&
codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
if (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
}
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
}
return true;
}

Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
Expand All @@ -106,11 +73,191 @@ Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
if (off < 0x4) return CharClass::kVowelModifier;
if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels
if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU
if (off == 0x34) return CharClass::kVowelModifier; // A9B4 TARUNG
if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
if (off <= 0x3d) return CharClass::kMatra;
if (off <= 0x3f) return CharClass::kVowelModifier; // A9BE-A9BF PENGKAL-CAKRA
if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON
return CharClass::kOther;
}

// Helper consumes/copies a virama and any associated post-virama joiners.
// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
// no joiner at all) must be followed by a consonant.
// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
// consonant, space, or character from a different script. We clean up the
// representation to make it consistent by adding a ZWNJ if missing from a
// non-linking virama. Returns false with an invalid sequence.
//
// joiner: the pre-virama joiner found by the caller; a pair whose class is
//   CharClass::kOther means that no pre-virama joiner was present.
// post_matra: when true, a ZWJ after the virama is rejected and the virama is
//   handled through the explicit [H z] branch even if a consonant follows.
// NOTE(review): CodeOnlyToOutput(), UseMultiCode() and MultiCodePart() are not
// defined in this file; the comments below assume they copy input codes to
// output_ / emit parts and advance codes_used_ -- confirm against Validator.
bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
int num_codes = codes_.size();
// Case 1: no pre-virama joiner was supplied.
if (joiner.first == CharClass::kOther) {
// Presumably moves the virama itself to the output and steps past it.
CodeOnlyToOutput();
if (codes_used_ < num_codes &&
codes_[codes_used_].second == kZeroWidthJoiner) {
// Post-matra viramas must be explicit, so no joiners allowed here.
if (post_matra) {
if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
return false;
}
// Deferred case: the code after the ZWJ is a ZWNJ, Pengkal or Cakra, and the
// code two positions back is not itself a Cakra.
// NOTE(review): codes_used_ - 2 assumes at least two codes were already
// consumed before reaching this point -- confirm for every caller.
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_ - 2].second != kCakra &&
(codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
codes_[codes_used_ + 1].second == kPengkal ||
codes_[codes_used_ + 1].second == kCakra)) {
// This combination will be picked up later.
ASSERT_HOST(!CodeOnlyToOutput());
} else {
// Half-form with optional Nukta.
// len counts everything pending in output_ plus one more input code
// (presumably the ZWJ).
int len = output_.size() + 1 - output_used_;
if (UseMultiCode(len)) return true;
}
// A ZWNJ at this position is accepted only when the first pending output
// code is kCakra; anything else is reported and rejected.
if (codes_used_ < num_codes &&
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (output_used_ == output_.size() ||
output_[output_used_] != kCakra) {
if (report_errors_) {
tprintf("Virama ZWJ ZWNJ : base=0x%x!\n",
static_cast<int>(script_));
}
return false;
}
}
} else if (codes_used_ == num_codes ||
codes_[codes_used_].first != CharClass::kConsonant ||
post_matra) {
// Non-linking virama: end of input, a non-consonant follows, or the virama
// came after the matra tail.
if (codes_used_ == num_codes ||
codes_[codes_used_].second != kZeroWidthNonJoiner) {
// It is valid to have an unterminated virama at the end of a word, but
// for consistency, we will always add ZWNJ if not present.
output_.push_back(kZeroWidthNonJoiner);
} else {
// The ZWNJ is already in the input, so take it from there.
CodeOnlyToOutput();
}
// Explicit virama [H z]
MultiCodePart(2);
}
// Otherwise (no ZWJ, a consonant follows, not post_matra) nothing more is
// done here; the virama stays pending for the caller.
} else {
// Case 2: the caller saw a joiner before the virama.
// Pre-virama joiner [{Z|z} H] requests specific conjunct.
// A true return from UseMultiCode(2) is treated as an error here
// (presumably it signals that the input is exhausted -- confirm).
if (UseMultiCode(2)) {
if (report_errors_)
tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
return false;
}
// A joiner on both sides of the virama is rejected.
if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (report_errors_) {
tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
codes_[codes_used_].second);
}
return false;
}
}
// It is good so far as it goes.
return true;
}

// Helper consumes/copies a series of consonants separated by viramas while
// valid, but not any vowel or other modifiers.
// Returns false only when ConsumeViramaIfValid rejects a virama. Returns true
// otherwise, including the case where a joiner is the last input code.
// NOTE(review): CodeOnlyToOutput(), MultiCodePart(), IsVirama() and
// IsSubscriptScript() are not defined in this file; comments about them are
// assumptions to confirm against Validator.
bool ValidateJavanese::ConsumeConsonantHeadIfValid() {
const int num_codes = codes_.size();
// Consonant aksara
do {
// Presumably copies the current consonant to output_ and steps past it; the
// loop condition below guarantees a consonant on every later iteration.
CodeOnlyToOutput();
// Special case of medial consonants [H Z Pengkal/Cakra].
// Looks at the last three pending output codes for virama, ZWJ and then
// Pengkal or Cakra.
int index = output_.size() - 3;
if (output_used_ <= index &&
(output_.back() == kPengkal || output_.back() == kCakra) &&
IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
MultiCodePart(3);
}
// Optional nukta directly after the consonant.
bool have_nukta = false;
if (codes_used_ < num_codes &&
codes_[codes_used_].first == CharClass::kNukta) {
have_nukta = true;
CodeOnlyToOutput();
}
// Test for subscript conjunct.
// index is where a virama would sit if the pending output ends in
// [virama, consonant, optional nukta].
index = output_.size() - 2 - have_nukta;
if (output_used_ <= index && IsSubscriptScript() &&
IsVirama(output_[index])) {
// Output previous virama, consonant + optional nukta.
MultiCodePart(2 + have_nukta);
}
// CharClass::kOther marks "no pre-virama joiner".
IndicPair joiner(CharClass::kOther, 0);
// A ZWJ always counts as a candidate joiner; a ZWNJ counts only when script_
// is kMalayalam.
// NOTE(review): the Malayalam test looks inherited from the Indic validator;
// confirm whether it can ever hold for Javanese input.
if (codes_used_ < num_codes &&
(codes_[codes_used_].second == kZeroWidthJoiner ||
(codes_[codes_used_].second == kZeroWidthNonJoiner &&
script_ == ViramaScript::kMalayalam))) {
joiner = codes_[codes_used_];
// Step over the joiner without copying it. If it was the last input code it
// is dropped and the head is accepted as it stands.
if (++codes_used_ == num_codes) {
if (report_errors_) {
tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
joiner.second);
}
return true;
}
// The joiner is kept only when a virama follows it; otherwise it is
// discarded and joiner is reset to "none".
if (codes_[codes_used_].first == CharClass::kVirama) {
output_.push_back(joiner.second);
} else {
if (report_errors_) {
tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
output_.back(), joiner.second, codes_[codes_used_].second);
}
joiner = std::make_pair(CharClass::kOther, 0);
}
}
if (codes_used_ < num_codes &&
codes_[codes_used_].first == CharClass::kVirama) {
// post_matra is false: this virama is inside the consonant head.
if (!ConsumeViramaIfValid(joiner, false)) return false;
} else {
break; // No virama, so the run of consonants is over.
}
} while (codes_used_ < num_codes &&
codes_[codes_used_].first == CharClass::kConsonant);
// Hand anything still pending in output_ to MultiCodePart(1).
if (output_used_ < output_.size()) MultiCodePart(1);
return true;
}

// Helper consumes/copies a tail part of a consonant, comprising optional
// matra/piece, vowel modifier, vedic mark, terminating virama.
bool ValidateJavanese::ConsumeConsonantTailIfValid() {
if (codes_used_ == codes_.size()) return true;
// No virama: Finish the grapheme.
// Are multiple matras allowed?
if (codes_[codes_used_].first == CharClass::kMatra) {
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].first == CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
}
while (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
}
while (codes_[codes_used_].first == CharClass::kVedicMark) {
if (UseMultiCode(1)) return true;
}
if (codes_[codes_used_].first == CharClass::kVirama) {
if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
return false;
}
}
// What we have consumed so far is a valid consonant cluster.
if (output_used_ < output_.size()) MultiCodePart(1);

return true;
}

// Consumes one vowel, then any run of vowel modifiers, then any run of vedic
// marks. Always returns true.
// NOTE(review): `finished` mirrors the return value of UseMultiCode(1), which
// the original code treats as "stop now"; presumably it means the input is
// exhausted -- confirm against Validator::UseMultiCode.
bool ValidateJavanese::ConsumeVowelIfValid() {
  bool finished = UseMultiCode(1);
  while (!finished &&
         codes_[codes_used_].first == CharClass::kVowelModifier) {
    finished = UseMultiCode(1);
  }
  while (!finished && codes_[codes_used_].first == CharClass::kVedicMark) {
    finished = UseMultiCode(1);
  }
  // Whatever was consumed is a valid vowel cluster.
  return true;
}

} // namespace tesseract

24 changes: 21 additions & 3 deletions src/training/validate_javanese.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**********************************************************************
* File: validate_javanese.h
* File: validate_javanese.h
* Description: Text validator for Javanese Script - aksara jawa.
* Author: Shree Devi Kumar
* Created: August 03, 2018
Expand All @@ -21,9 +21,11 @@

#include "validator.h"


namespace tesseract {

// Subclass of Validator that validates and segments Javanese.
// Subclass of Validator that validates and segments Javanese scripts

class ValidateJavanese : public Validator {
public:
ValidateJavanese(ViramaScript script, bool report_errors)
Expand All @@ -37,7 +39,23 @@ class ValidateJavanese : public Validator {
// otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch.
CharClass UnicodeToCharClass(char32 ch) const override;
Validator::CharClass UnicodeToCharClass(char32 ch) const override;

private:
// Helper consumes/copies a virama and any associated post-virama joiners.
bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
// Helper consumes/copies a series of consonants separated by viramas while
// valid, but not any vowel or other modifiers.
bool ConsumeConsonantHeadIfValid();
// Helper consumes/copies a tail part of a consonant, comprising optional
// matra/piece, vowel modifier, vedic mark, terminating virama.
bool ConsumeConsonantTailIfValid();
// Helper consumes/copies a vowel and optional modifiers.
bool ConsumeVowelIfValid();

// Some special unicodes used only for Javanese processing.
static const char32 kPengkal = 0xa9be; // Javanese Ya
static const char32 kCakra = 0xa9bf; // Javanese Ra
};

} // namespace tesseract
Expand Down

0 comments on commit 7957288

Please sign in to comment.