-
Notifications
You must be signed in to change notification settings - Fork 9.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial commit to add Aksara Jawa - Javanese script
- Loading branch information
1 parent
e9b4e21
commit 0eb7be1
Showing
7 changed files
with
185 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
/********************************************************************** | ||
* File: validate_javanese.cpp | ||
* Description: Text validator for Javanese Script - aksara jawa. | ||
* Author: Shree Devi Kumar | ||
* Created: August 03, 2018 | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
**********************************************************************/ | ||
|
||
#include "validate_javanese.h" | ||
#include "errcode.h" | ||
#include "tprintf.h" | ||
|
||
namespace tesseract { | ||
|
||
// Returns whether codes matches the pattern for a Javanese Grapheme. | ||
// Taken from unicode standard: | ||
// http://www.unicode.org/charts/PDF/UA980.pdf | ||
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf | ||
// Also the Consonant class here includes independent vowels, as they are | ||
// treated the same anyway. | ||
|
||
// Tries to consume one Javanese grapheme starting at codes_[codes_used_]. | ||
// Returns false when there are no codes left, when the current code cannot | ||
// start a syllable, or when a joiner is not followed by a dependent vowel; | ||
// returns true in every other case. | ||
// The sequence accepted below is, in order: a consonant, an optional nukta, | ||
// any number of virama+consonant pairs, an optional joiner+matra, an optional | ||
// matra piece, an optional vowel modifier and one optional trailing | ||
// virama+consonant pair. | ||
// NOTE(review): UseMultiCode() and CodeOnlyToOutput() are defined outside this | ||
// file. The code below treats a true result from either as "stop now" | ||
// (presumably: no codes remain) -- confirm against validator.h. | ||
bool ValidateJavanese::ConsumeGraphemeIfValid() { | ||
int num_codes = codes_.size(); | ||
// Nothing left to consume. | ||
if (codes_used_ == num_codes) return false; | ||
// A code classified as kOther is consumed on its own and always accepted. | ||
if (codes_[codes_used_].first == CharClass::kOther) { | ||
UseMultiCode(1); | ||
return true; | ||
} | ||
// Anything else must begin with a consonant class code. | ||
if (codes_[codes_used_].first != CharClass::kConsonant) { | ||
if (report_errors_) { | ||
tprintf("Invalid start of Javanese syllable:0x%x\n", | ||
codes_[codes_used_].second); | ||
} | ||
return false; | ||
} | ||
// Consume the leading consonant. | ||
if (UseMultiCode(1)) return true; | ||
// Optional nukta directly after the consonant. | ||
if ( codes_[codes_used_].first == CharClass::kNukta) { | ||
if (UseMultiCode(1)) return true; | ||
} | ||
// Repeated virama + consonant pairs. | ||
while (codes_used_ + 1 < num_codes && | ||
codes_[codes_used_].first == CharClass::kVirama && | ||
codes_[codes_used_ + 1].first == CharClass::kConsonant) { | ||
// The CodeOnlyToOutput() call that handles the virama lives inside the | ||
// assertion expression. | ||
// NOTE(review): this relies on ASSERT_HOST always evaluating its argument -- | ||
// confirm that holds in every build configuration. | ||
ASSERT_HOST(!CodeOnlyToOutput()); | ||
if (UseMultiCode(2)) return true; | ||
// NOTE(review): UnicodeToCharClass in this file never returns kRobat, so | ||
// this branch looks unreachable unless codes_ is filled some other way -- | ||
// confirm. | ||
if (codes_[codes_used_].first == CharClass::kRobat) { | ||
if (UseMultiCode(1)) return true; | ||
} | ||
} | ||
// Counts the codes (optional joiner plus matra) passed to UseMultiCode below. | ||
int num_matra_parts = 0; | ||
if (codes_[codes_used_].second == kZeroWidthJoiner || | ||
codes_[codes_used_].second == kZeroWidthNonJoiner) { | ||
// A true result here is reported as an unterminated joiner. | ||
if (CodeOnlyToOutput()) { | ||
if (report_errors_) { | ||
tprintf("Unterminated joiner: 0x%x\n", output_.back()); | ||
} | ||
return false; | ||
} | ||
++num_matra_parts; | ||
} | ||
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its | ||
// own or as an addition to other matras. | ||
if (codes_[codes_used_].first == CharClass::kMatra) { | ||
++num_matra_parts; | ||
if (UseMultiCode(num_matra_parts)) return true; | ||
} else if (num_matra_parts) { | ||
// A joiner was taken above but the code after it is not a matra. | ||
if (report_errors_) { | ||
tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n", | ||
output_.back(), codes_[codes_used_].second); | ||
} | ||
return false; | ||
} | ||
// A matra piece is accepted only when the previous code was not one too. | ||
// codes_used_ is at least 1 here because the leading consonant was consumed. | ||
// NOTE(review): UnicodeToCharClass in this file never returns kMatraPiece, so | ||
// this branch looks unreachable -- confirm. | ||
if (codes_[codes_used_].first == CharClass::kMatraPiece && | ||
codes_[codes_used_ - 1].first != CharClass::kMatraPiece) { | ||
if (UseMultiCode(1)) return true; | ||
} | ||
// Optional single vowel modifier. | ||
if (codes_[codes_used_].first == CharClass::kVowelModifier) { | ||
if (UseMultiCode(1)) return true; | ||
} | ||
// One optional trailing virama + consonant pair, handled like the loop above. | ||
if (codes_used_ + 1 < num_codes && | ||
codes_[codes_used_].first == CharClass::kVirama && | ||
codes_[codes_used_ + 1].first == CharClass::kConsonant) { | ||
ASSERT_HOST(!CodeOnlyToOutput()); | ||
if (UseMultiCode(2)) return true; | ||
} | ||
return true; | ||
} | ||
|
||
// Maps the Unicode code point ch to the CharClass used by the grapheme
// validator. Vedic accents and the two zero-width joiners are recognised
// first; every other code point is classified by its offset from script_,
// and anything outside the kIndicCodePageSize-wide block is kOther.
Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) {
    return CharClass::kVedicMark;
  }
  if (ch == kZeroWidthNonJoiner) {
    return CharClass::kZeroWidthNonJoiner;
  }
  if (ch == kZeroWidthJoiner) {
    return CharClass::kZeroWidthJoiner;
  }

  // Distance of ch from the start of the script's Unicode code block.
  const int block_offset = ch - static_cast<char32>(script_);
  const bool inside_block =
      block_offset >= 0 && block_offset < kIndicCodePageSize;
  if (!inside_block) {
    // Code points belonging to any other block.
    return CharClass::kOther;
  }

  // The ranges are tested in ascending order, so each branch only sees the
  // offsets that the earlier branches did not claim.
  CharClass char_class = CharClass::kOther;
  if (block_offset <= 0x03) {
    char_class = CharClass::kVowelModifier;
  } else if (block_offset <= 0x32) {
    // This range also covers the independent vowels.
    char_class = CharClass::kConsonant;
  } else if (block_offset == 0x33) {
    // A9B3 CECAK TELU
    char_class = CharClass::kNukta;
  } else if (block_offset == 0x34) {
    // A9B4 TARUNG
    char_class = CharClass::kVowelModifier;
  } else if (block_offset <= 0x3d) {
    char_class = CharClass::kMatra;
  } else if (block_offset <= 0x3f) {
    // A9BE-A9BF PENGKAL-CAKRA
    char_class = CharClass::kVowelModifier;
  } else if (block_offset == 0x40) {
    // A9C0 PANGKON
    char_class = CharClass::kVirama;
  }
  return char_class;
}
|
||
} // namespace tesseract |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
/********************************************************************** | ||
* File: validate_javanese.h | ||
* Description: Text validator for Javanese Script - aksara jawa. | ||
* Author: Shree Devi Kumar | ||
* Created: August 03, 2018 | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
**********************************************************************/ | ||
|
||
#ifndef TESSERACT_TRAINING_VALIDATE_JAVANESE_H_ | ||
#define TESSERACT_TRAINING_VALIDATE_JAVANESE_H_ | ||
|
||
#include "validator.h" | ||
|
||
namespace tesseract { | ||
|
||
// Subclass of Validator that validates and segments Javanese. | ||
class ValidateJavanese : public Validator { | ||
public: | ||
ValidateJavanese(ViramaScript script, bool report_errors) | ||
: Validator(script, report_errors) {} | ||
~ValidateJavanese() {} | ||
|
||
protected: | ||
// Returns whether codes matches the pattern for an Javanese Grapheme. | ||
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to | ||
// parts_ and output_. Returns true if a valid Grapheme was consumed, | ||
// otherwise does not increment codes_used_. | ||
bool ConsumeGraphemeIfValid() override; | ||
// Returns the CharClass corresponding to the given Unicode ch. | ||
CharClass UnicodeToCharClass(char32 ch) const override; | ||
}; | ||
|
||
} // namespace tesseract | ||
|
||
#endif // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters