
Commit

[MINOR] Added Tibetan language (#67)
azagniotov authored Feb 26, 2025
1 parent 51a7a76 commit 202ce83
Showing 7 changed files with 28 additions and 10 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -39,7 +39,7 @@ This is a refined and re-implemented version of the archived plugin for ElasticS

The library leverages an n-gram probabilistic model, utilizing n-grams of sizes ranging from 1 to 3, alongside a Bayesian filter that incorporates various normalization techniques and feature sampling methods.

The precision is over **99%** for **69** languages. See the following PR description to read about the benchmarks done by @yanirs: https://github.com/jprante/elasticsearch-langdetect/pull/69
The precision is over **99%** for **70** languages. See the following PR description to read about the benchmarks done by @yanirs: https://github.com/jprante/elasticsearch-langdetect/pull/69
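
The n-gram extraction itself is not shown in this diff, but as a rough, hypothetical sketch (not this library's implementation), collecting the character n-grams of sizes 1 to 3 that such a model consumes could look like this:

```java
import java.util.ArrayList;
import java.util.List;

// Hypothetical sketch only -- not code from this repository.
// Collects all character n-grams of sizes minGram..maxGram from a string.
final class NGramSketch {
    static List<String> extract(final String text, final int minGram, final int maxGram) {
        final List<String> grams = new ArrayList<>();
        for (int size = minGram; size <= maxGram; size++) {
            for (int start = 0; start + size <= text.length(); start++) {
                grams.add(text.substring(start, start + size));
            }
        }
        return grams;
    }

    public static void main(final String[] args) {
        // "abc" with sizes 1..3 yields [a, b, c, ab, bc, abc]
        System.out.println(extract("abc", 1, 3));
    }
}
```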

### Enhancements over past implementations

@@ -121,6 +121,7 @@ The following is a list of ISO 639-1 language codes supported by the library:
| Tamil | ta |
| Telugu | te |
| Thai | th |
| Tibetan | bo |
| Tigrinya | ti |
| Turkish | tr |
| Ukrainian | uk |
2 changes: 1 addition & 1 deletion latest-version.txt
@@ -1 +1 @@
5.0.0
5.1.0
@@ -9,7 +9,7 @@
public class LanguageDetectionSettings {

private static final String ALL_SUPPORTED_ISO_CODES_639_1 =
"af,am,ar,az,bg,bn,br,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fr,ga,gu,he,hi,hr,hu,hy,id,it,ja,ka,kk,kn,ko,ky,lb,lt,lv,mk,ml,mn,mr,ne,nl,no,pa,pl,pt,"
"af,am,ar,az,bg,bn,bo,br,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fr,ga,gu,he,hi,hr,hu,hy,id,it,ja,ka,kk,kn,ko,ky,lb,lt,lv,mk,ml,mn,mr,ne,nl,no,pa,pl,pt,"
+ "ro,ru,si,sk,sl,so,sq,sv,sw,ta,te,th,ti,tl,tr,uk,ur,vi,yi,zh-cn,zh-tw";

static final LanguageDetectionSettings DEFAULT_SETTINGS_ALL_LANGUAGES =
1 change: 1 addition & 0 deletions src/main/resources/langdetect/merged-average/bo

Large diffs are not rendered by default.

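Pulling the pieces of this commit together, end-to-end usage looks roughly like the sketch below, assembled from the test changes later in this diff. The way a LanguageDetector is obtained from the factory is not shown in these hunks, so that accessor is an assumption:

```java
// Sketch assembled from the test code in this commit; getDetector() is an
// assumed accessor -- the diff does not show how the detector is obtained
// from the factory.
final LanguageDetectionSettings settings =
    LanguageDetectionSettings.fromIsoCodes639_1("bo,mn,ti").build();
final LanguageDetectorFactory factory = new LanguageDetectorFactory(settings);
final LanguageDetector detector = factory.getDetector(); // assumption
// With the new merged-average/bo profile, Tibetan input should resolve to "bo":
detector.detectAll("ངས་བོད་ཡིག་སྦྱོང་གི་ཡོད།").get(0).getIsoCode639_1();
```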
@@ -115,7 +115,7 @@ public void shouldDetectJapaneseDataset() throws Exception {
public void languageDetectorShortStrings() throws Exception {
final LanguageDetectionSettings supportedLanguages =
LanguageDetectionSettings.fromIsoCodes639_1(
"az,am,br,cy,de,eu,ga,he,hy,ka,kk,ky,lb,mn,ti,yi")
"az,am,bo,br,cy,de,eu,ga,he,hy,ka,kk,ky,lb,mn,ti,yi")
.build();
final LanguageDetectorFactory factory = new LanguageDetectorFactory(supportedLanguages);
final LanguageDetector detector =
@@ -155,6 +155,8 @@ public void languageDetectorShortStrings() throws Exception {
assertEquals("lb", detector.detectAll("Ech léiere Lëtzebuergesch").get(0).getIsoCode639_1());
// Mongolian
assertEquals("mn", detector.detectAll("Би монгол хэл сурч байна").get(0).getIsoCode639_1());
// Tibetan
assertEquals("bo", detector.detectAll("ངས་བོད་ཡིག་སྦྱོང་གི་ཡོད།").get(0).getIsoCode639_1());
// Tigrinya
assertEquals("ti", detector.detectAll("ትግርኛ ይመሃር ኣለኹ").get(0).getIsoCode639_1());
// Welsh
@@ -243,6 +245,11 @@ public void testMongolian() throws Exception {
testLanguage("mongolian.txt", "mn", DEFAULT_DETECTOR);
}

@Test
public void testTibetan() throws Exception {
testLanguage("tibetan.txt", "bo", DEFAULT_DETECTOR);
}

@Test
public void testTigrinya() throws Exception {
testLanguage("tigrinya.txt", "ti", DEFAULT_DETECTOR);
@@ -50,6 +50,7 @@ public class LanguageProfileGenerator {
"[\\p{IsHiragana}\\p{IsKatakana}\\p{IsHan}\\p{IsHangul}\\d\\x{FF10}-\\x{FF19}\\x{FF21}-\\x{FF3A}\\x{FF41}-\\x{FF5A}]");

private static final Pattern PATTERN_MATCH_NUMERICS = Pattern.compile("[\\p{IsDigit}]");
private static final Pattern PATTERN_MULTIPLE_SPACES = Pattern.compile("\\s+");

// Regex to match both opening and closing WikiExtractor <doc> tags, including attributes
private static final Pattern PATTERN_MATCH_WIKI_EXTRACTOR_TAGS =
@@ -59,7 +60,7 @@ public class LanguageProfileGenerator {
@Ignore
public void generateProfiles() throws Exception {
TreeSet<String> targetCodes =
new TreeSet<>(Set.of("am,az,br,cy,eu,ga,hy,ka,kk,ky,mn,ti,yi".split(",")));
new TreeSet<>(Set.of("am,az,bo,br,cy,eu,ga,hy,ka,kk,ky,mn,ti,yi".split(",")));
System.out.println(
"\nWill generate: ["
+ targetCodes.size()
@@ -147,7 +148,8 @@ public void updateProfileInChunks(final File file, final LanguageProfile languag
}

final String sanitizedWithoutDigits = sanitize(PATTERN_MATCH_NUMERICS, sanitized);
languageProfile.update(sanitizedWithoutDigits, MIN_GRAM_SIZE, MAX_GRAM_SIZE);
final Matcher matcher = PATTERN_MULTIPLE_SPACES.matcher(sanitizedWithoutDigits);
languageProfile.update(matcher.replaceAll(" "), MIN_GRAM_SIZE, MAX_GRAM_SIZE);
}
}
}
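
The new PATTERN_MULTIPLE_SPACES pass collapses runs of whitespace to a single space before the profile is updated. In isolation, the replacement behaves like this (plain java.util.regex, independent of the generator):

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Standalone illustration of the whitespace-collapsing step added above.
public class CollapseSpacesSketch {
    public static void main(final String[] args) {
        final Pattern multipleSpaces = Pattern.compile("\\s+");
        final Matcher matcher = multipleSpaces.matcher("abcABCDE   ä, ö,\t ü");
        // Runs of spaces/tabs become a single space: "abcABCDE ä, ö, ü"
        System.out.println(matcher.replaceAll(" "));
    }
}
```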
@@ -186,11 +188,14 @@ private String sanitize(final Pattern pattern, final String input) {
public final void sanityCheckFilteringRegex() {
final String input =
"<doc id=\"599\" url=\"https://mn.wikipedia.org/wiki?curid=599\" title=\"Монгол Улс\">"
+ "これはテスト123abcABCDEアイウエオこんにちは123ABCDE ä, ö, ü, and ß á, à, è, é, û, ù"
+ " "
+ ""
+ "これはテスト123abcABCDEアイウエオこんにちは123ABCDE ä, ö, ü, and ß á, à, è, é, û, ù"
+ "</doc>";

final String noWikiTags = sanitize(PATTERN_MATCH_WIKI_EXTRACTOR_TAGS, input);
assertEquals(noWikiTags, "これはテスト123abcABCDEアイウエオこんにちは123ABCDE ä, ö, ü, and ß á, à, è, é, û, ù");
assertEquals(
noWikiTags, "これはテスト123abcABCDEアイウエオこんにちは123ABCDE ä, ö, ü, and ß á, à, è, é, û, ù");

final String onlyEastAsian = sanitize(PATTERN_MATCH_ANYTHING_NON_EAST_ASIAN, noWikiTags);
assertEquals(onlyEastAsian, "これはテスト123アイウエオこんにちは123ABCDE");
@@ -199,9 +204,12 @@ public final void sanityCheckFilteringRegex() {
assertEquals(onlyEastAsianWithoutDigits, "これはテストアイウエオこんにちはABCDE");

final String withoutEastAsian = sanitize(PATTERN_MATCH_SPECIFIC_EAST_ASIAN, noWikiTags);
assertEquals(withoutEastAsian, "abcABCDE ä, ö, ü, and ß á, à, è, é, û, ù");
assertEquals(withoutEastAsian, "abcABCDE ä, ö, ü, and ß á, à, è, é, û, ù");

final String withoutEastAsianAndDigits = sanitize(PATTERN_MATCH_NUMERICS, withoutEastAsian);
assertEquals(withoutEastAsianAndDigits, "abcABCDE ä, ö, ü, and ß á, à, è, é, û, ù");
assertEquals(withoutEastAsianAndDigits, "abcABCDE ä, ö, ü, and ß á, à, è, é, û, ù");

final Matcher matcher = PATTERN_MULTIPLE_SPACES.matcher(withoutEastAsianAndDigits);
assertEquals(matcher.replaceAll(" "), "abcABCDE ä, ö, ü, and ß á, à, è, é, û, ù");
}
}
1 change: 1 addition & 0 deletions src/test/resources/tibetan.txt
@@ -0,0 +1 @@
ཝི་ཀི་རིག་མཛོད་ནི་རང་དབང་གི་རིག་གནས་ཀུན་བཏུས་དཔེ་མཛོད་ཅིག་ཡིན་པས། འདི་རུ་མི་སུས་ཀྱང་རྩོམ་སྒྲིག་གནང་ཆོག ། གལ་ཏེ་ཁྱེད་ཀྱིས་བོད་ཡིག་མཁྱེན་ན་ལམ་སེང་རྩོམ་ཡིག་སྤེལ་ཆོག་པ་མ་ཟད་ནང་དུ་ཉར་ཟིན་པའི་རྩོམ་ཡིག་ཐམས་ཅད་བོད་ཡིག་གཉེར་མཁན་ཚང་མར་ཕན་པས་ཝི་ཀི་བོད་འགྱུར་མའི་ནང་དུ་ཧུར་བརྩོན་གྱིས་རྩོམ་ཡིག་ཁ་སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །
