
Commit

[MINOR] Added Tibetan language (#67)
azagniotov authored Feb 26, 2025
1 parent 51a7a76 commit 202ce83
Showing 7 changed files with 28 additions and 10 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -39,7 +39,7 @@ This is a refined and re-implemented version of the archived plugin for ElasticS

The library leverages an n-gram probabilistic model, utilizing n-grams of sizes ranging from 1 to 3, alongside a Bayesian filter that incorporates various normalization techniques and feature sampling methods.

The precision is over **99%** for **69** languages. See the following PR description to read about the benchmarks done by @yanirs: https://github.com/jprante/elasticsearch-langdetect/pull/69
The precision is over **99%** for **70** languages. See the following PR description to read about the benchmarks done by @yanirs: https://github.com/jprante/elasticsearch-langdetect/pull/69
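
The n-gram extraction itself is not shown in this diff, but as a rough, hypothetical sketch (not this library's implementation), collecting the character n-grams of sizes 1 to 3 that such a model consumes could look like this:

```java
import java.util.ArrayList;
import java.util.List;

// Hypothetical sketch only -- not code from this repository.
// Collects all character n-grams of sizes minGram..maxGram from a string.
final class NGramSketch {
    static List<String> extract(final String text, final int minGram, final int maxGram) {
        final List<String> grams = new ArrayList<>();
        for (int size = minGram; size <= maxGram; size++) {
            for (int start = 0; start + size <= text.length(); start++) {
                grams.add(text.substring(start, start + size));
            }
        }
        return grams;
    }

    public static void main(final String[] args) {
        // "abc" with sizes 1..3 yields [a, b, c, ab, bc, abc]
        System.out.println(extract("abc", 1, 3));
    }
}
```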

### Enhancements over past implementations

@@ -121,6 +121,7 @@ The following is a list of ISO 639-1 language codes supported by the library:
| Tamil | ta |
| Telugu | te |
| Thai | th |
| Tibetan | bo |
| Tigrinya | ti |
| Turkish | tr |
| Ukrainian | uk |
2 changes: 1 addition & 1 deletion latest-version.txt
@@ -1 +1 @@
5.0.0
5.1.0
@@ -9,7 +9,7 @@
public class LanguageDetectionSettings {

private static final String ALL_SUPPORTED_ISO_CODES_639_1 =
"af,am,ar,az,bg,bn,br,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fr,ga,gu,he,hi,hr,hu,hy,id,it,ja,ka,kk,kn,ko,ky,lb,lt,lv,mk,ml,mn,mr,ne,nl,no,pa,pl,pt,"
"af,am,ar,az,bg,bn,bo,br,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fr,ga,gu,he,hi,hr,hu,hy,id,it,ja,ka,kk,kn,ko,ky,lb,lt,lv,mk,ml,mn,mr,ne,nl,no,pa,pl,pt,"
+ "ro,ru,si,sk,sl,so,sq,sv,sw,ta,te,th,ti,tl,tr,uk,ur,vi,yi,zh-cn,zh-tw";

static final LanguageDetectionSettings DEFAULT_SETTINGS_ALL_LANGUAGES =
1 change: 1 addition & 0 deletions src/main/resources/langdetect/merged-average/bo

Large diffs are not rendered by default.

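Pulling the pieces of this commit together, end-to-end usage looks roughly like the sketch below, assembled from the test changes later in this diff. The way a LanguageDetector is obtained from the factory is not shown in these hunks, so that accessor is an assumption:

```java
// Sketch assembled from the test code in this commit; getDetector() is an
// assumed accessor -- the diff does not show how the detector is obtained
// from the factory.
final LanguageDetectionSettings settings =
    LanguageDetectionSettings.fromIsoCodes639_1("bo,mn,ti").build();
final LanguageDetectorFactory factory = new LanguageDetectorFactory(settings);
final LanguageDetector detector = factory.getDetector(); // assumption
// With the new merged-average/bo profile, Tibetan input should resolve to "bo":
detector.detectAll("ངས་བོད་ཡིག་སྦྱོང་གི་ཡོད།").get(0).getIsoCode639_1();
```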
@@ -115,7 +115,7 @@ public void shouldDetectJapaneseDataset() throws Exception {
public void languageDetectorShortStrings() throws Exception {
final LanguageDetectionSettings supportedLanguages =
LanguageDetectionSettings.fromIsoCodes639_1(
"az,am,br,cy,de,eu,ga,he,hy,ka,kk,ky,lb,mn,ti,yi")
"az,am,bo,br,cy,de,eu,ga,he,hy,ka,kk,ky,lb,mn,ti,yi")
.build();
final LanguageDetectorFactory factory = new LanguageDetectorFactory(supportedLanguages);
final LanguageDetector detector =
@@ -155,6 +155,8 @@ public void languageDetectorShortStrings() throws Exception {
assertEquals("lb", detector.detectAll("Ech léiere Lëtzebuergesch").get(0).getIsoCode639_1());
// Mongolian
assertEquals("mn", detector.detectAll("Би монгол хэл сурч байна").get(0).getIsoCode639_1());
// Tibetan
assertEquals("bo", detector.detectAll("ངས་བོད་ཡིག་སྦྱོང་གི་ཡོད།").get(0).getIsoCode639_1());
// Tigrinya
assertEquals("ti", detector.detectAll("ትግርኛ ይመሃር ኣለኹ").get(0).getIsoCode639_1());
// Welsh
@@ -243,6 +245,11 @@ public void testMongolian() throws Exception {
testLanguage("mongolian.txt", "mn", DEFAULT_DETECTOR);
}

@Test
public void testTibetan() throws Exception {
testLanguage("tibetan.txt", "bo", DEFAULT_DETECTOR);
}

@Test
public void testTigrinya() throws Exception {
testLanguage("tigrinya.txt", "ti", DEFAULT_DETECTOR);
@@ -50,6 +50,7 @@ public class LanguageProfileGenerator {
"[\\p{IsHiragana}\\p{IsKatakana}\\p{IsHan}\\p{IsHangul}\\d\\x{FF10}-\\x{FF19}\\x{FF21}-\\x{FF3A}\\x{FF41}-\\x{FF5A}]");

private static final Pattern PATTERN_MATCH_NUMERICS = Pattern.compile("[\\p{IsDigit}]");
private static final Pattern PATTERN_MULTIPLE_SPACES = Pattern.compile("\\s+");

// Regex to match both opening and closing WikiExtractor <doc> tags, including attributes
private static final Pattern PATTERN_MATCH_WIKI_EXTRACTOR_TAGS =
@@ -59,7 +60,7 @@ public class LanguageProfileGenerator {
@Ignore
public void generateProfiles() throws Exception {
TreeSet<String> targetCodes =
new TreeSet<>(Set.of("am,az,br,cy,eu,ga,hy,ka,kk,ky,mn,ti,yi".split(",")));
new TreeSet<>(Set.of("am,az,bo,br,cy,eu,ga,hy,ka,kk,ky,mn,ti,yi".split(",")));
System.out.println(
"\nWill generate: ["
+ targetCodes.size()
@@ -147,7 +148,8 @@ public void updateProfileInChunks(final File file, final LanguageProfile languag
}

final String sanitizedWithoutDigits = sanitize(PATTERN_MATCH_NUMERICS, sanitized);
languageProfile.update(sanitizedWithoutDigits, MIN_GRAM_SIZE, MAX_GRAM_SIZE);
final Matcher matcher = PATTERN_MULTIPLE_SPACES.matcher(sanitizedWithoutDigits);
languageProfile.update(matcher.replaceAll(" "), MIN_GRAM_SIZE, MAX_GRAM_SIZE);
}
}
}
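
The new PATTERN_MULTIPLE_SPACES pass collapses runs of whitespace to a single space before the profile is updated. In isolation, the replacement behaves like this (plain java.util.regex, independent of the generator):

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Standalone illustration of the whitespace-collapsing step added above.
public class CollapseSpacesSketch {
    public static void main(final String[] args) {
        final Pattern multipleSpaces = Pattern.compile("\\s+");
        final Matcher matcher = multipleSpaces.matcher("abcABCDE   ä, ö,\t ü");
        // Runs of spaces/tabs become a single space: "abcABCDE ä, ö, ü"
        System.out.println(matcher.replaceAll(" "));
    }
}
```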
@@ -186,11 +188,14 @@ private String sanitize(final Pattern pattern, final String input) {
public final void sanityCheckFilteringRegex() {
final String input =
"<doc id=\"599\" url=\"https://mn.wikipedia.org/wiki?curid=599\" title=\"Монгол Улс\">"
+ "これはテスト123abcABCDEアイウエオこんにちは123ABCDE ä, ö, ü, and ß á, à, è, é, û, ù"
+ " "
+ ""
+ "これはテスト123abcABCDEアイウエオこんにちは123ABCDE ä, ö, ü, and ß á, à, è, é, û, ù"
+ "</doc>";

final String noWikiTags = sanitize(PATTERN_MATCH_WIKI_EXTRACTOR_TAGS, input);
assertEquals(noWikiTags, "これはテスト123abcABCDEアイウエオこんにちは123ABCDE ä, ö, ü, and ß á, à, è, é, û, ù");
assertEquals(
noWikiTags, "これはテスト123abcABCDEアイウエオこんにちは123ABCDE ä, ö, ü, and ß á, à, è, é, û, ù");

final String onlyEastAsian = sanitize(PATTERN_MATCH_ANYTHING_NON_EAST_ASIAN, noWikiTags);
assertEquals(onlyEastAsian, "これはテスト123アイウエオこんにちは123ABCDE");
@@ -199,9 +204,12 @@ public final void sanityCheckFilteringRegex() {
assertEquals(onlyEastAsianWithoutDigits, "これはテストアイウエオこんにちはABCDE");

final String withoutEastAsian = sanitize(PATTERN_MATCH_SPECIFIC_EAST_ASIAN, noWikiTags);
assertEquals(withoutEastAsian, "abcABCDE ä, ö, ü, and ß á, à, è, é, û, ù");
assertEquals(withoutEastAsian, "abcABCDE ä, ö, ü, and ß á, à, è, é, û, ù");

final String withoutEastAsianAndDigits = sanitize(PATTERN_MATCH_NUMERICS, withoutEastAsian);
assertEquals(withoutEastAsianAndDigits, "abcABCDE ä, ö, ü, and ß á, à, è, é, û, ù");
assertEquals(withoutEastAsianAndDigits, "abcABCDE ä, ö, ü, and ß á, à, è, é, û, ù");

final Matcher matcher = PATTERN_MULTIPLE_SPACES.matcher(withoutEastAsianAndDigits);
assertEquals(matcher.replaceAll(" "), "abcABCDE ä, ö, ü, and ß á, à, è, é, û, ù");
}
}
1 change: 1 addition & 0 deletions src/test/resources/tibetan.txt
@@ -0,0 +1 @@
ཝི་ཀི་རིག་མཛོད་ནི་རང་དབང་གི་རིག་གནས་ཀུན་བཏུས་དཔེ་མཛོད་ཅིག་ཡིན་པས། འདི་རུ་མི་སུས་ཀྱང་རྩོམ་སྒྲིག་གནང་ཆོག ། གལ་ཏེ་ཁྱེད་ཀྱིས་བོད་ཡིག་མཁྱེན་ན་ལམ་སེང་རྩོམ་ཡིག་སྤེལ་ཆོག་པ་མ་ཟད་ནང་དུ་ཉར་ཟིན་པའི་རྩོམ་ཡིག་ཐམས་ཅད་བོད་ཡིག་གཉེར་མཁན་ཚང་མར་ཕན་པས་ཝི་ཀི་བོད་འགྱུར་མའི་ནང་དུ་ཧུར་བརྩོན་གྱིས་རྩོམ་ཡིག་ཁ་སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །
