withoutboats · pickfire · Jan 5, 2021
diff --git a/src/kebab.rs b/src/kebab.rs
@@ -46,4 +46,13 @@ mod tests {
     t!(test8: "this-contains_ ALLKinds OfWord_Boundaries" => "this-contains-all-kinds-of-word-boundaries");
     t!(test9: "XΣXΣ baﬄe" => "xσxς-baﬄe");
     t!(test10: "XMLHttpRequest" => "xml-http-request");
+    t!(test11: "ファイルを読み込み" => "ファイルを読み込み");
+    t!(test12: "お前はもう死んでいる！何？" => "お前はもう死んでいる何");
+    t!(test13: "石室诗士施氏，嗜狮，誓食十狮。" => "石室诗士施氏嗜狮誓食十狮");
+    t!(test14: "石室詩士施氏，嗜獅，誓食十獅。" => "石室詩士施氏嗜獅誓食十獅");
+    t!(test15: "ㄕˊㄕˋㄕㄕˋㄕㄕˋ，ㄕˋㄕ，ㄕˋㄕˊㄕˊㄕ。" => "ㄕˊㄕˋㄕㄕˋㄕㄕˋㄕˋㄕㄕˋㄕˊㄕˊㄕ");
+    t!(test16: "shí shì shī shì shī shì ， shì shī ， shì shí shí shī 。" => "shí-shì-shī-shì-shī-shì-shì-shī-shì-shí-shí-shī");
+    t!(test17: "sek6 sat1 si1 si6 si1 si6 ，si3 si1 ，sai6 sik6 sap6 si1 。" => "sek6-sat1-si1-si6-si1-si6-si3-si1-sai6-sik6-sap6-si1");
+    t!(test18: "唱K" => "唱-k"); // this is incorrect but it doesn't matter
+    t!(test19: "YouTube" => "you-tube"); // this is incorrect but it doesn't matter
 }
diff --git a/src/lib.rs b/src/lib.rs
@@ -114,7 +114,7 @@ where
                 // Word boundary after if next is underscore or current is
                 // not uppercase and next is uppercase
                 if next == '_' || (next_mode == WordMode::Lowercase && next.is_uppercase()) {
-                    if !first_word {
+                    if !first_word && !continous_script(&word[init..]) {
                         boundary(&mut out);
                     }
                     with_word(&word[init..next_i], &mut out);
@@ -126,7 +126,9 @@ where
                 // is lowercase, word boundary before
                 } else if mode == WordMode::Uppercase && c.is_uppercase() && next.is_lowercase() {
                     if !first_word {
-                        boundary(&mut out);
+                        if !continous_script(&word[init..]) {
+                            boundary(&mut out);
+                        }
                     } else {
                         first_word = false;
                     }
@@ -141,7 +143,9 @@ where
             } else {
                 // Collect trailing characters as a word
                 if !first_word {
-                    boundary(&mut out);
+                    if !continous_script(&word[init..]) {
+                        boundary(&mut out);
+                    }
                 } else {
                     first_word = false;
                 }
@@ -154,6 +158,52 @@ where
     out
 }
 
+/// Check whether the first character of `word` belongs to a continuous script,
+/// i.e. one written without word separators and without the concept of casing.
+///
+/// Based on the cjk crate, cjk-regex (js) and Wikipedia.
+fn continous_script(word: &str) -> bool {
+    // Only the first character is inspected; an empty `word` falls through to `false`.
+    let first_char = word.chars().next();
+    #[allow(clippy::match_like_matches_macro)]
+    match first_char {
+        // Bopomofo
+        Some('\u{3100}'..='\u{312F}') => true,
+        // CJK Unified Ideographs
+        Some('\u{4E00}'..='\u{9FFF}') => true,
+        // CJK Unified Ideographs Extension A (NOTE(review): block now ends at U+4DBF, confirm)
+        Some('\u{3400}'..='\u{4DB5}') => true,
+        // CJK Unified Ideographs Extension B
+        Some('\u{20000}'..='\u{2A6DF}') => true,
+        // CJK Unified Ideographs Extension C
+        Some('\u{2A700}'..='\u{2B73F}') => true,
+        // CJK Unified Ideographs Extension D
+        Some('\u{2B740}'..='\u{2B81F}') => true,
+        // CJK Unified Ideographs Extension E
+        Some('\u{2B820}'..='\u{2CEAF}') => true,
+        // CJK Unified Ideographs Extension F
+        Some('\u{2CEB0}'..='\u{2EBEF}') => true,
+        // CJK Unified Ideographs Extension G
+        Some('\u{30000}'..='\u{3134F}') => true,
+        // CJK Symbols and Punctuation
+        Some('\u{3000}'..='\u{303F}') => true,
+        // CJK Compatibility Ideographs
+        Some('\u{F900}'..='\u{FAFF}') => true,
+        // CJK Compatibility Ideographs Supplement
+        Some('\u{2F800}'..='\u{2FA1F}') => true,
+        // Hiragana
+        Some('\u{3040}'..='\u{309F}') => true,
+        // Katakana
+        Some('\u{30A0}'..='\u{30FF}') => true,
+        // Thai
+        Some('\u{0E00}'..='\u{0E7F}') => true,
+        // Latin and every other script (and empty input) are not treated as continuous.
+        // Kanbun marks are used for ordering, so a separator should still be emitted.
+        // Korean uses spaces in modern writing.
+        _ => false,
+    }
+}
+
 fn lowercase(s: &str, out: &mut String) {
     let mut chars = s.chars().peekable();
     while let Some(c) = chars.next() {