Add a Compatibility Decomposition Normalizer, remove Latin normalizer #166

Merged · 4 commits · Nov 28, 2022
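This PR replaces the ad-hoc LatinNormalizer with a normalizer that applies Unicode Compatibility Decomposition (NFKD) to every token. As a minimal sketch of what NFKD does (illustrative only, not part of this PR), using the unicode-normalization crate added below:

use unicode_normalization::UnicodeNormalization;

fn main() {
    // U+1E69 (ṩ) decomposes under NFKD into s (U+0073), combining dot
    // below (U+0323), and combining dot above (U+0307).
    let decomposed: String = "ṩ".nfkd().collect();
    assert_eq!(decomposed, "s\u{0323}\u{0307}");

    // NFKD also folds compatibility characters: the fullwidth digit
    // １ (U+FF11) becomes the ASCII digit 1.
    let folded: String = "１".nfkd().collect();
    assert_eq!(folded, "1");
}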
1 change: 1 addition & 0 deletions Cargo.toml
@@ -27,6 +27,7 @@ pinyin = { version = "0.9", default-features = false, features = [
"with_tone",
], optional = true }
wana_kana = { version = "2.1.0", optional = true }
+ unicode-normalization = "0.1.22"

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean"]
150 changes: 150 additions & 0 deletions src/normalizer/compatibility_decomposition.rs
@@ -0,0 +1,150 @@
use std::iter::once;

use super::{CharNormalizer, CharOrStr};
use crate::Token;
use unicode_normalization::{is_nfkd_quick, UnicodeNormalization};

/// A global [`Normalizer`] normalizing to the Unicode Normalization Form KD.
///
/// This Normalizer uses [`unicode-normalization::nfkd`] internally to normalize the provided token.
///
/// The Unicode Normalization Form KD (NFKD) is the Compatibility Decomposition normalization, see
/// <https://www.unicode.org/reports/tr15/#Norm_Forms> for more information.
pub struct CompatibilityDecompositionNormalizer;

impl CharNormalizer for CompatibilityDecompositionNormalizer {
fn normalize_char(&self, c: char) -> Option<CharOrStr> {
let mut normalized = c.nfkd();

// if the original character is converted into exactly one character,
// then we return the character directly instead of creating a string for it.
match (normalized.next(), normalized.next()) {
(Some(c), None) => Some(c.into()),
(Some(first), Some(second)) => {
let normalized: String =
once(first).chain(once(second)).chain(normalized).collect();
Some(normalized.into())
}
(None, _) => None,
}
}

fn should_normalize(&self, token: &Token) -> bool {
!(token.lemma().is_ascii()
|| matches!(
is_nfkd_quick(token.lemma().chars()),
unicode_normalization::IsNormalized::Yes
))
}
}

// Test the normalizer:
#[cfg(test)]
mod test {
use std::borrow::Cow::Owned;

use crate::normalizer::test::test_normalizer;
use crate::normalizer::Normalizer;
use crate::normalizer::NormalizerOption;
use crate::{Language, Script, Token};

// base tokens to normalize.
fn tokens() -> Vec<Token<'static>> {
vec![
Token {
// U+1E69 decomposes to U+0073 U+0323 U+0307
lemma: Owned("ṩ ṩ".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("ガギグゲゴ".to_string()),
char_end: "ガギグゲゴ".chars().count(),
byte_end: "ガギグゲゴ".len(),
script: Script::Cj,
language: Some(Language::Jpn),
..Default::default()
},
]
}

// expected result of the current Normalizer.
fn normalizer_result() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("s\u{0323}\u{0307} s\u{0323}\u{0307}".to_string()),
char_end: 2,
byte_end: 2,
char_map: Some(vec![(3, 5), (1, 1), (3, 5)]),
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("カ\u{3099}キ\u{3099}ク\u{3099}ケ\u{3099}コ\u{3099}".to_string()),
char_end: "ガギグゲゴ".chars().count(),
byte_end: "ガギグゲゴ".len(),
script: Script::Cj,
char_map: Some(vec![
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
]),
language: Some(Language::Jpn),
..Default::default()
},
]
}

// expected result of the complete Normalizer pipeline.
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("s s".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(3, 1), (1, 1), (3, 1)]),
..Default::default()
},
Token {
#[cfg(feature = "japanese-transliteration")]
lemma: Owned("か\u{3099}き\u{3099}く\u{3099}け\u{3099}こ\u{3099}".to_string()),
#[cfg(not(feature = "japanese-transliteration"))]
lemma: Owned("カ\u{3099}キ\u{3099}ク\u{3099}ケ\u{3099}コ\u{3099}".to_string()),
char_end: "ガギグゲゴ".chars().count(),
byte_end: "ガギグゲゴ".len(),
script: Script::Cj,
language: Some(Language::Jpn),
char_map: Some(vec![
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
]),
..Default::default()
},
]
}

test_normalizer!(
CompatibilityDecompositionNormalizer,
tokens(),
normalizer_result(),
normalized_tokens()
);
}
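The two-step peek in normalize_char avoids allocating a String whenever a character is already fully decomposed. A standalone sketch of the same behavior (assuming only the unicode-normalization crate):

use unicode_normalization::UnicodeNormalization;

fn main() {
    // 'A' is already in NFKD form: the iterator yields it back and then
    // ends, so the fast path can return a plain char.
    let mut iter = 'A'.nfkd();
    assert_eq!((iter.next(), iter.next()), (Some('A'), None));

    // 'ガ' (U+30AC) decomposes into 'カ' (U+30AB) plus U+3099, so the
    // slow path collects everything into a String instead.
    let decomposed: String = 'ガ'.nfkd().collect();
    assert_eq!(decomposed, "カ\u{3099}");
}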
19 changes: 15 additions & 4 deletions src/normalizer/japanese.rs
@@ -116,26 +116,37 @@ mod test
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("だめ".to_string()),
lemma: Owned("た\u{3099}め".to_string()),
char_end: 2,
byte_end: 6,
char_map: Some(vec![(3, 6), (3, 3)]),
script: Script::Cj,
language: Some(Language::Jpn),
..Default::default()
},
Token {
lemma: Owned("だめ".to_string()),
lemma: Owned("た\u{3099}め".to_string()),
char_end: 2,
byte_end: 6,
- char_map: Some(vec![(3, 3), (3, 3)]),
+ char_map: Some(vec![(3, 6), (3, 3)]),
script: Script::Cj,
language: Some(Language::Jpn),
..Default::default()
},
Token {
lemma: Owned("だめ駄目だめHi".to_string()),
lemma: Owned("た\u{3099}め駄目た\u{3099}めHi".to_string()),
char_end: 8,
byte_end: 20,
+ char_map: Some(vec![
+ (3, 6),
+ (3, 3),
+ (3, 3),
+ (3, 3),
+ (3, 6),
+ (3, 3),
+ (1, 1),
+ (1, 1),
+ ]),
script: Script::Cj,
language: Some(Language::Jpn),
..Default::default()
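The expected lemmas change because the decomposition step now runs before the Japanese normalizer: だ (U+3060) becomes た (U+305F) followed by the combining voiced sound mark U+3099, which is also why the (3, 3) char_map entries become (3, 6). A quick check (illustrative, assuming the unicode-normalization crate):

use unicode_normalization::UnicodeNormalization;

fn main() {
    // 'だ' (U+3060) decomposes under NFKD into 'た' (U+305F) + U+3099.
    let decomposed: String = 'だ'.nfkd().collect();
    assert_eq!(decomposed, "た\u{3099}");
    // 'た' and U+3099 are each 3 bytes in UTF-8, hence the lemma's byte
    // length grows from 3 to 6 per decomposed character.
    assert_eq!(decomposed.len(), 6);
}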
119 changes: 0 additions & 119 deletions src/normalizer/latin.rs

This file was deleted.

6 changes: 3 additions & 3 deletions src/normalizer/mod.rs
@@ -4,32 +4,32 @@ use once_cell::sync::Lazy;

#[cfg(feature = "chinese")]
pub use self::chinese::ChineseNormalizer;
+ pub use self::compatibility_decomposition::CompatibilityDecompositionNormalizer;
pub use self::control_char::ControlCharNormalizer;
#[cfg(feature = "japanese-transliteration")]
pub use self::japanese::JapaneseNormalizer;
- pub use self::latin::LatinNormalizer;
pub use self::lowercase::LowercaseNormalizer;
use crate::normalizer::nonspacing_mark::NonspacingMarkNormalizer;
use crate::Token;

#[cfg(feature = "chinese")]
mod chinese;
+ mod compatibility_decomposition;
mod control_char;
#[cfg(feature = "japanese-transliteration")]
mod japanese;
- mod latin;
mod lowercase;
mod nonspacing_mark;

/// List of [`Normalizer`]s used by [`Normalize::normalize`].
pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
vec![
+ Box::new(CompatibilityDecompositionNormalizer),
Box::new(LowercaseNormalizer),
#[cfg(feature = "chinese")]
Box::new(ChineseNormalizer),
#[cfg(feature = "japanese-transliteration")]
Box::new(JapaneseNormalizer),
- Box::new(LatinNormalizer),
Box::new(ControlCharNormalizer),
Box::new(NonspacingMarkNormalizer),
]
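Running CompatibilityDecompositionNormalizer first means every later normalizer sees fully decomposed text; together with NonspacingMarkNormalizer, this roughly reproduces what the removed LatinNormalizer did for accented Latin characters. A rough illustration (not the actual pipeline code; the hard-coded mark filter is a simplification):

use unicode_normalization::UnicodeNormalization;

fn main() {
    // 'é' (U+00E9) decomposes into 'e' plus U+0301 (combining acute).
    let decomposed: String = "é".nfkd().collect();
    assert_eq!(decomposed, "e\u{0301}");

    // Stripping the nonspacing mark afterwards leaves plain ASCII.
    let stripped: String = decomposed.chars().filter(|c| *c != '\u{0301}').collect();
    assert_eq!(stripped, "e");
}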
4 changes: 2 additions & 2 deletions src/normalizer/nonspacing_mark.rs
@@ -26,7 +26,7 @@ impl CharNormalizer for NonspacingMarkNormalizer {
}

fn should_normalize(&self, token: &Token) -> bool {
- matches!(token.script, Script::Hebrew | Script::Thai | Script::Arabic)
+ matches!(token.script, Script::Hebrew | Script::Thai | Script::Arabic | Script::Latin)
&& token.lemma().chars().any(is_nonspacing_mark)
}
}
@@ -112,7 +112,7 @@ mod test {
..Default::default()
},
Token {
lemma: Owned("أب".to_string()),
lemma: Owned("اب".to_string()),
char_end: "أَب".chars().count(),
byte_end: "أَب".len(),
char_map: Some(vec![(2, 2), (2, 0), (2, 2)]),
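The Arabic expectation changes for the same reason: أ (U+0623) decomposes under NFKD into the bare alef ا (U+0627) plus the nonspacing hamza above (U+0654), which this normalizer then strips, leaving the bare اب. A minimal check (assuming the unicode-normalization crate):

use unicode_normalization::UnicodeNormalization;

fn main() {
    // 'أ' (U+0623) decomposes into 'ا' (U+0627) + hamza above (U+0654).
    let decomposed: String = 'أ'.nfkd().collect();
    assert_eq!(decomposed, "\u{627}\u{654}");
}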
4 changes: 2 additions & 2 deletions src/segmenter/chinese.rs
@@ -67,7 +67,7 @@ mod test {
const TOKENIZED: &[&str] = &[
"rénrén",
"shēngérzìyóu",
"",
",",
"zài",
"zūn",
"yán",
Expand All @@ -84,7 +84,7 @@ mod test {
"lǐxìng",
"hé",
"liángxīn",
"",
",",
"bìng",
"yīng",
"yǐ",