Add a Compatibility Decomposition Normalizer, remove Latin normalizer #166

Merged · 4 commits · Nov 28, 2022
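This PR replaces the ad-hoc LatinNormalizer with a normalizer that applies Unicode Compatibility Decomposition (NFKD) to every token. As a minimal sketch of what NFKD does (illustrative only, not part of this PR), using the unicode-normalization crate added below:

use unicode_normalization::UnicodeNormalization;

fn main() {
    // U+1E69 (ṩ) decomposes under NFKD into s (U+0073), combining dot
    // below (U+0323), and combining dot above (U+0307).
    let decomposed: String = "ṩ".nfkd().collect();
    assert_eq!(decomposed, "s\u{0323}\u{0307}");

    // NFKD also folds compatibility characters: the fullwidth digit
    // １ (U+FF11) becomes the ASCII digit 1.
    let folded: String = "１".nfkd().collect();
    assert_eq!(folded, "1");
}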
1 change: 1 addition & 0 deletions Cargo.toml
@@ -27,6 +27,7 @@ pinyin = { version = "0.9", default-features = false, features = [
"with_tone",
], optional = true }
wana_kana = { version = "2.1.0", optional = true }
+ unicode-normalization = "0.1.22"

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean"]
150 changes: 150 additions & 0 deletions src/normalizer/compatibility_decomposition.rs
@@ -0,0 +1,150 @@
use std::iter::once;

use super::{CharNormalizer, CharOrStr};
use crate::Token;
use unicode_normalization::{is_nfkd_quick, UnicodeNormalization};

/// A global [`Normalizer`] normalizing to the Unicode Normalization Form KD.
///
/// This Normalizer uses [`unicode-normalization::nfkd`] internally to normalize the provided token.
///
/// The Unicode Normalization Form KD (NFKD) is the Compatibility Decomposition normalization, see
/// <https://www.unicode.org/reports/tr15/#Norm_Forms> for more information.
pub struct CompatibilityDecompositionNormalizer;

impl CharNormalizer for CompatibilityDecompositionNormalizer {
fn normalize_char(&self, c: char) -> Option<CharOrStr> {
let mut normalized = c.nfkd();

// if the original character is converted into exactly one character,
// then we return the character directly instead of creating a string for it.
match (normalized.next(), normalized.next()) {
(Some(c), None) => Some(c.into()),
(Some(first), Some(second)) => {
let normalized: String =
once(first).chain(once(second)).chain(normalized).collect();
Some(normalized.into())
}
(None, _) => None,
}
}

fn should_normalize(&self, token: &Token) -> bool {
!(token.lemma().is_ascii()
|| matches!(
is_nfkd_quick(token.lemma().chars()),
unicode_normalization::IsNormalized::Yes
))
}
}

// Test the normalizer:
#[cfg(test)]
mod test {
use std::borrow::Cow::Owned;

use crate::normalizer::test::test_normalizer;
use crate::normalizer::Normalizer;
use crate::normalizer::NormalizerOption;
use crate::{Language, Script, Token};

// base tokens to normalize.
fn tokens() -> Vec<Token<'static>> {
vec![
Token {
// U+1E69 decomposes to U+0073 U+0323 U+0307
lemma: Owned("ṩ ṩ".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("ガギグゲゴ".to_string()),
char_end: "ガギグゲゴ".chars().count(),
byte_end: "ガギグゲゴ".len(),
script: Script::Cj,
language: Some(Language::Jpn),
..Default::default()
},
]
}

// expected result of the current Normalizer.
fn normalizer_result() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("s\u{0323}\u{0307} s\u{0323}\u{0307}".to_string()),
char_end: 2,
byte_end: 2,
char_map: Some(vec![(3, 5), (1, 1), (3, 5)]),
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("カ\u{3099}キ\u{3099}ク\u{3099}ケ\u{3099}コ\u{3099}".to_string()),
char_end: "ガギグゲゴ".chars().count(),
byte_end: "ガギグゲゴ".len(),
script: Script::Cj,
char_map: Some(vec![
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
]),
language: Some(Language::Jpn),
..Default::default()
},
]
}

// expected result of the complete Normalizer pipeline.
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("s s".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(3, 1), (1, 1), (3, 1)]),
..Default::default()
},
Token {
#[cfg(feature = "japanese-transliteration")]
lemma: Owned("か\u{3099}き\u{3099}く\u{3099}け\u{3099}こ\u{3099}".to_string()),
#[cfg(not(feature = "japanese-transliteration"))]
lemma: Owned("カ\u{3099}キ\u{3099}ク\u{3099}ケ\u{3099}コ\u{3099}".to_string()),
char_end: "ガギグゲゴ".chars().count(),
byte_end: "ガギグゲゴ".len(),
script: Script::Cj,
language: Some(Language::Jpn),
char_map: Some(vec![
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
]),
..Default::default()
},
]
}

test_normalizer!(
CompatibilityDecompositionNormalizer,
tokens(),
normalizer_result(),
normalized_tokens()
);
}
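The two-step peek in normalize_char avoids allocating a String whenever a character is already fully decomposed. A standalone sketch of the same behavior (assuming only the unicode-normalization crate):

use unicode_normalization::UnicodeNormalization;

fn main() {
    // 'A' is already in NFKD form: the iterator yields it back and then
    // ends, so the fast path can return a plain char.
    let mut iter = 'A'.nfkd();
    assert_eq!((iter.next(), iter.next()), (Some('A'), None));

    // 'ガ' (U+30AC) decomposes into 'カ' (U+30AB) plus U+3099, so the
    // slow path collects everything into a String instead.
    let decomposed: String = 'ガ'.nfkd().collect();
    assert_eq!(decomposed, "カ\u{3099}");
}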
19 changes: 15 additions & 4 deletions src/normalizer/japanese.rs
@@ -116,26 +116,37 @@ mod test
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("だめ".to_string()),
lemma: Owned("た\u{3099}め".to_string()),
char_end: 2,
byte_end: 6,
char_map: Some(vec![(3, 6), (3, 3)]),
script: Script::Cj,
language: Some(Language::Jpn),
..Default::default()
},
Token {
lemma: Owned("だめ".to_string()),
lemma: Owned("た\u{3099}め".to_string()),
char_end: 2,
byte_end: 6,
- char_map: Some(vec![(3, 3), (3, 3)]),
+ char_map: Some(vec![(3, 6), (3, 3)]),
script: Script::Cj,
language: Some(Language::Jpn),
..Default::default()
},
Token {
lemma: Owned("だめ駄目だめHi".to_string()),
lemma: Owned("た\u{3099}め駄目た\u{3099}めHi".to_string()),
char_end: 8,
byte_end: 20,
+ char_map: Some(vec![
+ (3, 6),
+ (3, 3),
+ (3, 3),
+ (3, 3),
+ (3, 6),
+ (3, 3),
+ (1, 1),
+ (1, 1),
+ ]),
script: Script::Cj,
language: Some(Language::Jpn),
..Default::default()
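The expected lemmas change because the decomposition step now runs before the Japanese normalizer: だ (U+3060) becomes た (U+305F) followed by the combining voiced sound mark U+3099, which is also why the (3, 3) char_map entries become (3, 6). A quick check (illustrative, assuming the unicode-normalization crate):

use unicode_normalization::UnicodeNormalization;

fn main() {
    // 'だ' (U+3060) decomposes under NFKD into 'た' (U+305F) + U+3099.
    let decomposed: String = 'だ'.nfkd().collect();
    assert_eq!(decomposed, "た\u{3099}");
    // 'た' and U+3099 are each 3 bytes in UTF-8, hence the lemma's byte
    // length grows from 3 to 6 per decomposed character.
    assert_eq!(decomposed.len(), 6);
}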
119 changes: 0 additions & 119 deletions src/normalizer/latin.rs

This file was deleted.

6 changes: 3 additions & 3 deletions src/normalizer/mod.rs
@@ -4,32 +4,32 @@ use once_cell::sync::Lazy;

#[cfg(feature = "chinese")]
pub use self::chinese::ChineseNormalizer;
+ pub use self::compatibility_decomposition::CompatibilityDecompositionNormalizer;
pub use self::control_char::ControlCharNormalizer;
#[cfg(feature = "japanese-transliteration")]
pub use self::japanese::JapaneseNormalizer;
- pub use self::latin::LatinNormalizer;
pub use self::lowercase::LowercaseNormalizer;
use crate::normalizer::nonspacing_mark::NonspacingMarkNormalizer;
use crate::Token;

#[cfg(feature = "chinese")]
mod chinese;
+ mod compatibility_decomposition;
mod control_char;
#[cfg(feature = "japanese-transliteration")]
mod japanese;
- mod latin;
mod lowercase;
mod nonspacing_mark;

/// List of [`Normalizer`]s used by [`Normalize::normalize`].
pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
vec![
+ Box::new(CompatibilityDecompositionNormalizer),
Box::new(LowercaseNormalizer),
#[cfg(feature = "chinese")]
Box::new(ChineseNormalizer),
#[cfg(feature = "japanese-transliteration")]
Box::new(JapaneseNormalizer),
- Box::new(LatinNormalizer),
Box::new(ControlCharNormalizer),
Box::new(NonspacingMarkNormalizer),
]
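Running CompatibilityDecompositionNormalizer first means every later normalizer sees fully decomposed text; together with NonspacingMarkNormalizer, this roughly reproduces what the removed LatinNormalizer did for accented Latin characters. A rough illustration (not the actual pipeline code; the hard-coded mark filter is a simplification):

use unicode_normalization::UnicodeNormalization;

fn main() {
    // 'é' (U+00E9) decomposes into 'e' plus U+0301 (combining acute).
    let decomposed: String = "é".nfkd().collect();
    assert_eq!(decomposed, "e\u{0301}");

    // Stripping the nonspacing mark afterwards leaves plain ASCII.
    let stripped: String = decomposed.chars().filter(|c| *c != '\u{0301}').collect();
    assert_eq!(stripped, "e");
}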
4 changes: 2 additions & 2 deletions src/normalizer/nonspacing_mark.rs
@@ -26,7 +26,7 @@ impl CharNormalizer for NonspacingMarkNormalizer {
}

fn should_normalize(&self, token: &Token) -> bool {
- matches!(token.script, Script::Hebrew | Script::Thai | Script::Arabic)
+ matches!(token.script, Script::Hebrew | Script::Thai | Script::Arabic | Script::Latin)
&& token.lemma().chars().any(is_nonspacing_mark)
}
}
@@ -112,7 +112,7 @@ mod test {
..Default::default()
},
Token {
lemma: Owned("أب".to_string()),
lemma: Owned("اب".to_string()),
char_end: "أَب".chars().count(),
byte_end: "أَب".len(),
char_map: Some(vec![(2, 2), (2, 0), (2, 2)]),
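The Arabic expectation changes for the same reason: أ (U+0623) decomposes under NFKD into the bare alef ا (U+0627) plus the nonspacing hamza above (U+0654), which this normalizer then strips, leaving the bare اب. A minimal check (assuming the unicode-normalization crate):

use unicode_normalization::UnicodeNormalization;

fn main() {
    // 'أ' (U+0623) decomposes into 'ا' (U+0627) + hamza above (U+0654).
    let decomposed: String = 'أ'.nfkd().collect();
    assert_eq!(decomposed, "\u{627}\u{654}");
}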
4 changes: 2 additions & 2 deletions src/segmenter/chinese.rs
@@ -67,7 +67,7 @@ mod test {
const TOKENIZED: &[&str] = &[
"rénrén",
"shēngérzìyóu",
"",
",",
"zài",
"zūn",
"yán",
Expand All @@ -84,7 +84,7 @@ mod test {
"lǐxìng",
"hé",
"liángxīn",
"",
",",
"bìng",
"yīng",
"yǐ",