latin.rs
use deunicode::deunicode_char;

use crate::detection::Script;
use crate::normalizer::{CharNormalizer, CharOrStr};
use crate::Token;

/// Latin specialized [`Normalizer`] converting Unicode chars into ASCII.
///
/// This Normalizer uses [`deunicode`] internally to normalize the provided token.
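///
/// For example, `'é'` is transliterated to `"e"` and `'…'` to `"..."`,
/// so a lemma like `"Léopard…"` becomes `"Leopard..."` (see the tests below).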
pub struct LatinNormalizer;

impl CharNormalizer for LatinNormalizer {
    fn normalize_char(&self, c: char) -> Option<CharOrStr> {
        // if deunicode doesn't manage to decode the character, we remove it.
        let normalized = deunicode_char(c)?;
        let mut chars = normalized.chars();
        // if the original character is converted into exactly one character,
        // then we return the character directly instead of creating a string for it.
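        // (e.g. 'é' maps to the single char 'e', while '…' maps to the string "...".)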
        match (chars.next(), chars.next()) {
            (Some(c), None) => Some(c.into()),
            _otherwise => Some(normalized.to_string().into()),
        }
    }

    fn should_normalize(&self, token: &Token) -> bool {
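        // only non-ASCII Latin-script tokens need normalization;
        // e.g. "Léopard…" is normalized while "lion" is left untouched (see the tests below).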
        token.script == Script::Latin && !token.lemma().is_ascii()
    }
}

#[cfg(test)]
mod test {
    use std::borrow::Cow::Owned;

    use crate::normalizer::test::test_normalizer;
    use crate::normalizer::{Normalizer, NormalizerOption};

    // base tokens to normalize.
    fn tokens() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Owned("Léopard…".to_string()),
                char_end: 8,
                byte_end: 11,
                script: Script::Latin,
                ..Default::default()
            },
            Token {
                lemma: Owned("lion".to_string()),
                char_end: 4,
                byte_end: 4,
                script: Script::Latin,
                ..Default::default()
            },
        ]
    }

    // expected result of the current Normalizer.
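    // (each `char_map` entry pairs the byte length of an original character with the
    // byte length of its normalized form, e.g. the 2-byte 'é' normalizes to the 1-byte "e".)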
    fn normalizer_result() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Owned("Leopard...".to_string()),
                char_end: 8,
                byte_end: 11,
                script: Script::Latin,
                char_map: Some(vec![
                    (1, 1),
                    (2, 1),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (3, 3),
                ]),
                ..Default::default()
            },
            Token {
                lemma: Owned("lion".to_string()),
                char_end: 4,
                byte_end: 4,
                script: Script::Latin,
                ..Default::default()
            },
        ]
    }

    // expected result of the complete Normalizer pipeline.
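    // (the complete pipeline also lowercases the lemma, hence "leopard..." instead of "Leopard...".)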
    fn normalized_tokens() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Owned("leopard...".to_string()),
                char_end: 8,
                byte_end: 11,
                script: Script::Latin,
                char_map: Some(vec![
                    (1, 1),
                    (2, 1),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (3, 3),
                ]),
                ..Default::default()
            },
            Token {
                lemma: Owned("lion".to_string()),
                char_end: 4,
                byte_end: 4,
                script: Script::Latin,
                ..Default::default()
            },
        ]
    }

    test_normalizer!(LatinNormalizer, tokens(), normalizer_result(), normalized_tokens());
}