-
Notifications
You must be signed in to change notification settings - Fork 185
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
First draft for implementing Local Displayname Algorithm #3587
Changes from 10 commits
5c33872
17f19d9
0141ac6
4a85149
19ea386
d0d867e
4a00a05
6989da0
6855ceb
0f7c4ac
94b311a
2bc861b
fed1622
a1d9fdc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,9 +7,14 @@ | |
use crate::options::*; | ||
use crate::provider::*; | ||
use alloc::borrow::Cow; | ||
use icu_locid::{subtags::Language, subtags::Region, subtags::Script, subtags::Variant, Locale}; | ||
use alloc::string::String; | ||
use alloc::vec; | ||
use alloc::vec::Vec; | ||
use icu_locid::{ | ||
subtags::Language, subtags::Region, subtags::Script, subtags::Variant, LanguageIdentifier, | ||
Locale, | ||
}; | ||
use icu_provider::prelude::*; | ||
|
||
/// Lookup of the locale-specific display names by region code. | ||
/// | ||
/// # Example | ||
|
@@ -110,7 +115,6 @@ pub struct ScriptDisplayNames { | |
script_data: DataPayload<ScriptDisplayNamesV1Marker>, | ||
} | ||
|
||
#[allow(dead_code)] // not public at the moment | ||
impl ScriptDisplayNames { | ||
icu_provider::gen_any_buffer_data_constructors!( | ||
locale: include, | ||
|
@@ -188,7 +192,6 @@ pub struct VariantDisplayNames { | |
variant_data: DataPayload<VariantDisplayNamesV1Marker>, | ||
} | ||
|
||
#[allow(dead_code)] // not public at the moment | ||
impl VariantDisplayNames { | ||
icu_provider::gen_any_buffer_data_constructors!( | ||
locale: include, | ||
|
@@ -333,9 +336,9 @@ impl LanguageDisplayNames { | |
/// ) | ||
/// .expect("Data should load successfully"); | ||
/// | ||
/// assert_eq!(display_name.of(&locale!("de-CH")), "Swiss High German"); | ||
/// assert_eq!(display_name.of(&locale!("de")), "German"); | ||
/// assert_eq!(display_name.of(&locale!("de-MX")), "German (Mexico)"); | ||
/// assert_eq!(display_name.of(&locale!("en-GB")), "British English"); | ||
/// assert_eq!(display_name.of(&locale!("en")), "English"); | ||
/// assert_eq!(display_name.of(&locale!("en-MX")), "English (Mexico)"); | ||
/// assert_eq!(display_name.of(&locale!("xx-YY")), "xx (YY)"); | ||
/// assert_eq!(display_name.of(&locale!("xx")), "xx"); | ||
/// ``` | ||
|
@@ -345,13 +348,11 @@ pub struct LocaleDisplayNamesFormatter { | |
locale_data: DataPayload<LocaleDisplayNamesV1Marker>, | ||
|
||
language_data: DataPayload<LanguageDisplayNamesV1Marker>, | ||
#[allow(dead_code)] // TODO use this | ||
script_data: DataPayload<ScriptDisplayNamesV1Marker>, | ||
region_data: DataPayload<RegionDisplayNamesV1Marker>, | ||
#[allow(dead_code)] // TODO add support for variants | ||
variant_data: DataPayload<VariantDisplayNamesV1Marker>, | ||
// key_data: DataPayload<KeyDisplayNamesV1Marker>, | ||
// measuerment_data: DataPayload<MeasurementSystemsDisplayNamesV1Marker>, | ||
// measurement_data: DataPayload<MeasurementSystemsDisplayNamesV1Marker>, | ||
// subdivisions_data: DataPayload<SubdivisionsDisplayNamesV1Marker>, | ||
// transforms_data: DataPayload<TransformsDisplayNamesV1Marker>, | ||
} | ||
|
@@ -404,90 +405,216 @@ impl LocaleDisplayNamesFormatter { | |
} | ||
|
||
/// Returns the display name of a locale. | ||
/// This implementation is based on the algorithm described in | ||
/// https://www.unicode.org/reports/tr35/tr35-general.html#locale_display_name_algorithm | ||
/// | ||
// TODO: Make this return a writeable instead of using alloc | ||
pub fn of<'a, 'b: 'a, 'c: 'a>(&'b self, locale: &'c Locale) -> Cow<'a, str> { | ||
// https://www.unicode.org/reports/tr35/tr35-general.html#Display_Name_Elements | ||
|
||
// TODO: This binary search needs to return the longest matching found prefix | ||
// instead of just perfect matches | ||
if let Some(displayname) = match self.options.style { | ||
Some(Style::Short) => self | ||
.locale_data | ||
.get() | ||
.short_names | ||
.get_by(|bytes| locale.strict_cmp(bytes).reverse()), | ||
Some(Style::Long) => self | ||
.locale_data | ||
.get() | ||
.long_names | ||
.get_by(|bytes| locale.strict_cmp(bytes).reverse()), | ||
Some(Style::Menu) => self | ||
.locale_data | ||
.get() | ||
.menu_names | ||
.get_by(|bytes| locale.strict_cmp(bytes).reverse()), | ||
_ => None, | ||
let longest_matching_identifier = self.find_longest_matching_subtag(&locale); | ||
|
||
// Step - 1: Construct a locale display name string (LDN). | ||
// Find the displayname for the longest_matching_subtag which was derived above. | ||
let ldn = self.get_locale_display_name(&locale, &longest_matching_identifier); | ||
|
||
// Step - 2: Construct a vector of longest qualifying substrings (LQS). | ||
// Find the displayname for the remaining locale if exists. | ||
let lqs = self.get_longest_qualifying_substrings(&locale, &longest_matching_identifier); | ||
|
||
// Step - 3: Return the displayname based on the size of LQS. | ||
let mut result = Cow::Borrowed(ldn); | ||
if lqs.len() > 0 { | ||
let mut output = String::with_capacity( | ||
result.len() + " (".len() + lqs.iter().map(|s| ", ".len() + s.len()).sum::<usize>() | ||
- ", ".len() | ||
+ ")".len(), | ||
); | ||
output.push_str(&result); | ||
output.push_str(" ("); | ||
output.push_str(lqs[0]); | ||
for lqs in &lqs[1..] { | ||
output.push_str(", "); | ||
output.push_str(lqs); | ||
} | ||
output.push_str(")"); | ||
result = Cow::Owned(output); | ||
} | ||
.or_else(|| { | ||
self.locale_data | ||
result | ||
} | ||
|
||
/// For a given locale and the data, find the longest prefix of the string that exists as a key in the CLDR locale data. | ||
pub fn find_longest_matching_subtag(&self, locale: &Locale) -> LanguageIdentifier { | ||
Comment on lines
+445
to
+446
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Observation: this only ever returns one of the following:
But does the spec allow us to return language-script-region, etc? Consider this in #3913 |
||
let LocaleDisplayNamesFormatter { locale_data, .. } = self; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I personally would just use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
|
||
// NOTE: The subtag ordering of the canonical locale is `language_script_region + variants + extensions`. | ||
// The logic to find the longest matching subtag is based on this ordering. | ||
if let Some(script) = locale.id.script { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't you technically have to try all these combinations according to the algorithm?
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is no explicit mention in the algorithm for what is covered under LocaleDisplayName (LDN). But because we are looking for the longest matching string in the locale and language data, I couldn't find any example in the data which has any other combinations then what is already covered in this implementation. However, we do need to technically support the other combinations as the data may change in future. I think a better way to implement the support for this is to use the subtag iterator.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If what we have is more efficient than the general solution, I'm okay landing this and fixing the algorithm later if these types of cases come up. The way you've written this, I think old-code-new-data should just ignore the new entries, which is fine. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Chopping off subtags will only cover LSRV, LSR, LS, L. |
||
let lang_script_identifier: LanguageIdentifier = | ||
(locale.id.language, Some(script), None).into(); | ||
if locale_data | ||
.get() | ||
.names | ||
.get_by(|bytes| locale.strict_cmp(bytes).reverse()) | ||
}) { | ||
return Cow::Borrowed(displayname); | ||
.get_by(|uvstr| lang_script_identifier.strict_cmp(uvstr).reverse()) | ||
.is_some() | ||
{ | ||
return lang_script_identifier; | ||
} | ||
} | ||
if let Some(region) = locale.id.region { | ||
if locale.id.script.is_none() { | ||
let lang_region_identifier: LanguageIdentifier = | ||
(locale.id.language, None, Some(region)).into(); | ||
if locale_data | ||
.get() | ||
.names | ||
.get_by(|uvstr| lang_region_identifier.strict_cmp(uvstr).reverse()) | ||
.is_some() | ||
{ | ||
return lang_region_identifier; | ||
} | ||
} | ||
} | ||
return (locale.id.language, None, None).into(); | ||
} | ||
|
||
// TODO: This is a dummy implementation which does not adhere to UTS35. It only uses | ||
// the language and region code, and uses a hardcoded pattern to combine them. | ||
fn get_locale_display_name<'a>( | ||
&'a self, | ||
locale: &'a Locale, | ||
longest_matching_identifier: &LanguageIdentifier, | ||
) -> &'a str { | ||
let LocaleDisplayNamesFormatter { | ||
options, | ||
locale_data, | ||
language_data, | ||
.. | ||
} = self; | ||
|
||
let langdisplay = match self.options.style { | ||
Some(Style::Short) => self | ||
.language_data | ||
// Check if the key exists in the locale_data first. | ||
// Example: "en_GB", "nl_BE". | ||
let mut ldn = match options.style { | ||
Some(Style::Short) => locale_data | ||
.get() | ||
.short_names | ||
.get(&locale.id.language.into_tinystr().to_unvalidated()), | ||
Some(Style::Long) => self | ||
.language_data | ||
.get_by(|uvstr| longest_matching_identifier.strict_cmp(uvstr).reverse()), | ||
Some(Style::Long) => locale_data | ||
.get() | ||
.long_names | ||
.get(&locale.id.language.into_tinystr().to_unvalidated()), | ||
Some(Style::Menu) => self | ||
.language_data | ||
.get_by(|uvstr| longest_matching_identifier.strict_cmp(uvstr).reverse()), | ||
Some(Style::Menu) => locale_data | ||
.get() | ||
.menu_names | ||
.get(&locale.id.language.into_tinystr().to_unvalidated()), | ||
.get_by(|uvstr| longest_matching_identifier.strict_cmp(uvstr).reverse()), | ||
_ => None, | ||
} | ||
.or_else(|| { | ||
self.language_data | ||
locale_data | ||
.get() | ||
.names | ||
.get(&locale.id.language.into_tinystr().to_unvalidated()) | ||
.get_by(|uvstr| longest_matching_identifier.strict_cmp(uvstr).reverse()) | ||
}); | ||
|
||
if let Some(region) = locale.id.region { | ||
let regiondisplay = match self.options.style { | ||
Some(Style::Short) => self | ||
.region_data | ||
.get() | ||
.short_names | ||
.get(®ion.into_tinystr().to_unvalidated()), | ||
// At this point the key should exist in the language_data. | ||
// Example: "en", "nl", "zh". | ||
if ldn.is_none() { | ||
ldn = match options.style { | ||
Some(Style::Short) => language_data.get().short_names.get( | ||
&longest_matching_identifier | ||
.language | ||
.into_tinystr() | ||
.to_unvalidated(), | ||
), | ||
Some(Style::Long) => language_data.get().long_names.get( | ||
&longest_matching_identifier | ||
.language | ||
.into_tinystr() | ||
.to_unvalidated(), | ||
), | ||
Some(Style::Menu) => language_data.get().menu_names.get( | ||
&longest_matching_identifier | ||
.language | ||
.into_tinystr() | ||
.to_unvalidated(), | ||
), | ||
_ => None, | ||
} | ||
.or_else(|| { | ||
self.region_data | ||
language_data.get().names.get( | ||
&longest_matching_identifier | ||
.language | ||
.into_tinystr() | ||
.to_unvalidated(), | ||
) | ||
}); | ||
} | ||
// Throw an error if the LDN is none as it is not possible to have a locale string without the language. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: update comment There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
return ldn.unwrap_or(locale.id.language.as_str()); | ||
} | ||
|
||
fn get_longest_qualifying_substrings<'a>( | ||
&'a self, | ||
locale: &'a Locale, | ||
longest_matching_identifier: &'a LanguageIdentifier, | ||
) -> Vec<&'a str> { | ||
let LocaleDisplayNamesFormatter { | ||
options, | ||
region_data, | ||
script_data, | ||
variant_data, | ||
.. | ||
} = self; | ||
|
||
let mut lqs: Vec<&'a str> = vec![]; | ||
|
||
if let Some(script) = &locale.id.script { | ||
// Ignore if the script was used to derive LDN. | ||
if longest_matching_identifier.script.is_none() { | ||
let scriptdisplay = match options.style { | ||
Some(Style::Short) => script_data | ||
.get() | ||
.short_names | ||
.get(&script.into_tinystr().to_unvalidated()), | ||
_ => None, | ||
} | ||
.or_else(|| { | ||
script_data | ||
.get() | ||
.names | ||
.get(&script.into_tinystr().to_unvalidated()) | ||
}); | ||
lqs.push(scriptdisplay.unwrap_or(script.as_str())); | ||
} | ||
} | ||
|
||
if let Some(region) = &locale.id.region { | ||
// Ignore if the region was used to derive LDN. | ||
if longest_matching_identifier.region.is_none() { | ||
let regiondisplay = match options.style { | ||
Some(Style::Short) => region_data | ||
.get() | ||
.short_names | ||
.get(®ion.into_tinystr().to_unvalidated()), | ||
_ => None, | ||
} | ||
.or_else(|| { | ||
region_data | ||
.get() | ||
.names | ||
.get(®ion.into_tinystr().to_unvalidated()) | ||
}); | ||
|
||
lqs.push(regiondisplay.unwrap_or(region.as_str())); | ||
} | ||
} | ||
|
||
for variant_key in locale.id.variants.iter() { | ||
lqs.push( | ||
variant_data | ||
.get() | ||
.names | ||
.get(®ion.into_tinystr().to_unvalidated()) | ||
}); | ||
// TODO: Use data patterns | ||
Cow::Owned(alloc::format!( | ||
"{} ({})", | ||
langdisplay.unwrap_or(locale.id.language.as_str()), | ||
regiondisplay.unwrap_or(region.as_str()) | ||
)) | ||
} else { | ||
Cow::Borrowed(langdisplay.unwrap_or(locale.id.language.as_str())) | ||
.get(&variant_key.into_tinystr().to_unvalidated()) | ||
.unwrap_or(variant_key.as_str()), | ||
); | ||
} | ||
|
||
return lqs; | ||
} | ||
} |
robertbastian marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
use icu_displaynames::{DisplayNamesOptions, LocaleDisplayNamesFormatter}; | ||
use icu_locid::locale; | ||
use icu_locid::Locale; | ||
use std::str::FromStr; | ||
|
||
#[test] | ||
fn test_concatenate() { | ||
#[derive(Debug)] | ||
struct TestCase<'a> { | ||
pub input_1: &'a Locale, | ||
pub expected: &'a str, | ||
} | ||
let cases = [ | ||
sffc marked this conversation as resolved.
Show resolved
Hide resolved
|
||
TestCase { | ||
input_1: &locale!("de-CH"), | ||
expected: "Swiss High German", | ||
}, | ||
TestCase { | ||
input_1: &locale!("zh_Hans"), | ||
expected: "Simplified Chinese", | ||
}, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thought: one other case could be "es-Latn-419" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
TestCase { | ||
input_1: &locale!("es-419"), | ||
expected: "Latin American Spanish", | ||
}, | ||
TestCase { | ||
input_1: &locale!("es-Cyrl-MX"), | ||
expected: "Spanish (Cyrillic, Mexico)", | ||
}, | ||
TestCase { | ||
input_1: &Locale::from_str("en-Latn-GB-fonipa-scouse").unwrap(), | ||
expected: "English (Latin, United Kingdom, IPA Phonetics, Scouse)", | ||
}, | ||
TestCase { | ||
input_1: &Locale::from_str("de-Latn-CH").unwrap(), | ||
expected: "German (Latin, Switzerland)", | ||
}, | ||
TestCase { | ||
input_1: &Locale::from_str("zh-Hans-CN").unwrap(), | ||
expected: "Simplified Chinese (China)", | ||
}, | ||
TestCase { | ||
input_1: &Locale::from_str("es-419-fonipa").unwrap(), | ||
expected: "Latin American Spanish (IPA Phonetics)", | ||
}, | ||
TestCase { | ||
input_1: &locale!("xx-YY"), | ||
expected: "xx (YY)", | ||
}, | ||
]; | ||
for cas in &cases { | ||
// TODO: Add tests for different data locales. | ||
let locale = locale!("en-001"); | ||
let options: DisplayNamesOptions = Default::default(); | ||
|
||
let display_name = LocaleDisplayNamesFormatter::try_new(&locale.into(), options) | ||
.expect("Data should load successfully"); | ||
|
||
assert_eq!(display_name.of(cas.input_1), cas.expected); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These strings should come from locale data. Could be a follow-up but we definitely need this before this goes stable.
https://github.com/unicode-org/cldr-json/blob/80a94b0f6c3a34d6e2dc0dca8639a54babc87f94/cldr-json/cldr-localenames-full/main/zh/localeDisplayNames.json#L12C1-L14C44
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Acknowledged
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#3913