Skip to content

Commit

Permalink
Locale canonicalize -> normalize (#5766)
Browse files Browse the repository at this point in the history
Fixes #2748
  • Loading branch information
robertbastian authored Nov 3, 2024
1 parent 983ce33 commit 3ba709c
Show file tree
Hide file tree
Showing 17 changed files with 58 additions and 72 deletions.
2 changes: 1 addition & 1 deletion components/locale_core/benches/helpers/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ macro_rules! canonicalize {
let _ = black_box(s).to_string();
}
for s in $data {
let _ = $struct::canonicalize(black_box(s));
let _ = $struct::normalize(black_box(s));
}
})
});
Expand Down
2 changes: 1 addition & 1 deletion components/locale_core/benches/iai_langid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ fn bench_langid_canonicalize() {

let _: Vec<Cow<str>> = LIDS_STR
.iter()
.map(|l| LanguageIdentifier::canonicalize(l).expect("Canonicalization failed"))
.map(|l| LanguageIdentifier::normalize(l).expect("Normalization failed"))
.collect();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ fn main() {
.split(',')
.map(str::trim)
{
let output = Locale::canonicalize(input).unwrap();
let output = Locale::normalize(input).unwrap();
println!("{input} -> {output}");
}
}
32 changes: 13 additions & 19 deletions components/locale_core/src/langid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,47 +167,41 @@ impl LanguageIdentifier {
&& self.variants.is_empty()
}

/// Canonicalize the language identifier (operating on UTF-8 formatted byte slices)
/// Normalize the language identifier (operating on UTF-8 formatted byte slices)
///
/// This is a best-effort operation that performs all available levels of canonicalization.
///
/// At the moment the operation will normalize casing and the separator, but in the future
/// it may also validate and update from deprecated subtags to canonical ones.
/// This operation will normalize casing and the separator.
///
/// # Examples
///
/// ```
/// use icu::locale::LanguageIdentifier;
///
/// assert_eq!(
/// LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(),
/// LanguageIdentifier::normalize_utf8(b"pL_latn_pl").as_deref(),
/// Ok("pl-Latn-PL")
/// );
/// ```
pub fn canonicalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
pub fn normalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
let lang_id = Self::try_from_utf8(input)?;
Ok(writeable::to_string_or_borrow(&lang_id, input))
}

/// Canonicalize the language identifier (operating on strings)
///
/// This is a best-effort operation that performs all available levels of canonicalization.
/// Normalize the language identifier (operating on strings)
///
/// At the moment the operation will normalize casing and the separator, but in the future
/// it may also validate and update from deprecated subtags to canonical ones.
/// This operation will normalize casing and the separator.
///
/// # Examples
///
/// ```
/// use icu::locale::LanguageIdentifier;
///
/// assert_eq!(
/// LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(),
/// LanguageIdentifier::normalize("pL_latn_pl").as_deref(),
/// Ok("pl-Latn-PL")
/// );
/// ```
pub fn canonicalize(input: &str) -> Result<Cow<str>, ParseError> {
Self::canonicalize_utf8(input.as_bytes())
pub fn normalize(input: &str) -> Result<Cow<str>, ParseError> {
Self::normalize_utf8(input.as_bytes())
}

/// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
Expand Down Expand Up @@ -371,14 +365,14 @@ impl LanguageIdentifier {
/// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in
/// lowercase ascii form.
///
/// The default canonicalization of language identifiers uses titlecase scripts and uppercase
/// The default normalization of language identifiers uses titlecase scripts and uppercase
/// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
///
/// > _The canonical form for all subtags in the extension is lowercase, with the fields
/// > ordered by the separators, alphabetically._
///
/// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
/// canonicalization of the language identifier.
/// normalization of the language identifier.
///
/// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
/// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
Expand Down Expand Up @@ -406,14 +400,14 @@ impl LanguageIdentifier {
/// Writes this `LanguageIdentifier` to a sink, replacing uppercase ascii chars with
/// lowercase ascii chars.
///
/// The default canonicalization of language identifiers uses titlecase scripts and uppercase
/// The default normalization of language identifiers uses titlecase scripts and uppercase
/// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
///
/// > _The canonical form for all subtags in the extension is lowercase, with the fields
/// > ordered by the separators, alphabetically._
///
/// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
/// canonicalization of the language identifier.
/// normalization of the language identifier.
///
/// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
/// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
Expand Down
22 changes: 9 additions & 13 deletions components/locale_core/src/locale.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,45 +151,41 @@ impl Locale {
}
}

/// Canonicalize the locale (operating on UTF-8 formatted byte slices)
/// Normalize the locale (operating on UTF-8 formatted byte slices)
///
/// This is a best-effort operation that performs all available levels of canonicalization.
///
/// At the moment the operation will normalize casing and the separator, but in the future
/// it may also validate and update from deprecated subtags to canonical ones.
/// This operation will normalize casing and the separator.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// assert_eq!(
/// Locale::canonicalize_utf8(b"pL_latn_pl-U-HC-H12").as_deref(),
/// Locale::normalize_utf8(b"pL_latn_pl-U-HC-H12").as_deref(),
/// Ok("pl-Latn-PL-u-hc-h12")
/// );
/// ```
pub fn canonicalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
pub fn normalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
let locale = Self::try_from_utf8(input)?;
Ok(writeable::to_string_or_borrow(&locale, input))
}

/// Canonicalize the locale (operating on strings)
/// Normalize the locale (operating on strings)
///
/// At the moment the operation will normalize casing and the separator, but in the future
/// it may also validate and update from deprecated subtags to canonical ones.
/// This operation will normalize casing and the separator.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// assert_eq!(
/// Locale::canonicalize("pL_latn_pl-U-HC-H12").as_deref(),
/// Locale::normalize("pL_latn_pl-U-HC-H12").as_deref(),
/// Ok("pl-Latn-PL-u-hc-h12")
/// );
/// ```
pub fn canonicalize(input: &str) -> Result<Cow<str>, ParseError> {
Self::canonicalize_utf8(input.as_bytes())
pub fn normalize(input: &str) -> Result<Cow<str>, ParseError> {
Self::normalize_utf8(input.as_bytes())
}

/// Compare this [`Locale`] with BCP-47 bytes.
Expand Down
4 changes: 2 additions & 2 deletions components/locale_core/src/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

/// A macro allowing for compile-time construction of valid [`LanguageIdentifier`]s.
///
/// The macro will perform syntax canonicalization of the tag.
/// The macro will perform syntax normalization of the tag.
///
/// # Examples
///
Expand Down Expand Up @@ -57,7 +57,7 @@ macro_rules! langid {

/// A macro allowing for compile-time construction of valid [`Locale`]s.
///
/// The macro will perform syntax canonicalization of the tag.
/// The macro will perform syntax normalization of the tag.
///
/// # Examples
///
Expand Down
8 changes: 4 additions & 4 deletions components/locale_core/src/subtags/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@
//! * [`Variants`] is a list of optional [`Variant`] subtags containing information about the
//! variant adjustments used by the locale.
//!
//! Subtags can be used in isolation, and all basic operations such as parsing, syntax canonicalization
//! Subtags can be used in isolation, and all basic operations such as parsing, syntax normalization
//! and serialization are supported on each individual subtag, but most commonly
//! they are used to construct a [`LanguageIdentifier`] instance.
//!
//! [`Variants`] is a special structure which contains a list of [`Variant`] subtags.
//! The list is wrapped in its own type to allow for sorting and deduplication of variants, which
//! is one of the required steps of language identifier and locale syntax canonicalization.
//! is one of the required steps of language identifier and locale syntax normalization.
//!
//! # Examples
//!
Expand All @@ -40,8 +40,8 @@
//! assert_eq!(variant.as_str(), "macos");
//! ```
//!
//! `Notice`: The subtags are canonicalized on parsing. That means
//! that all operations work on a canonicalized version of the subtag
//! `Notice`: The subtags are normalized on parsing. That means
//! that all operations work on a normalized version of the subtag
//! and serialization is very cheap.
//!
//! [`LanguageIdentifier`]: super::LanguageIdentifier
Expand Down
4 changes: 2 additions & 2 deletions ffi/capi/bindings/c/Locale.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion ffi/capi/bindings/cpp/icu4x/Locale.d.hpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions ffi/capi/bindings/cpp/icu4x/Locale.hpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 6 additions & 8 deletions ffi/capi/bindings/dart/Locale.g.dart

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion ffi/capi/bindings/demo_gen/Locale.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions ffi/capi/bindings/demo_gen/Locale.mjs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions ffi/capi/bindings/demo_gen/index.mjs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion ffi/capi/bindings/js/Locale.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions ffi/capi/bindings/js/Locale.mjs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 5 additions & 7 deletions ffi/capi/src/locale_core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,16 +134,14 @@ pub mod ffi {
Ok(())
}

/// Best effort locale canonicalizer that doesn't need any data
///
/// Use LocaleCanonicalizer for better control and functionality
#[diplomat::rust_link(icu::locale::Locale::canonicalize, FnInStruct)]
#[diplomat::rust_link(icu::locale::Locale::canonicalize_utf8, FnInStruct, hidden)]
pub fn canonicalize(
/// Normalizes a locale string.
#[diplomat::rust_link(icu::locale::Locale::normalize, FnInStruct)]
#[diplomat::rust_link(icu::locale::Locale::normalize_utf8, FnInStruct, hidden)]
pub fn normalize(
s: &DiplomatStr,
write: &mut DiplomatWrite,
) -> Result<(), LocaleParseError> {
let _infallible = icu_locale_core::Locale::canonicalize_utf8(s)?.write_to(write);
let _infallible = icu_locale_core::Locale::normalize_utf8(s)?.write_to(write);
Ok(())
}
/// Returns a string representation of [`Locale`].
Expand Down

0 comments on commit 3ba709c

Please sign in to comment.