Locale canonicalize -> normalize (#5766)

Fixes #2748
unicode-org · Nov 3, 2024 · 3ba709c · 3ba709c
1 parent 983ce33
commit 3ba709c
Show file tree

Hide file tree

Showing 17 changed files with 58 additions and 72 deletions.
diff --git a/components/locale_core/benches/helpers/macros.rs b/components/locale_core/benches/helpers/macros.rs
@@ -102,7 +102,7 @@ macro_rules! canonicalize {
                     let _ = black_box(s).to_string();
                 }
                 for s in $data {
-                    let _ = $struct::canonicalize(black_box(s));
+                    let _ = $struct::normalize(black_box(s));
                 }
             })
         });

diff --git a/components/locale_core/benches/iai_langid.rs b/components/locale_core/benches/iai_langid.rs
@@ -108,7 +108,7 @@ fn bench_langid_canonicalize() {
 
     let _: Vec<Cow<str>> = LIDS_STR
         .iter()
-        .map(|l| LanguageIdentifier::canonicalize(l).expect("Canonicalization failed"))
+        .map(|l| LanguageIdentifier::normalize(l).expect("Normalization failed"))
         .collect();
 }
 

diff --git a/components/locale_core/examples/syntatically_canonicalize_locales.rs b/components/locale_core/examples/syntatically_canonicalize_locales.rs
@@ -23,7 +23,7 @@ fn main() {
         .split(',')
         .map(str::trim)
     {
-        let output = Locale::canonicalize(input).unwrap();
+        let output = Locale::normalize(input).unwrap();
         println!("{input} -> {output}");
     }
 }
diff --git a/components/locale_core/src/langid.rs b/components/locale_core/src/langid.rs
@@ -167,47 +167,41 @@ impl LanguageIdentifier {
             && self.variants.is_empty()
     }
 
-    /// Canonicalize the language identifier (operating on UTF-8 formatted byte slices)
+    /// Normalize the language identifier (operating on UTF-8 formatted byte slices)
     ///
-    /// This is a best-effort operation that performs all available levels of canonicalization.
-    ///
-    /// At the moment the operation will normalize casing and the separator, but in the future
-    /// it may also validate and update from deprecated subtags to canonical ones.
+    /// This operation will normalize casing and the separator.
     ///
     /// # Examples
     ///
     /// ```
     /// use icu::locale::LanguageIdentifier;
     ///
     /// assert_eq!(
-    ///     LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(),
+    ///     LanguageIdentifier::normalize_utf8(b"pL_latn_pl").as_deref(),
     ///     Ok("pl-Latn-PL")
     /// );
     /// ```
-    pub fn canonicalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
+    pub fn normalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
         let lang_id = Self::try_from_utf8(input)?;
         Ok(writeable::to_string_or_borrow(&lang_id, input))
     }
 
-    /// Canonicalize the language identifier (operating on strings)
-    ///
-    /// This is a best-effort operation that performs all available levels of canonicalization.
+    /// Normalize the language identifier (operating on strings)
     ///
-    /// At the moment the operation will normalize casing and the separator, but in the future
-    /// it may also validate and update from deprecated subtags to canonical ones.
+    /// This operation will normalize casing and the separator.
     ///
     /// # Examples
     ///
     /// ```
     /// use icu::locale::LanguageIdentifier;
     ///
     /// assert_eq!(
-    ///     LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(),
+    ///     LanguageIdentifier::normalize("pL_latn_pl").as_deref(),
     ///     Ok("pl-Latn-PL")
     /// );
     /// ```
-    pub fn canonicalize(input: &str) -> Result<Cow<str>, ParseError> {
-        Self::canonicalize_utf8(input.as_bytes())
+    pub fn normalize(input: &str) -> Result<Cow<str>, ParseError> {
+        Self::normalize_utf8(input.as_bytes())
     }
 
     /// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
@@ -371,14 +365,14 @@ impl LanguageIdentifier {
     /// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in
     /// lowercase ascii form.
     ///
-    /// The default canonicalization of language identifiers uses titlecase scripts and uppercase
+    /// The default normalization of language identifiers uses titlecase scripts and uppercase
     /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
     ///
     /// > _The canonical form for all subtags in the extension is lowercase, with the fields
     /// > ordered by the separators, alphabetically._
     ///
     /// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
-    /// canonicalization of the language identifier.
+    /// normalization of the language identifier.
     ///
     /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
     /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
@@ -406,14 +400,14 @@ impl LanguageIdentifier {
     /// Writes this `LanguageIdentifier` to a sink, replacing uppercase ascii chars with
     /// lowercase ascii chars.
     ///
-    /// The default canonicalization of language identifiers uses titlecase scripts and uppercase
+    /// The default normalization of language identifiers uses titlecase scripts and uppercase
     /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
     ///
     /// > _The canonical form for all subtags in the extension is lowercase, with the fields
     /// > ordered by the separators, alphabetically._
     ///
     /// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
-    /// canonicalization of the language identifier.
+    /// normalization of the language identifier.
     ///
     /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
     /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,

diff --git a/components/locale_core/src/locale.rs b/components/locale_core/src/locale.rs
@@ -151,45 +151,41 @@ impl Locale {
         }
     }
 
-    /// Canonicalize the locale (operating on UTF-8 formatted byte slices)
+    /// Normalize the locale (operating on UTF-8 formatted byte slices)
     ///
-    /// This is a best-effort operation that performs all available levels of canonicalization.
-    ///
-    /// At the moment the operation will normalize casing and the separator, but in the future
-    /// it may also validate and update from deprecated subtags to canonical ones.
+    /// This operation will normalize casing and the separator.
     ///
     /// # Examples
     ///
     /// ```
     /// use icu::locale::Locale;
     ///
     /// assert_eq!(
-    ///     Locale::canonicalize_utf8(b"pL_latn_pl-U-HC-H12").as_deref(),
+    ///     Locale::normalize_utf8(b"pL_latn_pl-U-HC-H12").as_deref(),
     ///     Ok("pl-Latn-PL-u-hc-h12")
     /// );
     /// ```
-    pub fn canonicalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
+    pub fn normalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
         let locale = Self::try_from_utf8(input)?;
         Ok(writeable::to_string_or_borrow(&locale, input))
     }
 
-    /// Canonicalize the locale (operating on strings)
+    /// Normalize the locale (operating on strings)
     ///
-    /// At the moment the operation will normalize casing and the separator, but in the future
-    /// it may also validate and update from deprecated subtags to canonical ones.
+    /// This operation will normalize casing and the separator.
     ///
     /// # Examples
     ///
     /// ```
     /// use icu::locale::Locale;
     ///
     /// assert_eq!(
-    ///     Locale::canonicalize("pL_latn_pl-U-HC-H12").as_deref(),
+    ///     Locale::normalize("pL_latn_pl-U-HC-H12").as_deref(),
     ///     Ok("pl-Latn-PL-u-hc-h12")
     /// );
     /// ```
-    pub fn canonicalize(input: &str) -> Result<Cow<str>, ParseError> {
-        Self::canonicalize_utf8(input.as_bytes())
+    pub fn normalize(input: &str) -> Result<Cow<str>, ParseError> {
+        Self::normalize_utf8(input.as_bytes())
     }
 
     /// Compare this [`Locale`] with BCP-47 bytes.

diff --git a/components/locale_core/src/macros.rs b/components/locale_core/src/macros.rs
@@ -4,7 +4,7 @@
 
 /// A macro allowing for compile-time construction of valid [`LanguageIdentifier`]s.
 ///
-/// The macro will perform syntax canonicalization of the tag.
+/// The macro will perform syntax normalization of the tag.
 ///
 /// # Examples
 ///
@@ -57,7 +57,7 @@ macro_rules! langid {
 
 /// A macro allowing for compile-time construction of valid [`Locale`]s.
 ///
-/// The macro will perform syntax canonicalization of the tag.
+/// The macro will perform syntax normalization of the tag.
 ///
 /// # Examples
 ///

diff --git a/components/locale_core/src/subtags/mod.rs b/components/locale_core/src/subtags/mod.rs
@@ -12,13 +12,13 @@
 //! * [`Variants`] is a list of optional [`Variant`] subtags containing information about the
 //!                variant adjustments used by the locale.
 //!
-//! Subtags can be used in isolation, and all basic operations such as parsing, syntax canonicalization
+//! Subtags can be used in isolation, and all basic operations such as parsing, syntax normalization
 //! and serialization are supported on each individual subtag, but most commonly
 //! they are used to construct a [`LanguageIdentifier`] instance.
 //!
 //! [`Variants`] is a special structure which contains a list of [`Variant`] subtags.
 //! It is wrapped around to allow for sorting and deduplication of variants, which
-//! is one of the required steps of language identifier and locale syntax canonicalization.
+//! is one of the required steps of language identifier and locale syntax normalization.
 //!
 //! # Examples
 //!
@@ -40,8 +40,8 @@
 //! assert_eq!(variant.as_str(), "macos");
 //! ```
 //!
-//! `Notice`: The subtags are canonicalized on parsing. That means
-//! that all operations work on a canonicalized version of the subtag
+//! `Notice`: The subtags are normalized on parsing. That means
+//! that all operations work on a normalized version of the subtag
 //! and serialization is very cheap.
 //!
 //! [`LanguageIdentifier`]: super::LanguageIdentifier

diff --git a/ffi/capi/bindings/c/Locale.h b/ffi/capi/bindings/c/Locale.h
diff --git a/ffi/capi/bindings/cpp/icu4x/Locale.d.hpp b/ffi/capi/bindings/cpp/icu4x/Locale.d.hpp
diff --git a/ffi/capi/bindings/cpp/icu4x/Locale.hpp b/ffi/capi/bindings/cpp/icu4x/Locale.hpp
diff --git a/ffi/capi/bindings/dart/Locale.g.dart b/ffi/capi/bindings/dart/Locale.g.dart
diff --git a/ffi/capi/bindings/demo_gen/Locale.d.ts b/ffi/capi/bindings/demo_gen/Locale.d.ts
diff --git a/ffi/capi/bindings/demo_gen/Locale.mjs b/ffi/capi/bindings/demo_gen/Locale.mjs
diff --git a/ffi/capi/bindings/demo_gen/index.mjs b/ffi/capi/bindings/demo_gen/index.mjs
diff --git a/ffi/capi/bindings/js/Locale.d.ts b/ffi/capi/bindings/js/Locale.d.ts
diff --git a/ffi/capi/bindings/js/Locale.mjs b/ffi/capi/bindings/js/Locale.mjs
diff --git a/ffi/capi/src/locale_core.rs b/ffi/capi/src/locale_core.rs
@@ -134,16 +134,14 @@ pub mod ffi {
             Ok(())
         }
 
-        /// Best effort locale canonicalizer that doesn't need any data
-        ///
-        /// Use LocaleCanonicalizer for better control and functionality
-        #[diplomat::rust_link(icu::locale::Locale::canonicalize, FnInStruct)]
-        #[diplomat::rust_link(icu::locale::Locale::canonicalize_utf8, FnInStruct, hidden)]
-        pub fn canonicalize(
+        /// Normalizes a locale string.
+        #[diplomat::rust_link(icu::locale::Locale::normalize, FnInStruct)]
+        #[diplomat::rust_link(icu::locale::Locale::normalize_utf8, FnInStruct, hidden)]
+        pub fn normalize(
             s: &DiplomatStr,
             write: &mut DiplomatWrite,
         ) -> Result<(), LocaleParseError> {
-            let _infallible = icu_locale_core::Locale::canonicalize_utf8(s)?.write_to(write);
+            let _infallible = icu_locale_core::Locale::normalize_utf8(s)?.write_to(write);
             Ok(())
         }
         /// Returns a string representation of [`Locale`].