diff --git a/components/normalizer/src/lib.rs b/components/normalizer/src/lib.rs index c95a5a786bb..96bd65c93f7 100644 --- a/components/normalizer/src/lib.rs +++ b/components/normalizer/src/lib.rs @@ -179,7 +179,19 @@ const SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16: u16 = 2; /// Marker that a complex decomposition isn't round-trippable /// under re-composition. -const NON_ROUND_TRIP_MARKER: u16 = 1; +/// +/// TODO: When taking a data format break, swap this and +/// `BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER` around +/// to make backward-combiningness use the same bit in all +/// cases. +const NON_ROUND_TRIP_MARKER: u16 = 0b1; + +/// Marker that a complex decomposition starts with a starter +/// that can combine backwards. +const BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER: u16 = 0b10; + +/// Values above this are treated as a BMP character. +const HIGHEST_MARKER: u16 = NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER; /// Checks if a trie value carries a (non-zero) canonical /// combining class. @@ -398,6 +410,7 @@ impl CharacterAndTrieValue { pub fn can_combine_backwards(&self) -> bool { decomposition_starts_with_non_starter(self.trie_val) || self.trie_val == BACKWARD_COMBINING_STARTER_MARKER + || (((self.trie_val as u16) & !1) == BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER && (self.trie_val >> 16) != 0) // Combine with the previous condition when taking a data format break || in_inclusive_range32(self.trie_val, 0x1161, 0x11C2) } #[inline(always)] @@ -426,7 +439,7 @@ impl CharacterAndTrieValue { if lead == 0 { return true; } - if lead == NON_ROUND_TRIP_MARKER { + if lead <= HIGHEST_MARKER { return false; } if (trail_or_complex & 0x7F) == 0x3C @@ -830,14 +843,14 @@ where } else { let trail_or_complex = (decomposition >> 16) as u16; let lead = decomposition as u16; - if lead > NON_ROUND_TRIP_MARKER && trail_or_complex != 0 { + if lead > HIGHEST_MARKER && trail_or_complex != 0 { // Decomposition into two BMP characters: starter and non-starter let starter = char_from_u16(lead); let combining = char_from_u16(trail_or_complex); self.buffer .push(CharacterAndClass::new_with_placeholder(combining)); (starter, 0) - } else if lead > NON_ROUND_TRIP_MARKER { + } else if trail_or_complex == 0 { if lead != FDFA_MARKER { debug_assert_ne!( lead, SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16, diff --git a/components/normalizer/src/properties.rs b/components/normalizer/src/properties.rs index 58878ae5486..88c06ddf0e2 100644 --- a/components/normalizer/src/properties.rs +++ b/components/normalizer/src/properties.rs @@ -396,17 +396,7 @@ impl CanonicalDecompositionBorrowed<'_> { let offset24 = offset - tables.scalars16.len(); if let Some(first_c) = tables.scalars24.get(offset24) { if len == 1 { - if c != first_c { - return Decomposed::Singleton(first_c); - } else { - // Singleton representation used to avoid - // NFC passthrough of characters that combine - // with starters that can occur as the first - // character of an expansion decomposition. - // See section 5 of - // https://www.unicode.org/L2/L2024/24009-utc178-properties-recs.pdf - return Decomposed::Default; - } + return Decomposed::Singleton(first_c); } if let Some(second_c) = tables.scalars24.get(offset24 + 1) { return Decomposed::Expansion(first_c, second_c);