Skip to content

Commit

Permalink
Adapt the collator to Unicode 16 normalization changes (#4878)
Browse files Browse the repository at this point in the history
This is the collator counterpart of #4860.

Co-authored-by: Robert Bastian <[email protected]>
  • Loading branch information
hsivonen and robertbastian authored Oct 28, 2024
1 parent c3da0ca commit 7cc22e8
Showing 1 changed file with 21 additions and 7 deletions.
28 changes: 21 additions & 7 deletions components/collator/src/elements.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,19 @@ use crate::provider::CollationDataV1;

/// Marker that a complex decomposition isn't round-trippable
/// under re-composition.
const NON_ROUND_TRIP_MARKER: u16 = 1;
///
/// TODO: When taking a data format break, swap this and
/// `BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER` around
/// to make backward-combiningness use the same bit in all
/// cases.
const NON_ROUND_TRIP_MARKER: u16 = 0b1;

/// Marker that a complex decomposition starts with a starter
/// that can combine backwards.
///
/// This is bit 1 (`0b10`), whereas `NON_ROUND_TRIP_MARKER` is bit 0
/// (`0b1`); `HIGHEST_MARKER` is the bitwise OR of the two.
///
/// NOTE(review): presumably both bits may be set on the same
/// decomposition value — confirm against the data provider.
const BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER: u16 = 0b10;

/// The largest marker value: the bitwise OR of `NON_ROUND_TRIP_MARKER`
/// and `BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER`, i.e. `0b11`.
///
/// Values above this are treated as a BMP character: the decomposition
/// code compares the low 16 bits (`lead`) with `lead > HIGHEST_MARKER`
/// before converting them to a `char`.
///
/// NOTE(review): this has the same numeric value as `FDFA_MARKER` (3).
/// Presumably the two are told apart by context (NFD vs. NFKD data) —
/// confirm.
const HIGHEST_MARKER: u16 = NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER;

/// Marker value for U+FDFA in NFKD
const FDFA_MARKER: u16 = 3;
Expand Down Expand Up @@ -1038,7 +1050,7 @@ where
} else {
let trail_or_complex = (decomposition >> 16) as u16;
let lead = decomposition as u16;
if lead > NON_ROUND_TRIP_MARKER && trail_or_complex != 0 {
if lead > HIGHEST_MARKER && trail_or_complex != 0 {
// Decomposition into two BMP characters: starter and non-starter
self.upcoming.push(
CharacterAndClassAndTrieValue::new_with_non_decomposing_starter(char_from_u16(
Expand All @@ -1052,7 +1064,7 @@ where
low_c, trie_value,
),
);
} else if lead > NON_ROUND_TRIP_MARKER {
} else if trail_or_complex == 0 {
debug_assert_ne!(
lead, FDFA_MARKER,
"How come U+FDFA NFKD marker seen in NFD?"
Expand Down Expand Up @@ -1334,7 +1346,7 @@ where
} else {
let trail_or_complex = (decomposition >> 16) as u16;
let lead = decomposition as u16;
if lead > NON_ROUND_TRIP_MARKER && trail_or_complex != 0 {
if lead > HIGHEST_MARKER && trail_or_complex != 0 {
// Decomposition into two BMP characters: starter and non-starter
c = char_from_u16(lead);
ce32 = data.ce32_for_char(c);
Expand Down Expand Up @@ -1397,9 +1409,11 @@ where
}
combining_characters
.push(CharacterAndClass::new_with_placeholder(combining));
} else if lead > NON_ROUND_TRIP_MARKER {
debug_assert_ne!(lead, 1, "How come U+FDFA NFKD marker seen in NFD?");
debug_assert_ne!(lead, 2, "How come non-starter marker seen here?");
} else if trail_or_complex == 0 {
debug_assert_ne!(
lead, FDFA_MARKER,
"How come U+FDFA NFKD marker seen in NFD?"
);
// Decomposition into one BMP character
c = char_from_u16(lead);
ce32 = data.ce32_for_char(c);
Expand Down

0 comments on commit 7cc22e8

Please sign in to comment.