Skip to content

Commit

Permalink
ucd-util: fix canonicalization of 'isc'
Browse files Browse the repository at this point in the history
This commit fixes a bug where 'isc' was canonicalized to 'c'. 'isc' is an
alias for 'ISO_Comment', but the 'is' prefix was being dropped since
canonicalization permits ignoring 'is' prefixes when designating property
names.

This is the root cause of a bug in the regex library:
rust-lang/regex#466
  • Loading branch information
BurntSushi committed Apr 28, 2018
1 parent 7a5cd62 commit 523bd20
Showing 1 changed file with 15 additions and 1 deletion.
16 changes: 15 additions & 1 deletion ucd-util/src/name.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,10 @@ fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
// names/aliases had a particular structure (unlike character names), but
// we assume that it's ASCII only and drop anything that isn't ASCII.
let mut start = 0;
let mut starts_with_is = false;
if slice.len() >= 2 {
// Ignore any "is" prefix.
let starts_with_is =
starts_with_is =
slice[0..2] == b"is"[..]
|| slice[0..2] == b"IS"[..]
|| slice[0..2] == b"iS"[..]
Expand All @@ -121,6 +122,16 @@ fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
next_write += 1;
}
}
// Special case: ISO_Comment has an 'isc' abbreviation. Since we generally
// ignore 'is' prefixes, the 'isc' abbreviation gets caught in the crossfire
// and ends up creating an alias from 'c' to 'ISO_Comment', even though 'c'
// is actually an alias for the 'Other' general category.
if starts_with_is && next_write == 1 && slice[0] == b'c' {
slice[0] = b'i';
slice[1] = b's';
slice[2] = b'c';
next_write = 3;
}
&mut slice[..next_write]
}

Expand Down Expand Up @@ -162,6 +173,9 @@ mod tests {
assert_eq!(sym_norm("Greek"), "greek");
assert_eq!(sym_norm("isGreek"), "greek");
assert_eq!(sym_norm("IS_Greek"), "greek");
assert_eq!(sym_norm("isc"), "isc");
assert_eq!(sym_norm("is c"), "isc");
assert_eq!(sym_norm("is_c"), "isc");
}

#[test]
Expand Down

0 comments on commit 523bd20

Please sign in to comment.