Skip to content

Commit

Permalink
Allow all alphabetic and numeric characters in words
Browse files Browse the repository at this point in the history
Ensures that the new rules are strictly more permissive than the old ones.
  • Loading branch information
Jules-Bertholet committed Mar 15, 2024
1 parent 30ea379 commit ce59241
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 93 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ The definition of a word boundary is based on the
in Unicode Technical Standard 55. The rules are as follows:

- The set of characters that can be in a word is
[`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Unassigned}\p{Private_Use}-[\p{Punctuation}-\p{Other_Punctuation}]]`][1],
[`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Cn}\p{Co}\p{Alphabetic}\p{N}-[\p{P}-\p{Po}]]`][1],
plus U+05F3, U+05F4, and U+0F0B. This notably includes
alphabetic and numeric characters, accents and other combining marks,
emoji, a few mathematical symbols, a few non-word-separating punctuation marks,
unassigned characters, private-use characters, and the asterisk `*`.
unassigned characters, and private-use characters.

- Characters that cannot be in a word separate words.
For example, `foo_bar` is segmented `foo`|`bar`
Expand All @@ -43,7 +43,7 @@ in Unicode Technical Standard 55. The rules are as follows:
by some number of nonspacing marks (like accents or other diacritics)
is treated as if it was the letter alone. For example, `áB` is segmented `á`|`B`.

[1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BUnassigned%7D%5Cp%7BPrivate_Use%7D-%5B%5Cp%7BPunctuation%7D-%5Cp%7BOther_Punctuation%7D%5D%5D&abb=on
[1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BCn%7D%5Cp%7BCo%7D%5Cp%7BAlphabetic%7D%5Cp%7BN%7D-%5B%5Cp%7BP%7D-%5Cp%7BPo%7D%5D%5D&abb=on&g=&i=

## Cases contained in this library:

Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
//! in Unicode Technical Standard 55. The rules are as follows:
//!
//! - The set of characters that can be in a word is
//! [`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Unassigned}\p{Private_Use}-[\p{Punctuation}-\p{Other_Punctuation}]]`][1],
//! [`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Cn}\p{Co}\p{Alphabetic}\p{N}-[\p{P}-\p{Po}]]`][1],
//! plus U+05F3, U+05F4, and U+0F0B. This notably includes
//! alphabetic and numeric characters, accents and other combining marks,
//! emoji, a few mathematical symbols, a few non-word-separating punctuation marks,
Expand Down Expand Up @@ -41,7 +41,7 @@
//! by some number of nonspacing marks (like accents or other diacritics)
//! is treated as if it was the letter alone. For example, `áB` is segmented `á`|`B`.
//!
//! [1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BUnassigned%7D%5Cp%7BPrivate_Use%7D-%5B%5Cp%7BPunctuation%7D-%5Cp%7BOther_Punctuation%7D%5D%5D&abb=on
//! [1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BCn%7D%5Cp%7BCo%7D%5Cp%7BAlphabetic%7D%5Cp%7BN%7D-%5B%5Cp%7BP%7D-%5Cp%7BPo%7D%5D%5D&abb=on&g=&i=
//!
//! ### Cases contained in this library:
//!
Expand Down
161 changes: 75 additions & 86 deletions src/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ static ALLOWED_IN_WORD_ROOT: [u8; 2032] = [
0x2F, 0x00, 0x00, 0x00, 0x00, 0x30, 0x00, 0x31, 0x25, 0x00, 0x32, 0x00, 0x00, 0x33, 0x00, 0x34,
0x35, 0x36, 0x00, 0x37, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x39,
0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41,
0x43, 0x44, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41,
0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x45, 0x46, 0x41,
0x00, 0x00, 0x00, 0x47, 0x00, 0x48, 0x00, 0x00, 0x41, 0x49, 0x4A, 0x4B, 0x41, 0x41, 0x41, 0x4C,
0x4D, 0x00, 0x4E, 0x00, 0x00, 0x00, 0x4F, 0x50, 0x51, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41,
0x43, 0x44, 0x45, 0x00, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x46, 0x47, 0x41,
0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x48, 0x49, 0x41,
0x00, 0x00, 0x00, 0x4A, 0x00, 0x19, 0x00, 0x00, 0x4B, 0x4C, 0x4D, 0x4E, 0x41, 0x41, 0x41, 0x4F,
0x50, 0x00, 0x51, 0x00, 0x00, 0x00, 0x52, 0x53, 0x54, 0x55, 0x56, 0x41, 0x41, 0x41, 0x41, 0x41,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
Expand Down Expand Up @@ -63,8 +63,8 @@ static ALLOWED_IN_WORD_ROOT: [u8; 2032] = [
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x52, 0x53, 0x00, 0x00, 0x00, 0x00, 0x54, 0x55, 0x00, 0x56, 0x57, 0x00, 0x58, 0x00,
0x59, 0x5A, 0x00, 0x5B, 0x5C, 0x5D, 0x00, 0x5E, 0x00, 0x5F, 0x00, 0x60, 0x00, 0x61, 0x00, 0x62,
0x00, 0x00, 0x57, 0x58, 0x00, 0x00, 0x00, 0x00, 0x59, 0x5A, 0x00, 0x5B, 0x5C, 0x00, 0x5D, 0x00,
0x5E, 0x5F, 0x00, 0x60, 0x61, 0x62, 0x00, 0x63, 0x00, 0x64, 0x00, 0x65, 0x00, 0x66, 0x00, 0x67,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
Expand All @@ -84,22 +84,22 @@ static ALLOWED_IN_WORD_ROOT: [u8; 2032] = [
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x00, 0x64, 0x65,
0x00, 0x00, 0x00, 0x00, 0x36, 0x66, 0x00, 0x67, 0x68, 0x69, 0x00, 0x1F, 0x6A, 0x6B, 0x00, 0x6C,
0x00, 0x00, 0x00, 0x00, 0x6D, 0x6E, 0x6F, 0x70, 0x00, 0x00, 0x00, 0x71, 0x72, 0x00, 0x5D, 0x73,
0x00, 0x00, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x75, 0x76, 0x35, 0x77, 0x00, 0x78, 0x79, 0x00, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x00,
0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x82, 0x00, 0x83, 0x84, 0x85, 0x86,
0x00, 0x87, 0x35, 0x88, 0x00, 0x89, 0x00, 0x8A, 0x8B, 0x00, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x8C, 0x00, 0x8D, 0x00, 0x00, 0x00, 0x8E, 0x00, 0x8F, 0x90, 0x00, 0x80, 0x00, 0x00, 0x00,
0x26, 0x00, 0x00, 0x91, 0x00, 0x92, 0x00, 0x15, 0x1F, 0x93, 0x94, 0x00, 0x95, 0x00, 0x00, 0x00,
0x00, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x00, 0x98, 0x00, 0x99,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x68, 0x00, 0x69, 0x6A,
0x00, 0x00, 0x00, 0x00, 0x36, 0x6B, 0x00, 0x6C, 0x6D, 0x6E, 0x00, 0x1F, 0x6F, 0x70, 0x00, 0x71,
0x00, 0x00, 0x00, 0x00, 0x72, 0x73, 0x74, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x76,
0x00, 0x00, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x78, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, 0x00, 0x7A, 0x00, 0x7B, 0x73, 0x00, 0x7C, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7D, 0x00, 0x00, 0x7E, 0x7F, 0x00,
0x00, 0x80, 0x35, 0x81, 0x00, 0x82, 0x00, 0x83, 0x84, 0x00, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x85, 0x00, 0x86, 0x00, 0x00, 0x00, 0x87, 0x00, 0x88, 0x89, 0x00, 0x34, 0x00, 0x00, 0x00,
0x26, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x15, 0x1F, 0x8B, 0x8C, 0x00, 0x8D, 0x00, 0x00, 0x00,
0x00, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x00, 0x90, 0x00, 0x91,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x9A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
Expand All @@ -112,8 +112,8 @@ static ALLOWED_IN_WORD_ROOT: [u8; 2032] = [
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5C, 0x00, 0x9D, 0x1B, 0x9E, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x00, 0x00, 0x00, 0x00, 0x15,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x95, 0x96, 0x07, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, 0x00, 0x15,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
Expand All @@ -133,29 +133,29 @@ static ALLOWED_IN_WORD_ROOT: [u8; 2032] = [
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0xA0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x52, 0x41, 0xA1,
0x41, 0x41, 0x41, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0x41, 0xA7, 0x00, 0xA8, 0x41, 0xA9, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x57, 0x41, 0x99,
0x41, 0x41, 0x41, 0x46, 0x9A, 0x9B, 0x9C, 0x9D, 0x41, 0x9E, 0x00, 0x00, 0x41, 0x9F, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xAA, 0xAB, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00,
0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xA0, 0xA1, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0xAE, 0x00, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0xAF, 0xB0, 0x00, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00,
0xB2, 0x41, 0xB3, 0xB4, 0x41, 0x41, 0xB5, 0xB6, 0xB7, 0xB8, 0x00, 0x00, 0x41, 0x41, 0x41, 0x41,
0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xB9, 0x41, 0xBA, 0x41, 0xBB,
0xBC, 0xBD, 0xBE, 0x00, 0x41, 0x41, 0x41, 0x41, 0x41, 0xBF, 0xC0, 0xC1, 0x41, 0x41, 0xC2, 0xC3,
0x00, 0x00, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0xA4, 0x00, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00,
0xA6, 0x41, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0x00, 0x00, 0x41, 0x41, 0x41, 0x41,
0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xAF, 0x41, 0xB0, 0x41, 0xB1,
0xB2, 0xB3, 0xB4, 0x00, 0x41, 0x41, 0x41, 0x41, 0x41, 0xB5, 0xB6, 0xB7, 0x41, 0x41, 0xB8, 0x44,
];

static ALLOWED_IN_WORD_LEAVES: [u64; 196] = [
static ALLOWED_IN_WORD_LEAVES: [u64; 185] = [
0xFFFFFFFFFFFFFFFF,
0x03FF000000000000,
0x07FFFFFE07FFFFFE,
0x06AC040000000000,
0x76AC040000000000,
0xFF7FFFFFFF7FFFFF,
0x0000501F0003FFC3,
0xBFDFFFFFFFFFFFFF,
Expand All @@ -175,30 +175,30 @@ static ALLOWED_IN_WORD_LEAVES: [u64; 196] = [
0xFFFFFFFFFFFCFEFF,
0xFFFFFFFBFFFFFFFF,
0xFFFEFFCFFFFFFFFF,
0xD003FFFFFFFFFFFF,
0xD3F3FFFFFFFFFFFF,
0xFFFCFFFFFFFFFFFF,
0xFF02FFFFFFFFFFFF,
0xF800FFFFFFFFFFFF,
0x007FFFFFFFFFFFFF,
0xFFFEFFFFFFFFFFFF,
0xF807FFFFFFFFFFFF,
0x7F7FFFFFFFFFFFFF,
0xFFFFFFFFFFFFFFEF,
0xFC00FFFF80FF7FFF,
0xFDFFFFFFFFFF7FFF,
0xFFEFFFFFFFFFFFFF,
0x7FFFFFFFFFFFFFFF,
0xFFFFFFFFF3FF7FFF,
0xC2A003FF03000801,
0xC2AFFFFF03000801,
0x3FFFFFFFFFFFFFDF,
0xFFFFFFFFF8002040,
0xFFFFFFFFFFFF03FF,
0xFFFFFFFF3FFFFFFF,
0xF7FFFFFFFFFFFFFF,
0xE003FE00FFFFFFFF,
0xFFFFFE00FFFFFFFF,
0xFFFFFFFFFC00FFFF,
0xFFFFFFFFFFFFFFFE,
0xFFFF9FFFFFFFFFFF,
0xFFFFFFFFE7FFFFFE,
0xFFFFC7FFFFFFFFFF,
0xFF9FFFFFFFFFFFFF,
0xFC00FFFFF08FFFFF,
0xFFFFFFFFF08FFFFF,
0xFFFFFFFFFFFFB800,
0xFFFFFFFFFFFFFFCE,
0x000000003FFFFFFF,
Expand All @@ -215,33 +215,38 @@ static ALLOWED_IN_WORD_LEAVES: [u64; 196] = [
0x00000000FFFF9FFF,
0xFFFFFFE21FFFFFFE,
0xF3FFFD503F2FFC84,
0xFFFFFFFF000043E0,
0x000000000000F1FF,
0xFFFFFFFFFFFF43E0,
0x000000000000F3FF,
0x0000000000000000,
0x0000000040000084,
0xFFFFFF8000000000,
0x00000000FFFFF800,
0xFFFFFFFFFFFFF800,
0xFFC000000FFFFFFF,
0xFFC0000000000000,
0x00000000000FFFFF,
0x0030000000000000,
0x0000000000400000,
0x01FFF81FFFFFFFFF,
0xFFFEFFFFFFFFFFFF,
0x21FFF81FFFFFFFFF,
0x0000800000000000,
0xFFFFFFFFC0000000,
0x0000000004000000,
0xFFF0000000000000,
0x0000FFFFFFC00000,
0x1F3EFFFE000000E0,
0xFFFFFFFEFFFFFFFF,
0xFFFFFFFF0000FFFF,
0xFFFFFFFF003CFFFF,
0xFFFF7FF000000000,
0x0000000080000000,
0x000003FF80000000,
0x00000000FFFEFF00,
0xFFFE0000000003FF,
0x000000000000FFFF,
0x3FFFFFFFFFFFFF80,
0xFFFFFFFFFFFF1FFF,
0xBFF0FFFFFFFFFFFF,
0xFF03FFFFFFFFFFFF,
0xFFFFFFFCFF800000,
0xFFFFFFFFFFFFF9FF,
0xFC00F0FFFFFFFFFF,
0xFC3FF0FFFFFFFFFF,
0xFF0FFFFFFFFFFFFF,
0xE8FFFFFFFFFF3FFF,
0xFFFF3FFFFFFFFFFF,
Expand All @@ -261,79 +266,64 @@ static ALLOWED_IN_WORD_LEAVES: [u64; 196] = [
0x07FFFFFE03FF0001,
0xFFFFFFE007FFFFFE,
0xC1FF8080FFFFFFFF,
0x0070000000000078,
0x001FFFFFFFFFFFFF,
0xFFFFFFFEE0008000,
0x007FFFFFFFFFFFF8,
0x01FFFFFFFFFFFFFF,
0xFFFFFFFEE0008C00,
0xE00000000000FFFF,
0xF0000001FFFFFFFF,
0xFFFFFFF0FFFFFFFF,
0xFFFFFFFFFFFEFFFF,
0xFFFF7FFFFFFFFFFF,
0x007FFFFF007FFFFF,
0xFFFF007FFFFFFFFF,
0x7FFFFFFF703FFFFF,
0xCFFFFFFFFFFFFFFF,
0x0000000000030000,
0x1FFFFFFFFE00FE00,
0xFFFFFFFF1FFFFFFF,
0xFF8007FFFFFFFEFF,
0x01FFFFFFFFFFFFFF,
0x00FFFFFF00FFFFFF,
0xFFFF01FFE1FFFFFF,
0x03FFFFFFFFFFFFFF,
0x80000000FFFFFFFF,
0xFE7FFFFFFF7FFFFF,
0x7FFFFFFF7FFFFFFF,
0x7FFFFFFFFE00FFFF,
0xFF80FFFFFFFFFEFF,
0xFFFFFFFFE1FFFFFF,
0xFFFFDFFFFFFFFFFF,
0xFFFFFF801FFFFFFF,
0xFFFFFFFFFC01FFFF,
0xFFFFFFFFFC1FFFFF,
0xFFFFFFFFFFFFFC3F,
0xFFFFFFFFFFFFF01F,
0xFFFFFFC00003C07F,
0xFFFFFFFFFFFFC07F,
0xFFFFFFFFFFFFDFFC,
0xFFCFFFFFFFFFFFF0,
0xFFE0000117FFDE1F,
0xFFFFFFFF17FFDE1F,
0xC0FFFFFFFFFFFFFF,
0xFFFFFFFFD3FF07FF,
0xFFFFFFFFFFFFFFBF,
0xFFFFFFFFFF000001,
0xFFFFE000FFFFFFF1,
0xFDFFFFFFFFFFFFFF,
0xFFF803FFFFFFFFFF,
0xFFFFFFFFFFFFFF8F,
0xFFFFFFFFFFFFFF80,
0xFFFFFFF823FFFFFF,
0xFFFFFFFFFFFFFC00,
0xFFFCE00003FFFFC1,
0xFFFCFFFFFFFFFFC1,
0xFE7FFFFFFFFFFFFF,
0xFFFFFFFFFFFF0007,
0x7FFC000000000000,
0x7FFC0000001FFFFF,
0xFFE0FFFFFFFFFFFF,
0xFFF9FFFFFFFFFFFF,
0x0000FFFFFFFFFFFF,
0xFFDFFFFFFFFFFFFF,
0xFFFFFFFC07FFFFCF,
0xFFFFFFFFF8000000,
0x007FFFFFFFFFFFFF,
0xFFFFFFFFF87FFFFF,
0xFFFFFFF06FFFFFFF,
0xFFFFFFFFFFFFFFF0,
0xFFC0000000000000,
0x0000018000000000,
0xF807E3E000000000,
0x00003C0000000FE7,
0xFFFFF80000000000,
0xFFFFFFFFFFFFFFDC,
0xFFF00000FFF00000,
0xFE000000FF800000,
0xFFFFFFFFFF800000,
0xF87FFFFFFFFFFFFF,
0x00201FFFFFFFFFFF,
0xFFFFFFFFFFFFF010,
0xFFFFFFFFFFFF7FFF,
0xFFFFFFFFFFFF007F,
0x0001FFFFFFFFFFFF,
0xFFE0000000000000,
0xC000000000000001,
0xFFFEEFFFFFFFFFFF,
0xFFFFBFFFFFFFFFFF,
0x0000F00000000000,
0x00018000FFF00000,
0xFFC0000000010001,
0xFFFFC00000000000,
0xFFFF000000001FFF,
0xFFFF03FFFFFF03FF,
0xFFFFC000000003FF,
0x0000003FFFFFFFFF,
0xF00000000000FFF8,
0xFFFFFFC0FFFCFE00,
Expand All @@ -347,7 +337,6 @@ static ALLOWED_IN_WORD_LEAVES: [u64; 196] = [
0x400000000000FE00,
0xFE00FE00F0003FC0,
0x0000000000080000,
0xFFFFFFFFFFFFF800,
];

#[cfg(test)]
Expand Down
6 changes: 4 additions & 2 deletions tables/src/allowed_in_word.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,20 @@ fn unassigned_private_use(data: &DataFiles) -> CodepointBitArr {
}

/// `true` for all codepoints that can be part of a word:
/// `[\p{Unassigned}\p{Private_Use}\p{ID_Continue}\p{ID_Compat_Math_Continue}-[\p{Punctuation}-\p{Other_Punctuation}]]`,
/// `[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Cn}\p{Co}\p{Alphabetic}\p{N}-[\p{P}-\p{Po}]]`,
/// plus the extra characters listed below.
pub fn allowed_in_word(data: &DataFiles) -> CodepointBitArr {
let mut word_component = unassigned_private_use(data);

set_by_prop(
&mut word_component,
&data.derived_core_properties,
"ID_Continue",
"ID_Continue|Alphabetic",
true,
);

set_by_general_category(&mut word_component, data, "Nd|Nl|No", true);

set_by_prop(
&mut word_component,
&data.prop_list,
Expand Down

0 comments on commit ce59241

Please sign in to comment.