From b8bed5908d62d7ced790cf6b711da6998a0a3da4 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Thu, 14 Mar 2024 20:28:22 -0400 Subject: [PATCH 1/5] Expand the set of characters allowed in words Before this commit, words could only contain alphabetic and numeric characters. This is unduly limiting; for example, combining marks (like accents) are not included. This commit expands the set of allowed characters based on the recommendations of UAX 31 <https://www.unicode.org/reports/tr31/>, with a bias toward allowing more characters (though emoji are excluded). In addition, unassigned and private use characters are assumed to be allowed in words (ensuring that case conversion will pass them through unchanged). This change means we can no longer rely only on the Unicode data tables shipped with the standard library. A new `tables` binary crate is in charge of generating the tables we need (which consumes 3600 bytes of data). --- .github/workflows/rust.yml | 7 +- Cargo.toml | 3 + src/lib.rs | 4 +- src/tables.rs | 362 ++++++++++++++++++++++++++++++++++ tables/Cargo.toml | 12 ++ tables/src/allowed_in_word.rs | 219 ++++++++++++++++++++ tables/src/main.rs | 37 ++++ tables/src/unicode_data.rs | 93 +++++++++ 8 files changed, 734 insertions(+), 3 deletions(-) create mode 100644 src/tables.rs create mode 100644 tables/Cargo.toml create mode 100644 tables/src/allowed_in_word.rs create mode 100644 tables/src/main.rs create mode 100644 tables/src/unicode_data.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 33ec4eb..f11e809 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -33,13 +33,16 @@ jobs: uses: actions-rs/cargo@v1 with: command: test + args: --all - name: Check formatting uses: actions-rs/cargo@v1 with: command: fmt - args: -- --check + args: --all --check - name: Catch common mistakes uses: actions-rs/cargo@v1 with: command: clippy - args: --all-targets -- -D warnings + args: --all-targets --workspace -- -D warnings + - name: Regenerate Unicode tables + 
run: mv src/tables.rs tables.rs.bak && cd tables && cargo run && cd .. && diff src/tables.rs tables.rs.bak diff --git a/Cargo.toml b/Cargo.toml index a9fa2c3..adfea40 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,3 +9,6 @@ repository = "https://github.com/withoutboats/heck" keywords = ["string", "case", "camel", "snake", "unicode"] categories = ["no-std"] include = ["src/**/*", "LICENSE-*", "README.md", "CHANGELOG.md"] + +[workspace] +members = ["tables"] diff --git a/src/lib.rs b/src/lib.rs index ab8a015..4aea701 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -47,6 +47,7 @@ mod lower_camel; mod shouty_kebab; mod shouty_snake; mod snake; +mod tables; mod title; mod train; mod upper_camel; @@ -58,6 +59,7 @@ pub use shouty_snake::{ AsShoutySnakeCase, AsShoutySnakeCase as AsShoutySnekCase, ToShoutySnakeCase, ToShoutySnekCase, }; pub use snake::{AsSnakeCase, AsSnakeCase as AsSnekCase, ToSnakeCase, ToSnekCase}; +pub use tables::UNICODE_VERSION; pub use title::{AsTitleCase, ToTitleCase}; pub use train::{AsTrainCase, ToTrainCase}; pub use upper_camel::{ @@ -98,7 +100,7 @@ where let mut first_word = true; - for word in s.split(|c: char| !c.is_alphanumeric()) { + for word in s.split(|c: char| !tables::allowed_in_word(c)) { let mut char_indices = word.char_indices().peekable(); let mut init = 0; let mut mode = WordMode::Boundary; diff --git a/src/tables.rs b/src/tables.rs new file mode 100644 index 0000000..0daf24d --- /dev/null +++ b/src/tables.rs @@ -0,0 +1,362 @@ +//! Automatically generated by `tables`. +//! Do not edit manually. + +/// The version of Unicode that the data included with this crate is based on. +/// +/// This crate also relies on Unicode data provided by the standard library; +/// that data is versioned according to [`char::UNICODE_VERSION`]. +pub const UNICODE_VERSION: (u8, u8, u8) = (15, 1, 0); + +/// Whether this character can be part of a word. 
+pub fn allowed_in_word(c: char) -> bool { + const BOTTOM_BITS_MASK: u32 = !((!0_u32) << 6); + let cp: u32 = c.into(); + let top_bits = cp >> 6; + if top_bits < 0x7F0 { + let leaf_idx: u8 = ALLOWED_IN_WORD_ROOT[usize::try_from(top_bits).unwrap()]; + let leaf = ALLOWED_IN_WORD_LEAVES[usize::from(leaf_idx)]; + (leaf >> (cp & BOTTOM_BITS_MASK)) & 1 == 1 + } else { + !matches!(cp, 0x0E0001 | 0x0E0020..=0x0E007F) + } +} + +static ALLOWED_IN_WORD_ROOT: [u8; 2032] = [ + 0x01, 0x02, 0x03, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x06, 0x07, 0x08, + 0x00, 0x00, 0x09, 0x00, 0x00, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00, 0x0F, 0x10, 0x00, 0x00, 0x11, + 0x12, 0x13, 0x14, 0x15, 0x00, 0x16, 0x00, 0x17, 0x00, 0x08, 0x00, 0x18, 0x00, 0x19, 0x00, 0x1A, + 0x00, 0x1B, 0x1C, 0x00, 0x00, 0x1D, 0x00, 0x1E, 0x1F, 0x20, 0x00, 0x00, 0x21, 0x00, 0x22, 0x23, + 0x00, 0x24, 0x25, 0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x27, 0x28, 0x00, + 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2A, 0x2B, 0x2C, 0x2D, 0x00, 0x00, 0x2E, + 0x2F, 0x00, 0x00, 0x00, 0x00, 0x30, 0x00, 0x31, 0x25, 0x00, 0x32, 0x00, 0x00, 0x33, 0x00, 0x34, + 0x35, 0x36, 0x00, 0x37, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x39, + 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x43, 0x44, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x45, 0x46, 0x41, + 0x00, 0x00, 0x00, 0x47, 0x00, 0x48, 0x00, 0x00, 0x41, 0x49, 0x4A, 0x4B, 0x41, 0x41, 0x41, 0x4C, + 0x4D, 0x00, 0x4E, 0x00, 0x00, 0x00, 0x4F, 0x50, 0x51, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x52, 0x53, 0x00, 0x00, 0x00, 0x00, 0x54, 0x55, 0x00, 0x56, 0x57, 0x00, 0x58, 0x00, + 0x59, 0x5A, 0x00, 0x5B, 0x5C, 0x5D, 0x00, 0x5E, 0x00, 0x5F, 0x00, 0x60, 0x00, 0x61, 0x00, 0x62, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x00, 0x64, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x36, 0x66, 0x00, 0x67, 0x68, 0x69, 0x00, 0x1F, 0x6A, 0x6B, 0x00, 0x6C, + 0x00, 0x00, 0x00, 0x00, 0x6D, 0x6E, 0x6F, 0x70, 0x00, 0x00, 0x00, 0x71, 0x72, 0x00, 0x5D, 0x73, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x75, 0x76, 0x35, 0x77, 0x00, 0x78, 0x79, 0x00, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x00, + 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x82, 0x00, 0x83, 0x84, 0x85, 0x86, + 0x00, 0x87, 0x35, 0x88, 0x00, 0x89, 0x00, 0x8A, 0x8B, 0x00, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x8C, 0x00, 0x8D, 0x00, 0x00, 0x00, 0x8E, 0x00, 0x8F, 0x90, 0x00, 0x80, 0x00, 0x00, 0x00, + 0x26, 0x00, 0x00, 0x91, 0x00, 0x92, 0x00, 0x15, 0x1F, 0x93, 0x94, 0x00, 0x95, 0x00, 0x00, 0x00, + 0x00, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x00, 0x98, 0x00, 0x99, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x9A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, + 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5C, 0x00, 0x9D, 0x1B, 0x9E, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x00, 0x00, 0x00, 0x00, 0x15, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xA0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x52, 0x41, 0xA1, + 0x41, 0x41, 0x41, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0x41, 0xA7, 0x00, 0xA8, 0x41, 0xA9, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xAA, 0xAB, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0xAE, 0x00, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0xAF, 0xB0, 0x00, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, + 0xB2, 0x41, 0xB3, 0xB4, 0x41, 0x41, 0xB5, 0xB6, 0xB7, 0xB8, 0x00, 0x00, 0x41, 0x41, 0x41, 0x41, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xB9, 0x41, 0xBA, 0x41, 0xBB, + 0xBC, 0xBD, 0xBE, 0x00, 0x41, 0x41, 0x41, 0x41, 0x41, 0xBF, 0xC0, 0xC1, 0x41, 0x41, 0xC2, 0xC3, +]; + +static ALLOWED_IN_WORD_LEAVES: [u64; 196] = [ + 0xFFFFFFFFFFFFFFFF, + 0x03FF000000000000, + 0x07FFFFFE07FFFFFE, + 0x06AC040000000000, + 0xFF7FFFFFFF7FFFFF, + 0x0000501F0003FFC3, + 0xBFDFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFCF, + 0xFFBFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFCFB, + 0xFFFFFFFF03FFFFFF, + 0xBFFFFFFFFFFF19FF, + 0xFFFFFFFFFFFFFFB6, + 0xFFFFFFFF07FF0000, + 0xFFFFC3FFFFFFFFFF, + 0x9FFFFDFF9FEFFFFF, + 0xFFFFFFFFFFFF4000, + 0x3C3FFFFFFFFFFFFF, + 0x8000FFFFFFFFFFFF, + 0xFFFFFFFFBFFFFFFF, + 0xFFFFFFFFFFFCFEFF, + 0xFFFFFFFBFFFFFFFF, + 0xFFFEFFCFFFFFFFFF, + 0xD003FFFFFFFFFFFF, + 0xFFFCFFFFFFFFFFFF, + 0xFF02FFFFFFFFFFFF, + 0xF800FFFFFFFFFFFF, + 0x007FFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFEF, + 0xFC00FFFF80FF7FFF, + 0xFFEFFFFFFFFFFFFF, + 0x7FFFFFFFFFFFFFFF, + 0xFFFFFFFFF3FF7FFF, + 0xC2A003FF03000801, + 0x3FFFFFFFFFFFFFDF, + 0xFFFFFFFFF8002040, + 0xFFFFFFFFFFFF03FF, + 0xFFFFFFFF3FFFFFFF, + 0xF7FFFFFFFFFFFFFF, + 0xE003FE00FFFFFFFF, + 0xFFFFFFFFFC00FFFF, + 0xFFFFFFFFFFFFFFFE, + 0xFFFF9FFFFFFFFFFF, + 0xFFFFFFFFE7FFFFFE, + 0xFFFFC7FFFFFFFFFF, + 0xFF9FFFFFFFFFFFFF, + 
0xFC00FFFFF08FFFFF, + 0xFFFFFFFFFFFFB800, + 0xFFFFFFFFFFFFFFCE, + 0x000000003FFFFFFF, + 0xBFFFC080FFFFFFFF, + 0x800FF80003FFFFFF, + 0x0FFFFFFFFFFFFFFF, + 0x07FFFFFFFFFFFFFF, + 0x3FFFFFFFFFFFFFFF, + 0xFFFFFFFFFFF7FF00, + 0x5FFFFFFFFFFFFFFF, + 0x9FFF1FFF1FFF1FFC, + 0x0000000000003000, + 0x9FFF002000000000, + 0x00000000FFFF9FFF, + 0xFFFFFFE21FFFFFFE, + 0xF3FFFD503F2FFC84, + 0xFFFFFFFF000043E0, + 0x000000000000F1FF, + 0x0000000000000000, + 0x0000000040000084, + 0xFFFFFF8000000000, + 0x00000000FFFFF800, + 0x0030000000000000, + 0x0000000000400000, + 0x01FFF81FFFFFFFFF, + 0xFFFEFFFFFFFFFFFF, + 0xFFFFFFFFC0000000, + 0x0000000004000000, + 0xFFF0000000000000, + 0x0000FFFFFFC00000, + 0x1F3EFFFE000000E0, + 0xFFFFFFFEFFFFFFFF, + 0xFFFFFFFF0000FFFF, + 0xFFFF7FF000000000, + 0x0000000080000000, + 0x000000000000FFFF, + 0x3FFFFFFFFFFFFF80, + 0xFFFFFFFFFFFF1FFF, + 0xBFF0FFFFFFFFFFFF, + 0xFF03FFFFFFFFFFFF, + 0xFFFFFFFCFF800000, + 0xFFFFFFFFFFFFF9FF, + 0xFC00F0FFFFFFFFFF, + 0xFF0FFFFFFFFFFFFF, + 0xE8FFFFFFFFFF3FFF, + 0xFFFF3FFFFFFFFFFF, + 0xFFFFFFFF7FFFFFFF, + 0xFFFFFFFF3FFFC001, + 0xFC7FFFFF0FFFFFFF, + 0xFFFCFFFF3FFFFFFF, + 0xFFFFF3FFF7FFFFFF, + 0xFFFFF7FFFFFFFFFF, + 0xFFFFFDFFFFFFFFFF, + 0x0003FFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFF8, + 0xFFFFFFFFFFFF0000, + 0x0FFFFFFFFFFF7FFF, + 0x0000FFFFFC00FFFF, + 0xFFFFF08000080000, + 0x07FFFFFE03FF0001, + 0xFFFFFFE007FFFFFE, + 0xC1FF8080FFFFFFFF, + 0x0070000000000078, + 0x001FFFFFFFFFFFFF, + 0xFFFFFFFEE0008000, + 0xE00000000000FFFF, + 0xF0000001FFFFFFFF, + 0xFFFFFFF0FFFFFFFF, + 0xFFFFFFFFFFFEFFFF, + 0xFFFF7FFFFFFFFFFF, + 0x007FFFFF007FFFFF, + 0xFFFF007FFFFFFFFF, + 0x7FFFFFFF703FFFFF, + 0xCFFFFFFFFFFFFFFF, + 0x0000000000030000, + 0x1FFFFFFFFE00FE00, + 0xFFFFFFFF1FFFFFFF, + 0xFF8007FFFFFFFEFF, + 0x01FFFFFFFFFFFFFF, + 0x00FFFFFF00FFFFFF, + 0xFFFF01FFE1FFFFFF, + 0x03FFFFFFFFFFFFFF, + 0x80000000FFFFFFFF, + 0xFFFFDFFFFFFFFFFF, + 0xFFFFFF801FFFFFFF, + 0xFFFFFFFFFC01FFFF, + 0xFFFFFFFFFFFFFC3F, + 0xFFFFFFFFFFFFF01F, + 0xFFFFFFC00003C07F, + 0xFFFFFFFFFFFFDFFC, 
+ 0xFFCFFFFFFFFFFFF0, + 0xFFE0000117FFDE1F, + 0xC0FFFFFFFFFFFFFF, + 0xFFFFFFFFD3FF07FF, + 0xFFFFFFFFFFFFFFBF, + 0xFFFFFFFFFF000001, + 0xFFFFE000FFFFFFF1, + 0xFDFFFFFFFFFFFFFF, + 0xFFF803FFFFFFFFFF, + 0xFFFFFFFFFFFFFF8F, + 0xFFFFFFFFFFFFFF80, + 0xFFFFFFF823FFFFFF, + 0xFFFFFFFFFFFFFC00, + 0xFFFCE00003FFFFC1, + 0xFE7FFFFFFFFFFFFF, + 0xFFFFFFFFFFFF0007, + 0x7FFC000000000000, + 0xFFE0FFFFFFFFFFFF, + 0xFFF9FFFFFFFFFFFF, + 0x0000FFFFFFFFFFFF, + 0xFFDFFFFFFFFFFFFF, + 0xFFFFFFFC07FFFFCF, + 0xFFFFFFFFF8000000, + 0xFFFFFFF06FFFFFFF, + 0xFFFFFFFFFFFFFFF0, + 0xFFC0000000000000, + 0x0000018000000000, + 0xF807E3E000000000, + 0x00003C0000000FE7, + 0xFFFFF80000000000, + 0xFFFFFFFFFFFFFFDC, + 0xFFF00000FFF00000, + 0xFE000000FF800000, + 0xF87FFFFFFFFFFFFF, + 0x00201FFFFFFFFFFF, + 0xFFFFFFFFFFFFF010, + 0xFFFFFFFFFFFF7FFF, + 0xFFFFFFFFFFFF007F, + 0x0001FFFFFFFFFFFF, + 0xFFE0000000000000, + 0xC000000000000001, + 0x0000F00000000000, + 0x00018000FFF00000, + 0xFFC0000000010001, + 0xFFFFC00000000000, + 0x0000003FFFFFFFFF, + 0xF00000000000FFF8, + 0xFFFFFFC0FFFCFE00, + 0xE000E0000F000000, + 0x0780000000000000, + 0xFFFEF000FC000000, + 0x000000000000F000, + 0x00000000FC00FF00, + 0xFFFCC0000000FF00, + 0xE000C000FFF00000, + 0x400000000000FE00, + 0xFE00FE00F0003FC0, + 0x0000000000080000, + 0xFFFFFFFFFFFFF800, +]; + +#[cfg(test)] +#[test] +fn test_allowed_in_words_casing_closure() { + for c in '\0'..=char::MAX { + if allowed_in_word(c) { + assert!(c.to_uppercase().all(allowed_in_word)); + assert!(c.to_lowercase().all(allowed_in_word)); + } + } +} diff --git a/tables/Cargo.toml b/tables/Cargo.toml new file mode 100644 index 0000000..8897a01 --- /dev/null +++ b/tables/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "tables" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bitvec = "1.0.1" +regex = "1.10.3" +reqwest = { version = "0.11", features = ["blocking"] } +rustc-hash = "1.1.0" diff --git 
a/tables/src/allowed_in_word.rs b/tables/src/allowed_in_word.rs new file mode 100644 index 0000000..3c80be6 --- /dev/null +++ b/tables/src/allowed_in_word.rs @@ -0,0 +1,219 @@ +//! Construct a lookup table to find whether a particular character is allowed in words. + +use std::{ + any::type_name, + collections::hash_map, + error::Error, + io::{self, Read}, + mem::size_of, +}; + +use bitvec::prelude::*; +use rustc_hash::FxHashMap; + +use crate::unicode_data::{set_by_general_category, set_by_prop, CodepointBitArr, DataFiles}; + +/// Change this to u128 for wider leaves +type LeafElement = u64; + +const ENTRIES_PER_LEAF: usize = LeafElement::BITS as usize; + +/// `true` for all punctuation other than `Other_Punctuation` +/// (`[\p{Punctuation}-\p{Other_Punctuation}]`) +fn punctuation_non_other(data: &DataFiles) -> CodepointBitArr { + let mut arr = BitArray::ZERO; + set_by_general_category(&mut arr, data, "Pc|Pd|Ps|Pe|Pi|Pf", true); + arr +} + +/// `true` for all unassigned and private use characters +fn unassigned_private_use(data: &DataFiles) -> CodepointBitArr { + let mut arr = BitArray::ZERO; + set_by_general_category(&mut arr, data, "[A-Za-z]+", true); + set_by_general_category(&mut arr, data, "Cn|Co", false); + !arr +} + +/// `true` for all codepoints that can be part of a word: +/// `[\p{Unassigned}\p{Private_Use}\p{ID_Continue}\p{ID_Compat_Math_Continue}-[\p{Punctuation}-\p{Other_Punctuation}]]`, +/// plus the extra characters listed below. +pub fn allowed_in_word(data: &DataFiles) -> CodepointBitArr { + let mut word_component = unassigned_private_use(data); + + set_by_prop( + &mut word_component, + &data.derived_core_properties, + "ID_Continue", + true, + ); + + set_by_prop( + &mut word_component, + &data.prop_list, + "ID_Compat_Math_Continue", + true, + ); + + // Choose from characters in https://www.unicode.org/reports/tr31/#Specific_Character_Adjustments + // that are not Punctuation other than Other_Punctuation + // (U+00B7 is already in ID_Continue). 
+ for cp in [ + 0x05F3, // HEBREW PUNCTUATION GERESH https://en.wikipedia.org/wiki/Geresh + 0x05F4, // HEBREW PUNCTUATION GERSHAYIM https://en.wikipedia.org/wiki/Gershayim + 0x0F0B, // TIBETAN MARK INTERSYLLABIC TSHEG https://w3c.github.io/tlreq/#language_overview + ] { + word_component.set(cp, true); + } + + word_component &= !punctuation_non_other(data); + + word_component +} + +fn build_tree(allowed_in_word: &BitSlice) -> (Vec<u8>, Vec<LeafElement>) { + let mut chunk_to_leaf_idx_map: FxHashMap<LeafElement, u8> = FxHashMap::from_iter([(!0, 0)]); + let mut root = Vec::with_capacity(allowed_in_word.len().div_ceil(ENTRIES_PER_LEAF)); + let mut leaves = vec![!0]; + let chunks_iter = allowed_in_word.chunks_exact(ENTRIES_PER_LEAF); + assert!(chunks_iter.remainder().is_empty()); + let mut chunks_iter = chunks_iter.map(|l| { + LeafElement::from_le_bytes( + l.bytes() + .collect::<Result<Vec<u8>, _>>() + .unwrap() + .try_into() + .unwrap(), + ) + }); + for chunk in &mut chunks_iter { + match chunk_to_leaf_idx_map.entry(chunk) { + hash_map::Entry::Occupied(o) => { + root.push(*o.get()); + } + hash_map::Entry::Vacant(v) => { + let new_index = u8::try_from(leaves.len()).expect("too many leaves"); + v.insert(new_index); + root.push(new_index); + leaves.push(chunk); + } + } + } + (root, leaves) +} + +fn list_of_ranges(cps: impl Iterator<Item = u32>) -> Vec<(u32, u32)> { + let mut vec = Vec::new(); + for cp in cps { + if let Some((_, prev)) = vec.last_mut() { + if *prev + 1 == cp { + *prev = cp; + continue; + } + } + vec.push((cp, cp)) + } + vec +} + +pub fn write_table( + out: &mut impl io::Write, + allowed_in_word: &CodepointBitArr, +) -> Result<(), Box<dyn Error>> { + let bits_to_shift = ENTRIES_PER_LEAF.ilog2(); + + let first_cp_not_in_tree = + (allowed_in_word[..0x40000].last_zero().unwrap() + 1).next_multiple_of(ENTRIES_PER_LEAF); + + let first_cp_not_in_tree_shifted = first_cp_not_in_tree >> bits_to_shift; + + writeln!( + out, + "/// Whether this character can be part of a word. 
+pub fn allowed_in_word(c: char) -> bool {{ + const BOTTOM_BITS_MASK: u32 = !((!0_u32) << {bits_to_shift}); + let cp: u32 = c.into(); + let top_bits = cp >> {bits_to_shift}; + if top_bits < 0x{first_cp_not_in_tree_shifted:X} {{ + let leaf_idx: u8 = ALLOWED_IN_WORD_ROOT[usize::try_from(top_bits).unwrap()]; + let leaf = ALLOWED_IN_WORD_LEAVES[usize::from(leaf_idx)]; + (leaf >> (cp & BOTTOM_BITS_MASK)) & 1 == 1 + }} else {{" + )?; + + let mut late_zeros = list_of_ranges( + allowed_in_word[first_cp_not_in_tree..] + .iter_zeros() + .map(|n| u32::try_from(n + first_cp_not_in_tree).unwrap()), + ) + .into_iter(); + if let Some(first_late_zero) = late_zeros.next() { + write!(out, " !matches!(cp, 0x{:06X}", first_late_zero.0)?; + if first_late_zero.0 != first_late_zero.1 { + write!(out, "..=0x{:06X}", first_late_zero.1)?; + } + for late_zero in late_zeros { + write!(out, " | 0x{:06X}", late_zero.0)?; + if late_zero.0 != late_zero.1 { + write!(out, "..=0x{:06X}", late_zero.1)?; + } + } + writeln!(out, ")")?; + } else { + writeln!(out, "true")?; + } + + writeln!( + out, + " }} +}}", + )?; + + let (root, leaves) = build_tree(&allowed_in_word[..first_cp_not_in_tree]); + eprintln!( + "allowed_in_words: {} bytes of static data", + root.len() + leaves.len() * size_of::<LeafElement>() + ); + + write!( + out, + "\nstatic ALLOWED_IN_WORD_ROOT: [u8; {}] = [", + root.len() + )?; + + for line in root.chunks(16) { + write!(out, "\n ")?; + for byte in line { + write!(out, " 0x{byte:02X},")?; + } + } + + writeln!( + out, + "\n]; + +static ALLOWED_IN_WORD_LEAVES: [{}; {}] = [", + type_name::<LeafElement>(), + leaves.len() + )?; + + for leaf in leaves { + writeln!(out, " 0x{leaf:016X},")?; + } + writeln!( + out, + "]; + +#[cfg(test)] +#[test] +fn test_allowed_in_words_casing_closure() {{ + for c in '\\0'..=char::MAX {{ + if allowed_in_word(c) {{ + assert!(c.to_uppercase().all(allowed_in_word)); + assert!(c.to_lowercase().all(allowed_in_word)); + }} + }} +}}" + )?; + + Ok(()) +} diff --git a/tables/src/main.rs 
b/tables/src/main.rs new file mode 100644 index 0000000..2932812 --- /dev/null +++ b/tables/src/main.rs @@ -0,0 +1,37 @@ +use std::{fs::OpenOptions, io::Write}; + +mod unicode_data; +use unicode_data::data_files; + +/// Update this on new Unicode releases +const UNICODE_VERSION: (u8, u8, u8) = (15, 1, 0); + +mod allowed_in_word; + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let data = data_files()?; + + let mut out = OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open("../src/tables.rs")?; + + writeln!( + &mut out, + "//! Automatically generated by `tables`. +//! Do not edit manually. + +/// The version of Unicode that the data included with this crate is based on. +/// +/// This crate also relies on Unicode data provided by the standard library; +/// that data is versioned according to [`char::UNICODE_VERSION`]. +pub const UNICODE_VERSION: (u8, u8, u8) = {UNICODE_VERSION:?}; +" + )?; + + let allowed_in_word = allowed_in_word::allowed_in_word(&data); + allowed_in_word::write_table(&mut out, &allowed_in_word)?; + + Ok(()) +} diff --git a/tables/src/unicode_data.rs b/tables/src/unicode_data.rs new file mode 100644 index 0000000..d69f064 --- /dev/null +++ b/tables/src/unicode_data.rs @@ -0,0 +1,93 @@ +use bitvec::BitArr; +use regex::Regex; + +use crate::UNICODE_VERSION; + +pub type CodepointBitArr = BitArr!(for 0x110000); + +/// Download the specified Unicode data file from the Unicode website, +/// using the version specified in [`UNICODE_VERSION`]. +fn fetch_unicode_file(file: &str) -> Result<String, Box<dyn std::error::Error>> { + Ok(reqwest::blocking::get(format!( + "https://www.unicode.org/Public/{}.{}.{}/ucd/{file}", + UNICODE_VERSION.0, UNICODE_VERSION.1, UNICODE_VERSION.2 + ))? + .error_for_status()? + .text()?) +} + +#[derive(Debug)] +pub struct DataFiles { + pub unicode_data: String, + pub derived_core_properties: String, + pub prop_list: String, + pub scripts: String, +} + +/// Retrieve all the data files we need. 
+pub fn data_files() -> Result<DataFiles, Box<dyn std::error::Error>> { + Ok(DataFiles { + unicode_data: fetch_unicode_file("UnicodeData.txt")?, + derived_core_properties: fetch_unicode_file("DerivedCoreProperties.txt")?, + prop_list: fetch_unicode_file("PropList.txt")?, + scripts: fetch_unicode_file("Scripts.txt")?, + }) +} + +/// - `arr`: bit array (1 bit per unicode code point) +/// - `data_file`: Unicode data file to look for properties in +/// - `props`: regex matching one or more Unicode properties +/// - `set_to`: what we should set the bits corresponding to matching code points to +pub fn set_by_prop(arr: &mut CodepointBitArr, data_file: &str, props: &str, set_to: bool) { + let regex_string = format!(r"^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(?:{props})"); + let regex = Regex::new(&regex_string).unwrap(); + for line in data_file.lines() { + if let Some(captures) = regex.captures(line) { + let start = usize::from_str_radix(&captures[1], 16).unwrap(); + let codepoint_range = start + ..=captures + .get(2) + .map_or(start, |m| usize::from_str_radix(m.as_str(), 16).unwrap()); + for cp in codepoint_range { + arr.set(cp, set_to); + } + } + } +} + +/// - `arr`: bit array (1 bit per unicode code point) +/// - `categories`: regex matching one or more Unicode character categories +/// - `set_to`: what we should set the bits corresponding to matching code points to +pub fn set_by_general_category( + arr: &mut CodepointBitArr, + data: &DataFiles, + categories: &str, + set_to: bool, +) { + let regex_string = format!(r"^([0-9A-F]+);(.*?);({categories});"); + let regex = Regex::new(&regex_string).unwrap(); + let mut range_start: Option<(usize, String, String)> = None; + for line in data.unicode_data.lines() { + if let Some(captures) = regex.captures(line) { + let cp = usize::from_str_radix(&captures[1], 16).unwrap(); + + if let Some((range_start_cp, prefix, category)) = range_start { + assert_eq!(captures[2].strip_suffix(", Last>"), Some(prefix).as_deref()); + assert_eq!(category, &captures[3]); + range_start = None; + 
for cp_within_range in range_start_cp..=cp { + arr.set(cp_within_range, set_to); + } + } else if let Some(prefix) = captures[2].strip_suffix(", First>") { + assert!(range_start.is_none()); + range_start = Some((cp, prefix.to_owned(), captures[3].to_owned())); + } else { + assert!(range_start.is_none()); + arr.set(cp, set_to); + } + } else { + assert!(range_start.is_none()); + } + } + assert!(range_start.is_none()); +} From 4153d581f12736bfb9630cd0da87a6936fddf339 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Thu, 14 Mar 2024 12:32:27 -0400 Subject: [PATCH 2/5] Use UTS 55 rules for determining word boundaries Requires 5945 additional bytes of static data. Some existing tests had to be modified, as the old algorithm sometimes inserted word boundaries after digits in cases where the new one does not. --- src/lib.rs | 176 +++++++------- src/snake.rs | 6 +- src/tables.rs | 432 +++++++++++++++++++++++++++++++++ src/train.rs | 21 +- tables/src/letter_casing.rs | 192 +++++++++++++++ tables/src/main.rs | 6 + tables/src/nonspacing_marks.rs | 168 +++++++++++++ 7 files changed, 909 insertions(+), 92 deletions(-) create mode 100644 tables/src/letter_casing.rs create mode 100644 tables/src/nonspacing_marks.rs diff --git a/src/lib.rs b/src/lib.rs index 4aea701..ae2a3d3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,25 +6,9 @@ //! //! ## Definition of a word boundary //! -//! Word boundaries are defined by non-alphanumeric characters, as well as -//! within those words in this manner: -//! -//! 1. If an uppercase character is followed by lowercase letters, a word -//! boundary is considered to be just prior to that uppercase character. -//! 2. If multiple uppercase characters are consecutive, they are considered to -//! be within a single word, except that the last will be part of the next word -//! if it is followed by lowercase characters (see rule 1). -//! -//! That is, "HelloWorld" is segmented `Hello|World` whereas "XMLHttpRequest" is -//! segmented `XML|Http|Request`. 
-//! -//! Characters not within words (such as spaces, punctuations, and underscores) -//! are not included in the output string except as they are a part of the case -//! being converted to. Multiple adjacent word boundaries (such as a series of -//! underscores) are folded into one. ("hello__world" in snake case is therefore -//! "hello_world", not the exact same string). Leading or trailing word boundary -//! indicators are dropped, except insofar as CamelCase capitalizes the first -//! word. +//! Word boundaries are defined by the specification of +//! [identifier chunks](https://www.unicode.org/reports/tr55/#Identifier-Chunks) +//! in Unicode Technical Standard 55. //! //! ### Cases contained in this library: //! @@ -68,6 +52,8 @@ pub use upper_camel::{ use core::fmt; +use tables::{is_non_greek_titlecase, CasedLetterKind}; + fn transform<F, G>( s: &str, mut with_word: F, @@ -78,82 +64,100 @@ where F: FnMut(&str, &mut fmt::Formatter) -> fmt::Result, G: FnMut(&mut fmt::Formatter) -> fmt::Result, { - /// Tracks the current 'mode' of the transformation algorithm as it scans - /// the input string. - /// - /// The mode is a tri-state which tracks the case of the last cased - /// character of the current word. If there is no cased character - /// (either lowercase or uppercase) since the previous word boundary, - /// than the mode is `Boundary`. If the last cased character is lowercase, - /// then the mode is `Lowercase`. Othertherwise, the mode is - /// `Uppercase`. - #[derive(Clone, Copy, PartialEq)] - enum WordMode { - /// There have been no lowercase or uppercase characters in the current - /// word. - Boundary, - /// The previous cased character in the current word is lowercase. - Lowercase, - /// The previous cased character in the current word is uppercase.
- Uppercase, - } - let mut first_word = true; for word in s.split(|c: char| !tables::allowed_in_word(c)) { - let mut char_indices = word.char_indices().peekable(); - let mut init = 0; - let mut mode = WordMode::Boundary; - - while let Some((i, c)) = char_indices.next() { - if let Some(&(next_i, next)) = char_indices.peek() { - // The mode including the current character, assuming the - // current character does not result in a word boundary. - let next_mode = if c.is_lowercase() { - WordMode::Lowercase - } else if c.is_uppercase() { - WordMode::Uppercase - } else { - mode - }; - - // Word boundary after if current is not uppercase and next - // is uppercase - if next_mode == WordMode::Lowercase && next.is_uppercase() { - if !first_word { - boundary(f)?; + let mut start_of_word_idx = 0; + // Whether the previous character seen, ignoring nonspacing marks, + // was lowercase or non-Greek titlecase. + // Used for determining CamelBoundaries. + let mut prev_was_lowercase_or_non_greek_titlecase = false; + // If the previous character seen, ignoring nonspacing marks, + // was uppercase or titlecase, then this stores that character's index. + // Otherwise, it stores `None`. + // Used for determining HATBoundaries. + let mut index_of_preceding_uppercase_or_titlecase_letter: Option<usize> = None; + + for (i, c) in word.char_indices() { + match tables::letter_casing(c) { + None => { + // Nonspacing marks are ignored for the purpose of determining boundaries.
+ if !tables::is_nonspacing_mark(c) { + prev_was_lowercase_or_non_greek_titlecase = false; + index_of_preceding_uppercase_or_titlecase_letter = None; + } + } + Some(CasedLetterKind::Lowercase) => { + prev_was_lowercase_or_non_greek_titlecase = true; + // There is a HATBoundary before an uppercase or titlecase letter followed by a lowercase letter + if let Some(preceding_idx) = index_of_preceding_uppercase_or_titlecase_letter { + index_of_preceding_uppercase_or_titlecase_letter = None; + if preceding_idx != start_of_word_idx { + if !first_word { + boundary(f)?; + } else { + first_word = false; + } + with_word(&word[start_of_word_idx..preceding_idx], f)?; + start_of_word_idx = preceding_idx; + } } - with_word(&word[init..next_i], f)?; - first_word = false; - init = next_i; - mode = WordMode::Boundary; - - // Otherwise if current and previous are uppercase and next - // is lowercase, word boundary before - } else if mode == WordMode::Uppercase && c.is_uppercase() && next.is_lowercase() { - if !first_word { - boundary(f)?; + } + Some(CasedLetterKind::Uppercase) => { + index_of_preceding_uppercase_or_titlecase_letter = Some(i); + // There is a CamelBoundary before an uppercase letter + // that is preceded by a lowercase or non-Greek titlecase letter + if prev_was_lowercase_or_non_greek_titlecase { + prev_was_lowercase_or_non_greek_titlecase = false; + if !first_word { + boundary(f)?; + } else { + first_word = false; + } + with_word(&word[start_of_word_idx..i], f)?; + start_of_word_idx = i; + } + } + Some(CasedLetterKind::Titlecase) => { + index_of_preceding_uppercase_or_titlecase_letter = Some(i); + // There is always a HATBoundary before a non-Greek titlecase letter + if is_non_greek_titlecase(c) { + prev_was_lowercase_or_non_greek_titlecase = true; + if i != start_of_word_idx { + if !first_word { + boundary(f)?; + } else { + first_word = false; + } + with_word(&word[start_of_word_idx..i], f)?; + start_of_word_idx = i; + } } else { - first_word = false; + // There is a 
CamelBoundary before a titlecase letter + // that is preceded by a lowercase or non-Greek titlecase letter + if prev_was_lowercase_or_non_greek_titlecase { + prev_was_lowercase_or_non_greek_titlecase = false; + if !first_word { + boundary(f)?; + } else { + first_word = false; + } + with_word(&word[start_of_word_idx..i], f)?; + start_of_word_idx = i; + } } - with_word(&word[init..i], f)?; - init = i; - mode = WordMode::Boundary; - - // Otherwise no word boundary, just update the mode - } else { - mode = next_mode; } + } + } + + if start_of_word_idx != word.len() { + // Collect trailing characters as a word + if !first_word { + boundary(f)?; } else { - // Collect trailing characters as a word - if !first_word { - boundary(f)?; - } else { - first_word = false; - } - with_word(&word[init..], f)?; - break; + first_word = false; } + with_word(&word[start_of_word_idx..], f)?; } } diff --git a/src/snake.rs b/src/snake.rs index c3c8576..d7a2c6c 100644 --- a/src/snake.rs +++ b/src/snake.rs @@ -87,14 +87,14 @@ mod tests { t!(test12: "99BOTTLES" => "99bottles"); t!(test13: "FieldNamE11" => "field_nam_e11"); t!(test14: "abc123def456" => "abc123def456"); - t!(test16: "abc123DEF456" => "abc123_def456"); + t!(test16: "abc123DEF456" => "abc123def456"); t!(test17: "abc123Def456" => "abc123_def456"); - t!(test18: "abc123DEf456" => "abc123_d_ef456"); + t!(test18: "abc123DEf456" => "abc123d_ef456"); t!(test19: "ABC123def456" => "abc123def456"); t!(test20: "ABC123DEF456" => "abc123def456"); t!(test21: "ABC123Def456" => "abc123_def456"); t!(test22: "ABC123DEf456" => "abc123d_ef456"); - t!(test23: "ABC123dEEf456FOO" => "abc123d_e_ef456_foo"); + t!(test23: "ABC123dEEf456FOO" => "abc123d_e_ef456foo"); t!(test24: "abcDEF" => "abc_def"); t!(test25: "ABcDE" => "a_bc_de"); } diff --git a/src/tables.rs b/src/tables.rs index 0daf24d..f595acc 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -360,3 +360,435 @@ fn test_allowed_in_words_casing_closure() { } } } + +#[derive(Clone, Copy, Debug, 
PartialEq, Eq, Hash)] +#[repr(u8)] +pub enum CasedLetterKind { + Lowercase = 1, + Uppercase = 2, + Titlecase = 3, +} + +/// The case of this letter, or `None` if it is not a cased letter. +pub fn letter_casing(c: char) -> Option<CasedLetterKind> { + const BOTTOM_BITS_MASK: u32 = !((!0_u32) << 6); + let cp: u32 = c.into(); + let top_bits = cp >> 6; + if top_bits < 0x7A6 { + let leaf_idx: u8 = LETTER_CASING_ROOT[usize::try_from(top_bits).unwrap()]; + let leaf = LETTER_CASING_LEAVES[usize::from(leaf_idx)]; + match (leaf >> ((cp & BOTTOM_BITS_MASK) * 2)) & 3 { + 0 => None, + 1 => Some(CasedLetterKind::Lowercase), + 2 => Some(CasedLetterKind::Uppercase), + 3 => Some(CasedLetterKind::Titlecase), + _ => unreachable!(), + } + } else { + None + } +} + +/// Whether the character is a non-Greek titlecase letter. +pub fn is_non_greek_titlecase(c: char) -> bool { + matches!(c, '\u{01C5}' | '\u{01C8}' | '\u{01CB}' | '\u{01F2}') +} + +static LETTER_CASING_ROOT: [u8; 1958] = [ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x00, 0x00, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x15, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x15, 0x17, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x18, 0x00, 0x19, 0x1A, 0x1B, 0x00, 0x1C, 0x1C, 0x1D, 0x1C, 0x1E, 0x1F, 0x20, 0x21, + 0x00, 0x00, 0x00, 0x00, 0x22, 0x23, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0E, 0x25, 0x1C, 0x26, 0x27, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x29, 0x00, 0x2A, 0x2B, 0x2C, 0x2D, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2E, 0x2F, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x32, 0x33, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x34, 0x35, 0x36, 0x37, 0x00, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x3A, 0x3B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x15, 0x3C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4E, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x4F, 0x50, +]; + +static LETTER_CASING_LEAVES: [u128; 81] = [ + 0x00000000000000000000000000000000, + 0x0015555555555554002AAAAAAAAAAAA8, + 0x00000400000000000000000000000000, + 0x55551555555555556AAA2AAAAAAAAAAA, + 0x99996666666666666666666666666666, + 0x599A6666666666666666666666659999, + 0x561699A9A659A6669A56A69AA5A9A669, + 0x6666A679666666666599999999E79E00, + 0x69A55566666666666666666666666666, + 0x55555555555555555555555566666A99, + 0x00000000555555555555545555555555, + 0x85406066000000000000000000000000, + 0x5555555555AAAA8AAAAAAAA9A22A2000, + 0xA969865566666666666656A595555555, + 0x55555555AAAAAAAAAAAAAAAAAAAAAAAA, + 0x66666666666666665555555555555555, + 0x66666666666666666666666666600006, + 0x6666666666666666666666665999999A, + 0xAAAAAAA8666666666666666666666666, + 0x555555555555555500002AAAAAAAAAAA, + 0x00000000000000000000000000015555, + 0xAAAAAAAAAAAAAAAA0000000000000000, + 0x54155555555555555555555508008AAA, + 0x05550AAAAAAAAAAAAAAAAAAAAAAAAAAA, + 0xA82AAAAAAAAAAAAAAAAAAAAA00015555, + 
0x00000000005555555555555555555555, + 0x55545555554000000000000000000000, + 0x00000000000000000015555555555555, + 0x66666666666666666666666666666666, + 0x66666666666666666555566666666666, + 0xAAAA5555AAAA55550AAA0555AAAA5555, + 0x05555555AAAA5555888855550AAA0555, + 0x13AA5155FFFF5555FFFF5555FFFF5555, + 0x03AA515002AA555500AA505503AA5150, + 0xA50401AA4AA222000AA8086A5A908020, + 0x00000000000000000000000010055800, + 0x00000000000000000000000000000180, + 0xA0555966A99996A65555555555555555, + 0x00000060198001666666666666666666, + 0x00000000040045555555555555555555, + 0x00000000066666666666666666666666, + 0x00000000000000000066666666666666, + 0x66666665666666600000000000000000, + 0x69995554666666666666666666666666, + 0x666666AA6AA666666666656619806666, + 0x0010180000000000000664460019AA66, + 0x55555555000000000000000000000000, + 0x55555555000155550015555555555555, + 0x55555555555555555555555555555555, + 0x00000000000000000000554000001555, + 0x002AAAAAAAAAAAA80000000000000000, + 0x00000000000000000015555555555554, + 0x555555555555AAAAAAAAAAAAAAAAAAAA, + 0x00000000000000000000000055555555, + 0xAAAAAAAA000000000000000000000000, + 0x0055555555555555555500AAAAAAAAAA, + 0xAA2AAAAA000000000000000000000000, + 0x014555455555554555554A2AAA2AAAAA, + 0x0000002AAAAAAAAAAAAAAAAAAAAAAAAA, + 0x00000015555555555555555555555555, + 0x00000000000000005555555555555555, + 0x5555555555555555AAAAAAAAAAAAAAAA, + 0xAAAAAA5555555555555AAAAAAAAAAAAA, + 0xAAAAAAAAAAAA5555555551555AAAAAAA, + 0x54455AAAA2A82820A25555555555555A, + 0x55555555555AAAAAAAAAAAAA55555455, + 0x2A8A55555555555552AAA2AAA82A8A55, + 0xAAAAAAAAAA55555555555552AAA022AA, + 0x555AAAAAAAAAAAAA5555555555555AAA, + 0x555555555AAAAAAAAAAAAA5555555555, + 0xAA5555555555555AAAAAAAAAAAAA5555, + 0xAAAAAAAA5555555555555AAAAAAAAAAA, + 0xAAAAAAAAAAAA055555555555555AAAAA, + 0x552AAAAAAAAAAAA55515555555555552, + 0x555552AAAAAAAAAAAA55515555555555, + 0x555555552AAAAAAAAAAAA55515555555, + 0x555555555552AAAAAAAAAAAA55515555, + 
0x00000000000000000000000000655515, + 0x00000000001554001555555555455555, + 0x555555555555555AAAAAAAAAAAAAAAAA, + 0x00000000000000000000000000000055, +]; + +/// Whether this character is a nonspacing or enclosing mark. +pub fn is_nonspacing_mark(c: char) -> bool { + const BOTTOM_BITS_MASK: u32 = !((!0_u32) << 7); + let cp: u32 = c.into(); + let top_bits = cp >> 7; + if top_bits < 0x3D3 { + let leaf_idx: u8 = NONSPACING_MARKS_ROOT[usize::try_from(top_bits).unwrap()]; + let leaf = NONSPACING_MARKS_LEAVES[usize::from(leaf_idx)]; + (leaf >> (cp & BOTTOM_BITS_MASK)) & 1 == 1 + } else { + matches!(cp, 0x0E0100..=0x0E01EF) + } +} + +static NONSPACING_MARKS_ROOT: [u8; 979] = [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x02, 0x00, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1B, 0x1C, + 0x1D, 0x1E, 0x1F, 0x00, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x00, 0x26, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x27, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x29, 0x2A, 0x00, 0x00, 0x00, 0x00, + 0x2B, 0x2C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2D, 0x2E, 0x00, 0x00, + 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x00, 0x35, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x38, 0x00, 0x39, 0x3A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3B, 0x3C, 0x00, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 
0x3E, 0x3F, 0x40, + 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x00, 0x48, 0x49, 0x00, 0x4A, 0x4B, 0x4C, 0x4D, 0x00, + 0x4E, 0x00, 0x4F, 0x50, 0x51, 0x52, 0x00, 0x00, 0x53, 0x54, 0x55, 0x56, 0x00, 0x57, 0x58, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x5A, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5C, 0x5D, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5F, 0x00, + 0x00, 0x00, 0x60, 0x61, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x63, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x65, 0x66, 0x5B, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x69, 0x6A, +]; + +static NONSPACING_MARKS_LEAVES: [u128; 107] = [ + 0x00000000000000000000000000000000, + 0x0000FFFFFFFFFFFFFFFFFFFFFFFFFFFF, + 0x000000000000000000000000000003F8, + 0x00000000000000B6BFFFFFFFFFFE0000, + 0x00010000FFFFF8000000000007FF0000, + 0x00003D9F9FC000000000000000000000, + 0x00000000000007FFFFFF000000020000, + 0x200FF800000000000001FFC000000000, + 0x000000000E00000000003EEFFBC00000, + 0xFFFFFFFBFFFFFC0000000000FF000000, + 0x0000000C00FE21FE1400000000000007, + 0x4000000C0000201E1000000000000002, + 0x00230000000239861000000000000006, + 0xFC00000C000021BE1000000000000006, + 0x0000000C0060201E9000000000000002, + 0x00000000000020010000000000000004, + 0x0000000C00603DC1D000000000000011, + 0x0000000C000030409000000000000002, + 0x0000000C0000201E1800000000000003, + 0x00000000005C04000000000000000002, + 0x0000000000007F8007F2000000000000, + 0x0000000000007F001FF2000000000000, + 0x7FFE00000000000002A0000003000000, + 0x00000000000000401FFFFFFFFEFFE0DF, + 0x001E0001C300000066FDE00000000000, + 0x00000000000000000000000020002064, + 0x00000000E00000000000000000000000, + 0x000C0000000C0000000C0000001C0000, + 0x00000000200FFE403FB0000000000000, + 0x0000000000000000000000000000B800, + 0x00000000000000000000020000000060, + 0x00000000000000000E04018700000000, + 0x9FF81FE57F4000000000000009800000, + 
0x0000000000007FFFFFFF000000000000, + 0x000FF8000000000417D000000000000F, + 0x0003A3400000000000003B3C00000003, + 0x000000000000000000CFF00000000000, + 0x031021FDFFF700000000000000000000, + 0xFFFFFFFFFFFFFFFF0000000000000000, + 0x0001FFFFFFFF00000000000000000000, + 0x00038000000000000000000000000000, + 0x80000000000000000000000000000000, + 0xFFFFFFFF000000000000000000000000, + 0x000000000000000000003C0000000000, + 0x00000000000000000000000006000000, + 0x3FF78000000000000000000000000000, + 0x000300000000000000000000C0000000, + 0x00000000000000000000106000000844, + 0x8003FFFF000000300000000000000000, + 0x000000000003FF8000003FC000000000, + 0x000000200000000033C8000000000007, + 0x100000000000100800667E0000000000, + 0x0040300000000002C19D000000000000, + 0x00002120000000000000000000000000, + 0x00000000000000000000000040000000, + 0x00000000000000000000FFFF0000FFFF, + 0x20000000000000000000000000000000, + 0x00000001000000000000000000000000, + 0x07C00000000000000000000000000000, + 0x0000000000000000870000000000F06E, + 0x00000060000000000000000000000000, + 0x0000000000000000000000F000000000, + 0xE0000000000000000000180000000000, + 0x000000000001FFC00000000000000000, + 0x0000000000000000000000000000003C, + 0x801900000000007FFF00000000000002, + 0x00000000000000040678000000000003, + 0x0008000000000000001FEF8000000007, + 0x0000000000009E007FC0000000000003, + 0x000000000000000240D3800000000000, + 0x000007F8800000000000000000000000, + 0x001F1FC0000000011800000000000003, + 0x000000004000005CFF00000000000000, + 0x000000000000000D85F8000000000000, + 0x0000000030000001B03C000000000000, + 0x0000000000000001A7F8000000000000, + 0x000000000000000000BF280000000000, + 0x000000000000000000000FBCE0000000, + 0x000000000000000006FF800000000000, + 0x00000000000000085800000000000000, + 0x000000010CF000000000000000000000, + 0x000000000E7E008079F80000000007FE, + 0x000000000000000000000000037FFC00, + 0x0000000000000000BF7F000000000000, + 0x0000000000000000006DFCFFFFFC0000, + 
0x00000000000000BFB47E000000000000, + 0x00000000000000000000000000A30000, + 0x00180000000000000000000000000000, + 0x000000000000000507C0000000000003, + 0x00000000003FFF810000000000000000, + 0x001F0000000000000000000000000000, + 0x0000000000000000007F000000000000, + 0x00000000000080000000000000000000, + 0x00000010000000000000000000078000, + 0x00000000000000000000000060000000, + 0x000000000000007FFFFF3FFFFFFFFFFF, + 0xF8000380000000000000000000000000, + 0x000000000000000000003C0000000FE7, + 0x000000000000001C0000000000000000, + 0x00201FFFFFFFFFFFF87FFFFFFFFFFFFF, + 0x00000000000000000000FFFEF8000010, + 0x0000000000000000000007DBF9FFFF7F, + 0x00000000000000000000000000008000, + 0x0000F000000000000000400000000000, + 0x0000F000000000000000000000000000, + 0x00000000007F00000000000000000000, + 0x00000000000007F00000000000000000, +]; diff --git a/src/train.rs b/src/train.rs index 4fcc195..78d3f0e 100644 --- a/src/train.rs +++ b/src/train.rs @@ -74,14 +74,29 @@ mod tests { t!(test12: "99BOTTLES" => "99bottles"); t!(test13: "FieldNamE11" => "Field-Nam-E11"); t!(test14: "abc123def456" => "Abc123def456"); - t!(test16: "abc123DEF456" => "Abc123-Def456"); + t!(test16: "abc123DEF456" => "Abc123def456"); t!(test17: "abc123Def456" => "Abc123-Def456"); - t!(test18: "abc123DEf456" => "Abc123-D-Ef456"); + t!(test18: "abc123DEf456" => "Abc123d-Ef456"); t!(test19: "ABC123def456" => "Abc123def456"); t!(test20: "ABC123DEF456" => "Abc123def456"); t!(test21: "ABC123Def456" => "Abc123-Def456"); t!(test22: "ABC123DEf456" => "Abc123d-Ef456"); - t!(test23: "ABC123dEEf456FOO" => "Abc123d-E-Ef456-Foo"); + t!(test23: "ABC123dEEf456FOO" => "Abc123d-E-Ef456foo"); t!(test24: "abcDEF" => "Abc-Def"); t!(test25: "ABcDE" => "A-Bc-De"); + + t!(uts55_test1: "TypeII" => "Type-Ii"); + t!(uts55_test2: "OCaml" => "O-Caml"); + t!(uts55_test3: "HTTPЗапрос" => "Http-Запрос"); + t!(uts55_test4: "UAX9ClauseHL4" => "Uax9-Clause-Hl4"); + t!(uts55_test5: "LOUD_SNAKE" => "Loud-Snake"); + + t!(uts55_test6: "Fancy_Snake" 
=> "Fancy-Snake"); + t!(uts55_test7: "snake-kebab" => "Snake-Kebab"); + t!(uts55_test8: "Paral·lel" => "Paral·lel"); + t!(uts55_test9: "microB" => "Micro-B"); + t!(uts55_test10: "microᖯ" => "Microᖯ"); + t!(uts55_test11: "HTTPसर्वर" => "Httpसर्वर"); + t!(uts55_test12: "dromedaryCamel" => "Dromedary-Camel"); + t!(uts55_test13: "snakeELEPHANTSnake" => "Snake-Elephant-Snake"); } diff --git a/tables/src/letter_casing.rs b/tables/src/letter_casing.rs new file mode 100644 index 0000000..5917787 --- /dev/null +++ b/tables/src/letter_casing.rs @@ -0,0 +1,192 @@ +use std::{any::type_name, collections::hash_map, error::Error, io, mem::size_of}; + +use bitvec::prelude::*; +use rustc_hash::FxHashMap; + +use crate::unicode_data::{set_by_general_category, set_by_prop, CodepointBitArr, DataFiles}; + +/// Change this to u64 for smaller leaves +type LeafElement = u128; + +const ENTRIES_PER_LEAF: usize = LeafElement::BITS as usize / 2; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[repr(u8)] +pub enum CasedLetterKind { + Lowercase = 1, + Uppercase = 2, + Titlecase = 3, +} + +pub fn letter_casing( + data: &DataFiles, + word_components: &BitSlice, +) -> (Vec>, Vec) { + let mut lowercase = CodepointBitArr::ZERO; + set_by_general_category(&mut lowercase, data, "Ll", true); + lowercase &= word_components; + let mut uppercase = CodepointBitArr::ZERO; + set_by_general_category(&mut uppercase, data, "Lu", true); + uppercase &= word_components; + let mut titlecase = CodepointBitArr::ZERO; + set_by_general_category(&mut titlecase, data, "Lt", true); + titlecase &= word_components; + + let last = [ + lowercase.last_one(), + uppercase.last_one(), + titlecase.last_one(), + ] + .into_iter() + .max() + .flatten() + .unwrap(); + + let mut casing_vec = vec![None; last + 1]; + for cp in lowercase.iter_ones() { + casing_vec[cp] = Some(CasedLetterKind::Lowercase); + } + for cp in uppercase.iter_ones() { + casing_vec[cp] = Some(CasedLetterKind::Uppercase); + } + for cp in titlecase.iter_ones() 
{ + casing_vec[cp] = Some(CasedLetterKind::Titlecase); + } + + set_by_prop(&mut titlecase, &data.scripts, "Greek", false); + + ( + casing_vec, + titlecase + .iter_ones() + .map(|cp| u32::try_from(cp).unwrap()) + .collect(), + ) +} + +fn build_casing_tree(casings_list: &[Option]) -> (Vec, Vec) { + let mut chunk_to_leaf_idx_map: FxHashMap = FxHashMap::from_iter([(!0, 0)]); + let mut root = Vec::with_capacity(casings_list.len().div_ceil(ENTRIES_PER_LEAF)); + let mut leaves = Vec::new(); + let chunks_iter = casings_list.chunks_exact(ENTRIES_PER_LEAF); + assert!(chunks_iter.remainder().is_empty()); + let mut chunks_iter = chunks_iter.map(|c| { + let mut chunk_uint: LeafElement = 0; + for (index, elem) in c.iter().copied().enumerate() { + let bits = elem.map_or(0, |k| k as u8); + chunk_uint |= LeafElement::from(bits) << (index * 2); + } + chunk_uint + }); + for chunk in &mut chunks_iter { + match chunk_to_leaf_idx_map.entry(chunk) { + hash_map::Entry::Occupied(o) => { + root.push(*o.get()); + } + hash_map::Entry::Vacant(v) => { + let new_index = u8::try_from(leaves.len()).expect("too many leaves"); + v.insert(new_index); + root.push(new_index); + leaves.push(chunk); + } + } + } + (root, leaves) +} + +pub fn write_table( + out: &mut impl io::Write, + data: &DataFiles, + allowed_in_word: &CodepointBitArr, +) -> Result<(), Box> { + let (mut casing_vec, non_greek) = letter_casing(data, allowed_in_word); + + let bits_to_shift = ENTRIES_PER_LEAF.ilog2(); + + let first_cp_not_in_tree = casing_vec.len().next_multiple_of(ENTRIES_PER_LEAF); + for _ in casing_vec.len()..first_cp_not_in_tree { + casing_vec.push(None); + } + + let first_cp_not_in_tree_shifted = first_cp_not_in_tree >> bits_to_shift; + + let mut non_greek = non_greek.into_iter(); + + write!( + out, + " +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[repr(u8)] +pub enum CasedLetterKind {{ + Lowercase = 1, + Uppercase = 2, + Titlecase = 3, +}} + +/// The case of this letter, or `None` if it is not a cased letter. 
+pub fn letter_casing(c: char) -> Option {{ + const BOTTOM_BITS_MASK: u32 = !((!0_u32) << {bits_to_shift}); + let cp: u32 = c.into(); + let top_bits = cp >> {bits_to_shift}; + if top_bits < 0x{first_cp_not_in_tree_shifted:X} {{ + let leaf_idx: u8 = LETTER_CASING_ROOT[usize::try_from(top_bits).unwrap()]; + let leaf = LETTER_CASING_LEAVES[usize::from(leaf_idx)]; + match (leaf >> ((cp & BOTTOM_BITS_MASK) * 2)) & 3 {{ + 0 => None, + 1 => Some(CasedLetterKind::Lowercase), + 2 => Some(CasedLetterKind::Uppercase), + 3 => Some(CasedLetterKind::Titlecase), + _ => unreachable!(), + }} + }} else {{ + None + }} +}} + +/// Whether the character is a non-Greek titlecase letter. +pub fn is_non_greek_titlecase(c: char) -> bool {{ + matches!(c, '\\u{{{:04X}}}'", + non_greek.next().unwrap() + )?; + + for cp in non_greek { + write!(out, " | '\\u{{{cp:04X}}}'")?; + } + + writeln!( + out, + ") +}} +" + )?; + + let (root, leaves) = build_casing_tree(&casing_vec); + eprintln!( + "letter_casing: {} bytes of static data", + root.len() + leaves.len() * size_of::() + ); + + write!(out, "static LETTER_CASING_ROOT: [u8; {}] = [", root.len())?; + + for line in root.chunks(16) { + write!(out, "\n ")?; + for byte in line { + write!(out, " 0x{byte:02X},")?; + } + } + + writeln!( + out, + "\n]; + +static LETTER_CASING_LEAVES: [{}; {}] = [", + type_name::(), + leaves.len() + )?; + + for leaf in leaves { + writeln!(out, " 0x{leaf:032X},")?; + } + writeln!(out, "];")?; + Ok(()) +} diff --git a/tables/src/main.rs b/tables/src/main.rs index 2932812..54faa48 100644 --- a/tables/src/main.rs +++ b/tables/src/main.rs @@ -7,6 +7,8 @@ use unicode_data::data_files; const UNICODE_VERSION: (u8, u8, u8) = (15, 1, 0); mod allowed_in_word; +mod letter_casing; +mod nonspacing_marks; fn main() -> Result<(), Box> { let data = data_files()?; @@ -33,5 +35,9 @@ pub const UNICODE_VERSION: (u8, u8, u8) = {UNICODE_VERSION:?}; let allowed_in_word = allowed_in_word::allowed_in_word(&data); allowed_in_word::write_table(&mut 
out, &allowed_in_word)?; + letter_casing::write_table(&mut out, &data, &allowed_in_word)?; + + nonspacing_marks::write_table(&mut out, &data)?; + Ok(()) } diff --git a/tables/src/nonspacing_marks.rs b/tables/src/nonspacing_marks.rs new file mode 100644 index 0000000..82f8708 --- /dev/null +++ b/tables/src/nonspacing_marks.rs @@ -0,0 +1,168 @@ +use std::{ + any::type_name, + collections::hash_map, + error::Error, + io::{self, Read}, + mem::size_of, +}; + +use bitvec::prelude::*; +use rustc_hash::FxHashMap; + +use crate::unicode_data::{set_by_general_category, CodepointBitArr, DataFiles}; + +type LeafElement = u128; +const ENTRIES_PER_LEAF: usize = LeafElement::BITS as usize; + +fn nonspacing_marks(data: &DataFiles) -> CodepointBitArr { + let mut arr = BitArray::ZERO; + set_by_general_category(&mut arr, data, "Mn|Me", true); + + arr +} + +fn build_tree(nonspacing_marks: &BitSlice) -> (Vec, Vec) { + let mut chunk_to_leaf_idx_map: FxHashMap = FxHashMap::from_iter([(0, 0)]); + let mut root = Vec::with_capacity(nonspacing_marks.len().div_ceil(ENTRIES_PER_LEAF)); + let mut leaves = vec![0]; + let chunks_iter = nonspacing_marks.chunks_exact(ENTRIES_PER_LEAF); + assert!(chunks_iter.remainder().is_empty()); + let mut chunks_iter = chunks_iter.map(|l| { + LeafElement::from_le_bytes( + l.bytes() + .collect::, _>>() + .unwrap() + .try_into() + .unwrap(), + ) + }); + for chunk in &mut chunks_iter { + match chunk_to_leaf_idx_map.entry(chunk) { + hash_map::Entry::Occupied(o) => { + root.push(*o.get()); + } + hash_map::Entry::Vacant(v) => { + let new_index = u8::try_from(leaves.len()).expect("too many leaves"); + v.insert(new_index); + root.push(new_index); + leaves.push(chunk); + } + } + } + (root, leaves) +} + +fn list_of_ranges(nonspacing_marks: &BitSlice, add: usize) -> Vec<(u32, u32)> { + let mut vec = Vec::new(); + for i in nonspacing_marks.iter_ones() { + let cp = u32::try_from(i + add).unwrap(); + if let Some((_, prev)) = vec.last_mut() { + if *prev + 1 == cp { + *prev = 
cp; + continue; + } + } + vec.push((cp, cp)) + } + vec +} + +pub fn write_table(out: &mut impl io::Write, data: &DataFiles) -> Result<(), Box> { + let marks = nonspacing_marks(data); + + let bits_to_shift = ENTRIES_PER_LEAF.ilog2(); + + let first_cp_not_in_tree = + (marks[..0x40000].last_one().unwrap() + 1).next_multiple_of(ENTRIES_PER_LEAF); + + /*for i in 3..10 { + let entries_per_leaf: usize = 1 << i; + let bytes_per_leaf = entries_per_leaf / 8; + let first_cp_not_in_tree = + (marks[..0x40000].last_one().unwrap() + 1).next_multiple_of(entries_per_leaf); + let leaves = marks[..first_cp_not_in_tree] + .chunks_exact(entries_per_leaf) + .collect::>(); + dbg!(( + bytes_per_leaf, + leaves.len(), + leaves.len() * bytes_per_leaf + first_cp_not_in_tree / entries_per_leaf + )); + }*/ + + let first_cp_not_in_tree_shifted = first_cp_not_in_tree >> bits_to_shift; + + writeln!( + out, + " +/// Whether this character is a nonspacing or enclosing mark. +pub fn is_nonspacing_mark(c: char) -> bool {{ + const BOTTOM_BITS_MASK: u32 = !((!0_u32) << {bits_to_shift}); + let cp: u32 = c.into(); + let top_bits = cp >> {bits_to_shift}; + if top_bits < 0x{first_cp_not_in_tree_shifted:X} {{ + let leaf_idx: u8 = NONSPACING_MARKS_ROOT[usize::try_from(top_bits).unwrap()]; + let leaf = NONSPACING_MARKS_LEAVES[usize::from(leaf_idx)]; + (leaf >> (cp & BOTTOM_BITS_MASK)) & 1 == 1 + }} else {{" + )?; + + let mut late_marks = + list_of_ranges(&marks[first_cp_not_in_tree..], first_cp_not_in_tree).into_iter(); + + if let Some(first_late_mark) = late_marks.next() { + write!( + out, + " matches!(cp, 0x{:06X}..=0x{:06X}", + first_late_mark.0, first_late_mark.1 + )?; + for late_mark in late_marks { + write!(out, " | 0x{:06X}..=0x{:06X}", late_mark.0, late_mark.1)?; + } + writeln!(out, ")")?; + } else { + writeln!(out, "false")?; + } + + writeln!( + out, + " }} +}} +", + )?; + + let (root, leaves) = build_tree(&marks[..first_cp_not_in_tree]); + eprintln!( + "nonspacing_marks: {} bytes of static data", + 
root.len() + leaves.len() * size_of::() + ); + + write!( + out, + "static NONSPACING_MARKS_ROOT: [u8; {}] = [", + root.len() + )?; + + for line in root.chunks(16) { + write!(out, "\n ")?; + for byte in line { + write!(out, " 0x{byte:02X},")?; + } + } + + writeln!( + out, + "\n]; + +static NONSPACING_MARKS_LEAVES: [{}; {}] = [", + type_name::(), + leaves.len() + )?; + + for leaf in leaves { + writeln!(out, " 0x{leaf:032X},")?; + } + writeln!(out, "];")?; + + Ok(()) +} From 8495a6bd203aee44c36d16f1b0dcb0bf0bd7ea14 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Thu, 14 Mar 2024 20:41:39 -0400 Subject: [PATCH 3/5] Use correct titlecase mappings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some Unicode characters consist of a pair (or even triple) of letters; when title-casing, only the first member of the pair should be capitalized. For example, U+01C6 (dž) uppercases to U+01C4 (DŽ) but titlecases to U+01C5 (Dž). This adds 2160 bytes of static data. 
--- src/lib.rs | 24 +++- src/lower_camel.rs | 4 +- src/tables.rs | 226 +++++++++++++++++++++++++++++++++++++ src/title.rs | 4 +- src/train.rs | 6 +- src/upper_camel.rs | 4 +- tables/src/main.rs | 20 +++- tables/src/titlecase.rs | 186 ++++++++++++++++++++++++++++++ tables/src/unicode_data.rs | 2 + 9 files changed, 455 insertions(+), 21 deletions(-) create mode 100644 tables/src/titlecase.rs diff --git a/src/lib.rs b/src/lib.rs index ae2a3d3..9aec025 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -185,13 +185,25 @@ fn uppercase(s: &str, f: &mut fmt::Formatter) -> fmt::Result { Ok(()) } -fn capitalize(s: &str, f: &mut fmt::Formatter) -> fmt::Result { - let mut char_indices = s.char_indices(); - if let Some((_, c)) = char_indices.next() { - write!(f, "{}", c.to_uppercase())?; - if let Some((i, _)) = char_indices.next() { - lowercase(&s[i..], f)?; +fn titlecase(s: &str, f: &mut fmt::Formatter) -> fmt::Result { + // Find the first cased character + if let Some(titlecase_idx) = + s.find(|c| tables::letter_casing(c).is_some() || c.is_lowercase() || c.is_uppercase()) + { + // Everything before the first cased character is passed through unchanged. + f.write_str(&s[..titlecase_idx])?; + + let rem = &s[titlecase_idx..]; + let mut char_indices = rem.char_indices(); + if let Some((_, c)) = char_indices.next() { + write!(f, "{}", tables::to_titlecase(c))?; + if let Some((i, _)) = char_indices.next() { + lowercase(&rem[i..], f)?; + } } + } else { + // If there are no cased characters, pass through the string unchanged + write!(f, "{}", s)?; } Ok(()) diff --git a/src/lower_camel.rs b/src/lower_camel.rs index a31fc33..50d6dac 100644 --- a/src/lower_camel.rs +++ b/src/lower_camel.rs @@ -5,7 +5,7 @@ use alloc::{ string::{String, ToString}, }; -use crate::{capitalize, lowercase, transform}; +use crate::{lowercase, titlecase, transform}; /// This trait defines a lower camel case conversion. 
/// @@ -53,7 +53,7 @@ impl> fmt::Display for AsLowerCamelCase { first = false; lowercase(s, f) } else { - capitalize(s, f) + titlecase(s, f) } }, |_| Ok(()), diff --git a/src/tables.rs b/src/tables.rs index f595acc..2447b9a 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -792,3 +792,229 @@ static NONSPACING_MARKS_LEAVES: [u128; 107] = [ 0x00000000007F00000000000000000000, 0x00000000000007F00000000000000000, ]; + +use core::{ + fmt::{self, Write}, + iter, +}; + +#[derive(Clone, Debug)] +pub enum ToTitlecase { + Zero, + One(char), + Two(char, char), + Three(char, char, char), +} + +impl Iterator for ToTitlecase { + type Item = char; + + fn next(&mut self) -> Option { + match *self { + Self::Zero => None, + Self::One(c) => { + *self = Self::Zero; + Some(c) + } + Self::Two(b, c) => { + *self = Self::One(c); + Some(b) + } + Self::Three(a, b, c) => { + *self = Self::Two(b, c); + Some(a) + } + } + } + + fn size_hint(&self) -> (usize, Option) { + let size = match self { + Self::Zero => 0, + Self::One(_) => 1, + Self::Two(..) => 2, + Self::Three(..) => 3, + }; + (size, Some(size)) + } +} + +impl iter::ExactSizeIterator for ToTitlecase {} + +impl iter::FusedIterator for ToTitlecase {} + +impl fmt::Display for ToTitlecase { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for c in self.clone() { + f.write_char(c)?; + } + Ok(()) + } +} + +/// Returns an iterator that yields the titlecase mapping of this `char` as one or more `char`s. +pub fn to_titlecase(c: char) -> ToTitlecase { + // ASCII fast path + if c.is_ascii() { + ToTitlecase::One(c.to_ascii_uppercase()) + } else if let Ok(idx) = TITLECASE_MAPPINGS.binary_search_by_key(&c, |&(c2, _)| c2) { + match TITLECASE_MAPPINGS[idx].1 { + [None, ..] => ToTitlecase::Zero, + [Some(a), None, ..] 
=> ToTitlecase::One(a), + [Some(a), Some(b), None] => ToTitlecase::Two(a, b), + [Some(a), Some(b), Some(c)] => ToTitlecase::Three(a, b, c), + } + } else { + let mut uppercase = c.to_uppercase(); + match uppercase.size_hint().0 { + 0 => ToTitlecase::Zero, + 1 => ToTitlecase::One(uppercase.next().unwrap()), + 2 => ToTitlecase::Two(uppercase.next().unwrap(), uppercase.next().unwrap()), + 3 => ToTitlecase::Three( + uppercase.next().unwrap(), + uppercase.next().unwrap(), + uppercase.next().unwrap(), + ), + _ => unreachable!(), + } + } +} + +/// Sorted list of characters and their titlecase mappings. +/// Only characters whose titlecase differs from uppercase are included. +static TITLECASE_MAPPINGS: [(char, [Option; 3]); 135] = [ + ('ß', [Some('S'), Some('s'), None]), + ('DŽ', [Some('Dž'), None, None]), + ('Dž', [Some('Dž'), None, None]), + ('dž', [Some('Dž'), None, None]), + ('LJ', [Some('Lj'), None, None]), + ('Lj', [Some('Lj'), None, None]), + ('lj', [Some('Lj'), None, None]), + ('NJ', [Some('Nj'), None, None]), + ('Nj', [Some('Nj'), None, None]), + ('nj', [Some('Nj'), None, None]), + ('DZ', [Some('Dz'), None, None]), + ('Dz', [Some('Dz'), None, None]), + ('dz', [Some('Dz'), None, None]), + ('և', [Some('Ե'), Some('ւ'), None]), + ('ა', [Some('ა'), None, None]), + ('ბ', [Some('ბ'), None, None]), + ('გ', [Some('გ'), None, None]), + ('დ', [Some('დ'), None, None]), + ('ე', [Some('ე'), None, None]), + ('ვ', [Some('ვ'), None, None]), + ('ზ', [Some('ზ'), None, None]), + ('თ', [Some('თ'), None, None]), + ('ი', [Some('ი'), None, None]), + ('კ', [Some('კ'), None, None]), + ('ლ', [Some('ლ'), None, None]), + ('მ', [Some('მ'), None, None]), + ('ნ', [Some('ნ'), None, None]), + ('ო', [Some('ო'), None, None]), + ('პ', [Some('პ'), None, None]), + ('ჟ', [Some('ჟ'), None, None]), + ('რ', [Some('რ'), None, None]), + ('ს', [Some('ს'), None, None]), + ('ტ', [Some('ტ'), None, None]), + ('უ', [Some('უ'), None, None]), + ('ფ', [Some('ფ'), None, None]), + ('ქ', [Some('ქ'), None, None]), + 
('ღ', [Some('ღ'), None, None]), + ('ყ', [Some('ყ'), None, None]), + ('შ', [Some('შ'), None, None]), + ('ჩ', [Some('ჩ'), None, None]), + ('ც', [Some('ც'), None, None]), + ('ძ', [Some('ძ'), None, None]), + ('წ', [Some('წ'), None, None]), + ('ჭ', [Some('ჭ'), None, None]), + ('ხ', [Some('ხ'), None, None]), + ('ჯ', [Some('ჯ'), None, None]), + ('ჰ', [Some('ჰ'), None, None]), + ('ჱ', [Some('ჱ'), None, None]), + ('ჲ', [Some('ჲ'), None, None]), + ('ჳ', [Some('ჳ'), None, None]), + ('ჴ', [Some('ჴ'), None, None]), + ('ჵ', [Some('ჵ'), None, None]), + ('ჶ', [Some('ჶ'), None, None]), + ('ჷ', [Some('ჷ'), None, None]), + ('ჸ', [Some('ჸ'), None, None]), + ('ჹ', [Some('ჹ'), None, None]), + ('ჺ', [Some('ჺ'), None, None]), + ('ჽ', [Some('ჽ'), None, None]), + ('ჾ', [Some('ჾ'), None, None]), + ('ჿ', [Some('ჿ'), None, None]), + ('ᾀ', [Some('ᾈ'), None, None]), + ('ᾁ', [Some('ᾉ'), None, None]), + ('ᾂ', [Some('ᾊ'), None, None]), + ('ᾃ', [Some('ᾋ'), None, None]), + ('ᾄ', [Some('ᾌ'), None, None]), + ('ᾅ', [Some('ᾍ'), None, None]), + ('ᾆ', [Some('ᾎ'), None, None]), + ('ᾇ', [Some('ᾏ'), None, None]), + ('ᾈ', [Some('ᾈ'), None, None]), + ('ᾉ', [Some('ᾉ'), None, None]), + ('ᾊ', [Some('ᾊ'), None, None]), + ('ᾋ', [Some('ᾋ'), None, None]), + ('ᾌ', [Some('ᾌ'), None, None]), + ('ᾍ', [Some('ᾍ'), None, None]), + ('ᾎ', [Some('ᾎ'), None, None]), + ('ᾏ', [Some('ᾏ'), None, None]), + ('ᾐ', [Some('ᾘ'), None, None]), + ('ᾑ', [Some('ᾙ'), None, None]), + ('ᾒ', [Some('ᾚ'), None, None]), + ('ᾓ', [Some('ᾛ'), None, None]), + ('ᾔ', [Some('ᾜ'), None, None]), + ('ᾕ', [Some('ᾝ'), None, None]), + ('ᾖ', [Some('ᾞ'), None, None]), + ('ᾗ', [Some('ᾟ'), None, None]), + ('ᾘ', [Some('ᾘ'), None, None]), + ('ᾙ', [Some('ᾙ'), None, None]), + ('ᾚ', [Some('ᾚ'), None, None]), + ('ᾛ', [Some('ᾛ'), None, None]), + ('ᾜ', [Some('ᾜ'), None, None]), + ('ᾝ', [Some('ᾝ'), None, None]), + ('ᾞ', [Some('ᾞ'), None, None]), + ('ᾟ', [Some('ᾟ'), None, None]), + ('ᾠ', [Some('ᾨ'), None, None]), + ('ᾡ', [Some('ᾩ'), None, None]), + ('ᾢ', [Some('ᾪ'), None, 
None]), + ('ᾣ', [Some('ᾫ'), None, None]), + ('ᾤ', [Some('ᾬ'), None, None]), + ('ᾥ', [Some('ᾭ'), None, None]), + ('ᾦ', [Some('ᾮ'), None, None]), + ('ᾧ', [Some('ᾯ'), None, None]), + ('ᾨ', [Some('ᾨ'), None, None]), + ('ᾩ', [Some('ᾩ'), None, None]), + ('ᾪ', [Some('ᾪ'), None, None]), + ('ᾫ', [Some('ᾫ'), None, None]), + ('ᾬ', [Some('ᾬ'), None, None]), + ('ᾭ', [Some('ᾭ'), None, None]), + ('ᾮ', [Some('ᾮ'), None, None]), + ('ᾯ', [Some('ᾯ'), None, None]), + ('ᾲ', [Some('Ὰ'), Some('ͅ'), None]), + ('ᾳ', [Some('ᾼ'), None, None]), + ('ᾴ', [Some('Ά'), Some('ͅ'), None]), + ('ᾷ', [Some('Α'), Some('͂'), Some('ͅ')]), + ('ᾼ', [Some('ᾼ'), None, None]), + ('ῂ', [Some('Ὴ'), Some('ͅ'), None]), + ('ῃ', [Some('ῌ'), None, None]), + ('ῄ', [Some('Ή'), Some('ͅ'), None]), + ('ῇ', [Some('Η'), Some('͂'), Some('ͅ')]), + ('ῌ', [Some('ῌ'), None, None]), + ('ῲ', [Some('Ὼ'), Some('ͅ'), None]), + ('ῳ', [Some('ῼ'), None, None]), + ('ῴ', [Some('Ώ'), Some('ͅ'), None]), + ('ῷ', [Some('Ω'), Some('͂'), Some('ͅ')]), + ('ῼ', [Some('ῼ'), None, None]), + ('ff', [Some('F'), Some('f'), None]), + ('fi', [Some('F'), Some('i'), None]), + ('fl', [Some('F'), Some('l'), None]), + ('ffi', [Some('F'), Some('f'), Some('i')]), + ('ffl', [Some('F'), Some('f'), Some('l')]), + ('ſt', [Some('S'), Some('t'), None]), + ('st', [Some('S'), Some('t'), None]), + ('ﬓ', [Some('Մ'), Some('ն'), None]), + ('ﬔ', [Some('Մ'), Some('ե'), None]), + ('ﬕ', [Some('Մ'), Some('ի'), None]), + ('ﬖ', [Some('Վ'), Some('ն'), None]), + ('ﬗ', [Some('Մ'), Some('խ'), None]), +]; diff --git a/src/title.rs b/src/title.rs index 2453430..cc22971 100644 --- a/src/title.rs +++ b/src/title.rs @@ -5,7 +5,7 @@ use alloc::{ string::{String, ToString}, }; -use crate::{capitalize, transform}; +use crate::{titlecase, transform}; /// This trait defines a title case conversion. 
/// @@ -45,7 +45,7 @@ pub struct AsTitleCase>(pub T); impl> fmt::Display for AsTitleCase { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - transform(self.0.as_ref(), capitalize, |f| write!(f, " "), f) + transform(self.0.as_ref(), titlecase, |f| write!(f, " "), f) } } diff --git a/src/train.rs b/src/train.rs index 78d3f0e..a825e56 100644 --- a/src/train.rs +++ b/src/train.rs @@ -2,7 +2,7 @@ use core::fmt; use alloc::{borrow::ToOwned, string::ToString}; -use crate::{capitalize, transform}; +use crate::{titlecase, transform}; /// This trait defines a train case conversion. /// @@ -42,7 +42,7 @@ pub struct AsTrainCase>(pub T); impl> fmt::Display for AsTrainCase { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - transform(self.0.as_ref(), capitalize, |f| write!(f, "-"), f) + transform(self.0.as_ref(), titlecase, |f| write!(f, "-"), f) } } @@ -71,7 +71,7 @@ mod tests { t!(test9: "XΣXΣ baffle" => "Xσxς-Baffle"); t!(test10: "XMLHttpRequest" => "Xml-Http-Request"); t!(test11: "FIELD_NAME11" => "Field-Name11"); - t!(test12: "99BOTTLES" => "99bottles"); + t!(test12: "99BOTTLES" => "99Bottles"); t!(test13: "FieldNamE11" => "Field-Nam-E11"); t!(test14: "abc123def456" => "Abc123def456"); t!(test16: "abc123DEF456" => "Abc123def456"); diff --git a/src/upper_camel.rs b/src/upper_camel.rs index c6f29df..f8980f9 100644 --- a/src/upper_camel.rs +++ b/src/upper_camel.rs @@ -5,7 +5,7 @@ use alloc::{ string::{String, ToString}, }; -use crate::{capitalize, transform}; +use crate::{titlecase, transform}; /// This trait defines an upper camel case conversion. 
/// @@ -58,7 +58,7 @@ pub struct AsUpperCamelCase>(pub T); impl> fmt::Display for AsUpperCamelCase { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - transform(self.0.as_ref(), capitalize, |_| Ok(()), f) + transform(self.0.as_ref(), titlecase, |_| Ok(()), f) } } diff --git a/tables/src/main.rs b/tables/src/main.rs index 54faa48..dfcb65f 100644 --- a/tables/src/main.rs +++ b/tables/src/main.rs @@ -1,4 +1,7 @@ -use std::{fs::OpenOptions, io::Write}; +use std::{ + fs::OpenOptions, + io::{BufWriter, Write}, +}; mod unicode_data; use unicode_data::data_files; @@ -9,15 +12,18 @@ const UNICODE_VERSION: (u8, u8, u8) = (15, 1, 0); mod allowed_in_word; mod letter_casing; mod nonspacing_marks; +mod titlecase; fn main() -> Result<(), Box> { let data = data_files()?; - let mut out = OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open("../src/tables.rs")?; + let mut out = BufWriter::new( + OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open("../src/tables.rs")?, + ); writeln!( &mut out, @@ -39,5 +45,7 @@ pub const UNICODE_VERSION: (u8, u8, u8) = {UNICODE_VERSION:?}; nonspacing_marks::write_table(&mut out, &data)?; + titlecase::write_table(&mut out, &data)?; + Ok(()) } diff --git a/tables/src/titlecase.rs b/tables/src/titlecase.rs new file mode 100644 index 0000000..cd7e38d --- /dev/null +++ b/tables/src/titlecase.rs @@ -0,0 +1,186 @@ +use std::{error::Error, io, mem::size_of}; + +use regex::Regex; +use rustc_hash::FxHashMap; + +use crate::unicode_data::DataFiles; + +fn titlecases(data: &DataFiles) -> Vec<(char, Vec)> { + let mut map = FxHashMap::default(); + + // Single character mappings + let regex = Regex::new( + r"^([0-9A-F]+);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);([0-9A-F]*);(?:.*?);([0-9A-F]+)", + ).unwrap(); + for line in data.unicode_data.lines() { + if let Some(captures) = regex.captures(line) { + if let Some(titlecase) = captures.get(3) { + // Only include if 
different from uppercase + if titlecase.as_str() != &captures[2] { + let cp = + char::from_u32(u32::from_str_radix(&captures[1], 16).unwrap()).unwrap(); + let titlecase_cp = + char::from_u32(u32::from_str_radix(titlecase.as_str(), 16).unwrap()) + .unwrap(); + assert!(!map.contains_key(&cp)); + map.insert(cp, vec![titlecase_cp]); + } + } + } + } + + // Multi character mappings + let regex = + Regex::new(r"^([0-9A-F]+);(?:[0-9A-F ]*);([0-9A-F ]*);([0-9A-F ]*);[^0-9A-Fa-f_]*#") + .unwrap(); + for line in data.special_casing.lines() { + if let Some(captures) = regex.captures(line) { + let titlecase_mapping = captures[2].trim(); + let uppercase_mapping = captures[3].trim(); + if titlecase_mapping != uppercase_mapping { + let cp = char::from_u32(u32::from_str_radix(&captures[1], 16).unwrap()).unwrap(); + assert!(!map.contains_key(&cp)); + map.insert( + cp, + titlecase_mapping + .split_whitespace() + .map(|s| char::from_u32(u32::from_str_radix(s, 16).unwrap()).unwrap()) + .collect(), + ); + } + } + } + + let mut vec: Vec<(char, Vec)> = map.into_iter().collect(); + vec.sort_unstable_by_key(|(c, _)| *c); + vec +} + +pub fn write_table(out: &mut impl io::Write, data: &DataFiles) -> Result<(), Box> { + let titlecase_mappings = titlecases(data); + let max_expansion = titlecase_mappings.iter().map(|t| t.1.len()).max().unwrap(); + + eprintln!( + "titlecase: {} bytes of static data", + (max_expansion + 1) * size_of::() * titlecase_mappings.len() + ); + + writeln!( + out, + " +use core::{{ + fmt::{{self, Write}}, + iter, +}}; + +#[derive(Clone, Debug)] +pub enum ToTitlecase {{ + Zero, + One(char), + Two(char, char), + Three(char, char, char), +}} + +impl Iterator for ToTitlecase {{ + type Item = char; + + fn next(&mut self) -> Option {{ + match *self {{ + Self::Zero => None, + Self::One(c) => {{ + *self = Self::Zero; + Some(c) + }} + Self::Two(b, c) => {{ + *self = Self::One(c); + Some(b) + }} + Self::Three(a, b, c) => {{ + *self = Self::Two(b, c); + Some(a) + }} + }} + }} + + fn 
size_hint(&self) -> (usize, Option) {{ + let size = match self {{ + Self::Zero => 0, + Self::One(_) => 1, + Self::Two(..) => 2, + Self::Three(..) => 3, + }}; + (size, Some(size)) + }} +}} + +impl iter::ExactSizeIterator for ToTitlecase {{}} + +impl iter::FusedIterator for ToTitlecase {{}} + +impl fmt::Display for ToTitlecase {{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {{ + for c in self.clone() {{ + f.write_char(c)?; + }} + Ok(()) + }} +}} + +/// Returns an iterator that yields the titlecase mapping of this `char` as one or more `char`s. +pub fn to_titlecase(c: char) -> ToTitlecase {{ + // ASCII fast path + if c.is_ascii() {{ + ToTitlecase::One(c.to_ascii_uppercase()) + }} else if let Ok(idx) = TITLECASE_MAPPINGS.binary_search_by_key(&c, |&(c2, _)| c2) {{ + match TITLECASE_MAPPINGS[idx].1 {{ + [None, ..] => ToTitlecase::Zero, + [Some(a), None, ..] => ToTitlecase::One(a), + [Some(a), Some(b), None] => ToTitlecase::Two(a, b), + [Some(a), Some(b), Some(c)] => ToTitlecase::Three(a, b, c), + }} + }} else {{ + let mut uppercase = c.to_uppercase(); + match uppercase.size_hint().0 {{ + 0 => ToTitlecase::Zero, + 1 => ToTitlecase::One(uppercase.next().unwrap()), + 2 => ToTitlecase::Two(uppercase.next().unwrap(), uppercase.next().unwrap()), + 3 => ToTitlecase::Three( + uppercase.next().unwrap(), + uppercase.next().unwrap(), + uppercase.next().unwrap(), + ), + _ => unreachable!(), + }} + }} +}} + +/// Sorted list of characters and their titlecase mappings. +/// Only characters whose titlecase differs from uppercase are included. 
+static TITLECASE_MAPPINGS: [(char, [Option; {max_expansion}]); {}] = [", + titlecase_mappings.len() + )?; + for (c, mapping) in titlecase_mappings { + write!(out, " ('{c}', [")?; + + let mut mapping = mapping.into_iter(); + + if let Some(fc) = mapping.next() { + write!(out, "Some('{fc}')")?; + } else { + write!(out, "None")?; + } + + for _ in 1..max_expansion { + if let Some(c) = mapping.next() { + write!(out, ", Some('{c}')")?; + } else { + write!(out, ", None")?; + } + } + + writeln!(out, "]),")?; + } + writeln!(out, "];")?; + + Ok(()) +} diff --git a/tables/src/unicode_data.rs b/tables/src/unicode_data.rs index d69f064..b4f9cfb 100644 --- a/tables/src/unicode_data.rs +++ b/tables/src/unicode_data.rs @@ -22,6 +22,7 @@ pub struct DataFiles { pub derived_core_properties: String, pub prop_list: String, pub scripts: String, + pub special_casing: String, } /// Retrieve all the data files we need. @@ -31,6 +32,7 @@ pub fn data_files() -> Result> { derived_core_properties: fetch_unicode_file("DerivedCoreProperties.txt")?, prop_list: fetch_unicode_file("PropList.txt")?, scripts: fetch_unicode_file("Scripts.txt")?, + special_casing: fetch_unicode_file("SpecialCasing.txt")?, }) } From 30ea379f8765e516a69328a6b336e7ef0debd121 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Thu, 14 Mar 2024 17:04:46 -0400 Subject: [PATCH 4/5] Add tests and update documentation --- README.md | 54 ++++++++++++++++++++++------------ src/lib.rs | 39 ++++++++++++++++++++++-- src/train.rs | 16 +++++++++- tables/src/letter_casing.rs | 7 +++++ tables/src/main.rs | 10 +++---- tables/src/nonspacing_marks.rs | 4 +++ tables/src/titlecase.rs | 3 ++ 7 files changed, 106 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 9b9b4e8..6b77bc1 100644 --- a/README.md +++ b/README.md @@ -8,24 +8,42 @@ consistent, and reasonably well performing. 
## Definition of a word boundary -Word boundaries are defined by non-alphanumeric characters, as well as -within those words in this manner: - -1. If an uppercase character is followed by lowercase letters, a word -boundary is considered to be just prior to that uppercase character. -2. If multiple uppercase characters are consecutive, they are considered to -be within a single word, except that the last will be part of the next word -if it is followed by lowercase characters (see rule 1). - -That is, "HelloWorld" is segmented `Hello|World` whereas "XMLHttpRequest" is -segmented `XML|Http|Request`. - -Characters not within words (such as spaces, punctuations, and underscores) -are not included in the output string except as they are a part of the case -being converted to. Multiple adjacent word boundaries (such as a series of -underscores) are folded into one. ("hello__world" in snake case is therefore -"hello_world", not the exact same string). Leading or trailing word boundary -indicators are dropped, except insofar as CamelCase capitalizes the first word. +The definition of a word boundary is based on the +[identifier word boundary](https://www.unicode.org/reports/tr55/#Identifier-Chunks) +in Unicode Technical Standard 55. The rules are as follows: + +- The set of characters that can be in a word is + [`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Unassigned}\p{Private_Use}-[\p{Punctuation}-\p{Other_Punctuation}]]`][1], + plus U+05F3, U+05F4, and U+0F0B. This notably includes + alphabetic and numeric characters, accents and other combining marks, + emoji, a few mathematical symbols, a few non-word-separating punctuation marks, + unassigned characters, private-use characters, and the asterisk `*`. + +- Characters that cannot be in a word separate words. + For example, `foo_bar` is segmented `foo`|`bar` + because words cannot contain `_`. + These characters will be excluded from the output string. + +- Words cannot be empty. 
For example, `_foo__bar_` is segmented `foo`|`bar`, + and in snake_case becomes `foo_bar`. + +- There is a word boundary between a lowercase (or non-Greek titlecase) + and an uppercase (or titlecase) letter. For example, `fooBar` is segmented + `foo`|`Bar` because `oB` is a lowercase letter followed by an uppercase letter. + +- An uppercase letter followed by a lowercase letter + has a word boundary before it. For example, `XMLHttpRequest` is segmented + `XML`|`Http`|`Request`; the `Ht` in `HttpRequest` is an uppercase letter + followed by a lowercase letter, so there is a word boundary before it. + + - There is always a word boundary before a non-Greek titlecase letter + (U+01C5 'Dž', U+01C8 'Lj', U+01CB 'Nj', or U+01F2 'Dz'). + + - For the purpose of the preceding three rules, a letter followed + by some number of nonspacing marks (like accents or other diacritics) + is treated as if it was the letter alone. For example, `áB` is segmented `á`|`B`. + +[1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BUnassigned%7D%5Cp%7BPrivate_Use%7D-%5B%5Cp%7BPunctuation%7D-%5Cp%7BOther_Punctuation%7D%5D%5D&abb=on ## Cases contained in this library: diff --git a/src/lib.rs b/src/lib.rs index 9aec025..6be6089 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,9 +6,42 @@ //! //! ## Definition of a word boundary //! -//! Word boundaries are defined by the specification of -//! [identifier chunks](https://www.unicode.org/reports/tr55/#Identifier-Chunks) -//! in Unicode Technical Standard 55. +//! The definition of a word boundary is based on the +//! [identifier word boundary](https://www.unicode.org/reports/tr55/#Identifier-Chunks) +//! in Unicode Technical Standard 55. The rules are as follows: +//! +//! - The set of characters that can be in a word is +//! [`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Unassigned}\p{Private_Use}-[\p{Punctuation}-\p{Other_Punctuation}]]`][1], +//! 
plus U+05F3, U+05F4, and U+0F0B. This notably includes +//! alphabetic and numeric characters, accents and other combining marks, +//! emoji, a few mathematical symbols, a few non-word-separating punctuation marks, +//! unassigned characters, and private-use characters. +//! +//! - Characters that cannot be in a word separate words. +//! For example, `foo_bar` is segmented `foo`|`bar` +//! because words cannot contain `_`. +//! These characters will be excluded from the output string. +//! +//! - Words cannot be empty. For example, `_foo__bar_` is segmented `foo`|`bar`, +//! and in snake_case becomes `foo_bar`. +//! +//! - There is a word boundary between a lowercase (or non-Greek titlecase) +//! and an uppercase (or titlecase) letter. For example, `fooBar` is segmented +//! `foo`|`Bar` because `oB` is a lowercase letter followed by an uppercase letter. +//! +//! - An uppercase letter followed by a lowercase letter +//! has a word boundary before it. For example, `XMLHttpRequest` is segmented +//! `XML`|`Http`|`Request`; the `Ht` in `HttpRequest` is an uppercase letter +//! followed by a lowercase letter, so there is a word boundary before it. +//! +//! - There is always a word boundary before a non-Greek titlecase letter +//! (U+01C5 'Dž', U+01C8 'Lj', U+01CB 'Nj', or U+01F2 'Dz'). +//! +//! - For the purpose of the preceding three rules, a letter followed +//! by some number of nonspacing marks (like accents or other diacritics) +//! is treated as if it was the letter alone. For example, `áB` is segmented `á`|`B`. +//! +//! [1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BUnassigned%7D%5Cp%7BPrivate_Use%7D-%5B%5Cp%7BPunctuation%7D-%5Cp%7BOther_Punctuation%7D%5D%5D&abb=on //! //! ### Cases contained in this library: //! 
diff --git a/src/train.rs b/src/train.rs index a825e56..cc2f873 100644 --- a/src/train.rs +++ b/src/train.rs @@ -67,7 +67,6 @@ mod tests { t!(test6: "SHOUTY_SNAKE_CASE" => "Shouty-Snake-Case"); t!(test7: "snake_case" => "Snake-Case"); t!(test8: "this-contains_ ALLKinds OfWord_Boundaries" => "This-Contains-All-Kinds-Of-Word-Boundaries"); - #[cfg(feature = "unicode")] t!(test9: "XΣXΣ baffle" => "Xσxς-Baffle"); t!(test10: "XMLHttpRequest" => "Xml-Http-Request"); t!(test11: "FIELD_NAME11" => "Field-Name11"); @@ -84,6 +83,21 @@ mod tests { t!(test23: "ABC123dEEf456FOO" => "Abc123d-E-Ef456foo"); t!(test24: "abcDEF" => "Abc-Def"); t!(test25: "ABcDE" => "A-Bc-De"); + t!(test26: "DŽO" => "Džo"); + t!(test27: "džO" => "Dž-O"); + t!(test28: "džo" => "Džo"); + t!(test29: "∇𝐀" => "∇𝐀"); + t!(test30: "∇𝔞" => "∇𝔞"); + t!(test31: "𝔞" => "𝔞"); + t!(test32: "🐈‍⬛🐈" => "\u{200d}"); + t!(test33: "🐈‍⬛🐈a" => "\u{200d}-A"); + t!(test34: "A🐈‍⬛🐈a" => "A-\u{200D}-A"); + t!(test35: "☕" => ""); + t!(test36: "a*️⃣b" => "A-\u{fe0f}-B"); + t!(test37: "a*b" => "A-B"); + t!(test38: "\u{0301}a" => "\u{0301}A"); + t!(test39: "a\u{0301}B" => "A\u{0301}-B"); + t!(test40: "fflololo" => "Fflololo"); t!(uts55_test1: "TypeII" => "Type-Ii"); t!(uts55_test2: "OCaml" => "O-Caml"); diff --git a/tables/src/letter_casing.rs b/tables/src/letter_casing.rs index 5917787..13a3409 100644 --- a/tables/src/letter_casing.rs +++ b/tables/src/letter_casing.rs @@ -1,3 +1,10 @@ +//! Construct a lookup table for the casing status of a letter +//! (lowercase, uppercase, or titlecase). +//! +//! This table only concerns itself with letters: +//! for obtaining the case of characters which are not letters, +//! use the functions from `core`. 
+ use std::{any::type_name, collections::hash_map, error::Error, io, mem::size_of}; use bitvec::prelude::*; diff --git a/tables/src/main.rs b/tables/src/main.rs index dfcb65f..4dd6fea 100644 --- a/tables/src/main.rs +++ b/tables/src/main.rs @@ -3,17 +3,17 @@ use std::{ io::{BufWriter, Write}, }; +mod allowed_in_word; +mod letter_casing; +mod nonspacing_marks; +mod titlecase; mod unicode_data; + use unicode_data::data_files; /// Update this on new Unicode releases const UNICODE_VERSION: (u8, u8, u8) = (15, 1, 0); -mod allowed_in_word; -mod letter_casing; -mod nonspacing_marks; -mod titlecase; - fn main() -> Result<(), Box> { let data = data_files()?; diff --git a/tables/src/nonspacing_marks.rs b/tables/src/nonspacing_marks.rs index 82f8708..5f97bb2 100644 --- a/tables/src/nonspacing_marks.rs +++ b/tables/src/nonspacing_marks.rs @@ -1,3 +1,7 @@ +//! Construct a lookup table to find whether a particular character is a nonspacing mark +//! (general category `Nonspacing_Mark` or `Enclosing_Mark`). +//! These characters are ignored when determining word boundaries. + use std::{ any::type_name, collections::hash_map, diff --git a/tables/src/titlecase.rs b/tables/src/titlecase.rs index cd7e38d..2e7bbcd 100644 --- a/tables/src/titlecase.rs +++ b/tables/src/titlecase.rs @@ -1,3 +1,6 @@ +//! Construct table for titlecase character mappings. +//! Only characters whose titlecase differs from their uppercase are included. + use std::{error::Error, io, mem::size_of}; use regex::Regex; From ce592410f65b43c186d09e1f057775d3434ae8f8 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Fri, 15 Mar 2024 01:02:56 -0400 Subject: [PATCH 5/5] Allow all alphabetic and numeric characters in words Ensures that the new rules are strictly more permissive than the old ones. 
--- README.md | 6 +- src/lib.rs | 4 +- src/tables.rs | 161 ++++++++++++++++------------------ tables/src/allowed_in_word.rs | 6 +- 4 files changed, 84 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index 6b77bc1..b4659c9 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,11 @@ The definition of a word boundary is based on the in Unicode Technical Standard 55. The rules are as follows: - The set of characters that can be in a word is - [`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Unassigned}\p{Private_Use}-[\p{Punctuation}-\p{Other_Punctuation}]]`][1], + [`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Cn}\p{Co}\p{Alphabetic}\p{N}-[\p{P}-\p{Po}]]`][1], plus U+05F3, U+05F4, and U+0F0B. This notably includes alphabetic and numeric characters, accents and other combining marks, emoji, a few mathematical symbols, a few non-word-separating punctuation marks, - unassigned characters, private-use characters, and the asterisk `*`. + unassigned characters, and private-use characters. - Characters that cannot be in a word separate words. For example, `foo_bar` is segmented `foo`|`bar` @@ -43,7 +43,7 @@ in Unicode Technical Standard 55. The rules are as follows: by some number of nonspacing marks (like accents or other diacritics) is treated as if it was the letter alone. For example, `áB` is segmented `á`|`B`. -[1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BUnassigned%7D%5Cp%7BPrivate_Use%7D-%5B%5Cp%7BPunctuation%7D-%5Cp%7BOther_Punctuation%7D%5D%5D&abb=on +[1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BCn%7D%5Cp%7BCo%7D%5Cp%7BAlphabetic%7D%5Cp%7BN%7D-%5B%5Cp%7BP%7D-%5Cp%7BPo%7D%5D%5D&abb=on&g=&i= ## Cases contained in this library: diff --git a/src/lib.rs b/src/lib.rs index 6be6089..9ba5a9c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,7 +11,7 @@ //! in Unicode Technical Standard 55. 
The rules are as follows: //! //! - The set of characters that can be in a word is -//! [`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Unassigned}\p{Private_Use}-[\p{Punctuation}-\p{Other_Punctuation}]]`][1], +//! [`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Cn}\p{Co}\p{Alphabetic}\p{N}-[\p{P}-\p{Po}]]`][1], //! plus U+05F3, U+05F4, and U+0F0B. This notably includes //! alphabetic and numeric characters, accents and other combining marks, //! emoji, a few mathematical symbols, a few non-word-separating punctuation marks, @@ -41,7 +41,7 @@ //! by some number of nonspacing marks (like accents or other diacritics) //! is treated as if it was the letter alone. For example, `áB` is segmented `á`|`B`. //! -//! [1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BUnassigned%7D%5Cp%7BPrivate_Use%7D-%5B%5Cp%7BPunctuation%7D-%5Cp%7BOther_Punctuation%7D%5D%5D&abb=on +//! [1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BCn%7D%5Cp%7BCo%7D%5Cp%7BAlphabetic%7D%5Cp%7BN%7D-%5B%5Cp%7BP%7D-%5Cp%7BPo%7D%5D%5D&abb=on&g=&i= //! //! ### Cases contained in this library: //! 
diff --git a/src/tables.rs b/src/tables.rs index 2447b9a..2b21d03 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -31,10 +31,10 @@ static ALLOWED_IN_WORD_ROOT: [u8; 2032] = [ 0x2F, 0x00, 0x00, 0x00, 0x00, 0x30, 0x00, 0x31, 0x25, 0x00, 0x32, 0x00, 0x00, 0x33, 0x00, 0x34, 0x35, 0x36, 0x00, 0x37, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, - 0x43, 0x44, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, - 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x45, 0x46, 0x41, - 0x00, 0x00, 0x00, 0x47, 0x00, 0x48, 0x00, 0x00, 0x41, 0x49, 0x4A, 0x4B, 0x41, 0x41, 0x41, 0x4C, - 0x4D, 0x00, 0x4E, 0x00, 0x00, 0x00, 0x4F, 0x50, 0x51, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x43, 0x44, 0x45, 0x00, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x46, 0x47, 0x41, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x48, 0x49, 0x41, + 0x00, 0x00, 0x00, 0x4A, 0x00, 0x19, 0x00, 0x00, 0x4B, 0x4C, 0x4D, 0x4E, 0x41, 0x41, 0x41, 0x4F, + 0x50, 0x00, 0x51, 0x00, 0x00, 0x00, 0x52, 0x53, 0x54, 0x55, 0x56, 0x41, 0x41, 0x41, 0x41, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -63,8 +63,8 @@ static ALLOWED_IN_WORD_ROOT: [u8; 2032] = [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x52, 0x53, 0x00, 0x00, 0x00, 0x00, 0x54, 0x55, 0x00, 0x56, 0x57, 0x00, 0x58, 0x00, - 0x59, 0x5A, 
0x00, 0x5B, 0x5C, 0x5D, 0x00, 0x5E, 0x00, 0x5F, 0x00, 0x60, 0x00, 0x61, 0x00, 0x62, + 0x00, 0x00, 0x57, 0x58, 0x00, 0x00, 0x00, 0x00, 0x59, 0x5A, 0x00, 0x5B, 0x5C, 0x00, 0x5D, 0x00, + 0x5E, 0x5F, 0x00, 0x60, 0x61, 0x62, 0x00, 0x63, 0x00, 0x64, 0x00, 0x65, 0x00, 0x66, 0x00, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -84,22 +84,22 @@ static ALLOWED_IN_WORD_ROOT: [u8; 2032] = [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x00, 0x64, 0x65, - 0x00, 0x00, 0x00, 0x00, 0x36, 0x66, 0x00, 0x67, 0x68, 0x69, 0x00, 0x1F, 0x6A, 0x6B, 0x00, 0x6C, - 0x00, 0x00, 0x00, 0x00, 0x6D, 0x6E, 0x6F, 0x70, 0x00, 0x00, 0x00, 0x71, 0x72, 0x00, 0x5D, 0x73, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x75, 0x76, 0x35, 0x77, 0x00, 0x78, 0x79, 0x00, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x00, - 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x82, 0x00, 0x83, 0x84, 0x85, 0x86, - 0x00, 0x87, 0x35, 0x88, 0x00, 0x89, 0x00, 0x8A, 0x8B, 0x00, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x8C, 0x00, 0x8D, 0x00, 0x00, 0x00, 0x8E, 0x00, 0x8F, 0x90, 0x00, 0x80, 0x00, 0x00, 0x00, - 0x26, 0x00, 0x00, 0x91, 0x00, 0x92, 0x00, 0x15, 0x1F, 0x93, 0x94, 0x00, 0x95, 0x00, 0x00, 0x00, - 0x00, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x00, 0x98, 0x00, 0x99, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x68, 0x00, 0x69, 0x6A, + 
0x00, 0x00, 0x00, 0x00, 0x36, 0x6B, 0x00, 0x6C, 0x6D, 0x6E, 0x00, 0x1F, 0x6F, 0x70, 0x00, 0x71, + 0x00, 0x00, 0x00, 0x00, 0x72, 0x73, 0x74, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x76, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x78, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, 0x00, 0x7A, 0x00, 0x7B, 0x73, 0x00, 0x7C, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7D, 0x00, 0x00, 0x7E, 0x7F, 0x00, + 0x00, 0x80, 0x35, 0x81, 0x00, 0x82, 0x00, 0x83, 0x84, 0x00, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x85, 0x00, 0x86, 0x00, 0x00, 0x00, 0x87, 0x00, 0x88, 0x89, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x15, 0x1F, 0x8B, 0x8C, 0x00, 0x8D, 0x00, 0x00, 0x00, + 0x00, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x00, 0x90, 0x00, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x9A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -112,8 +112,8 @@ static ALLOWED_IN_WORD_ROOT: [u8; 2032] = [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5C, 0x00, 0x9D, 0x1B, 0x9E, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x00, 0x00, 0x00, 0x00, 0x15, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x95, 0x96, 0x07, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -133,29 +133,29 @@ static ALLOWED_IN_WORD_ROOT: [u8; 2032] = [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0xA0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x52, 0x41, 0xA1, - 0x41, 0x41, 0x41, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0x41, 0xA7, 0x00, 0xA8, 0x41, 0xA9, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x57, 0x41, 0x99, + 0x41, 0x41, 0x41, 0x46, 0x9A, 0x9B, 0x9C, 0x9D, 0x41, 0x9E, 0x00, 0x00, 0x41, 0x9F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xAA, 0xAB, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xA0, 0xA1, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0xAE, 0x00, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0xAF, 0xB0, 0x00, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, - 0xB2, 0x41, 0xB3, 0xB4, 0x41, 0x41, 0xB5, 0xB6, 0xB7, 0xB8, 0x00, 0x00, 0x41, 0x41, 0x41, 0x41, - 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xB9, 0x41, 0xBA, 0x41, 0xBB, - 0xBC, 0xBD, 0xBE, 0x00, 0x41, 0x41, 0x41, 0x41, 0x41, 0xBF, 0xC0, 0xC1, 0x41, 0x41, 0xC2, 0xC3, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xA4, 0x00, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, + 0xA6, 0x41, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0x00, 0x00, 0x41, 0x41, 0x41, 0x41, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xAF, 0x41, 0xB0, 0x41, 0xB1, + 0xB2, 0xB3, 0xB4, 0x00, 0x41, 0x41, 0x41, 0x41, 0x41, 0xB5, 0xB6, 0xB7, 0x41, 0x41, 0xB8, 0x44, ]; -static 
ALLOWED_IN_WORD_LEAVES: [u64; 196] = [ +static ALLOWED_IN_WORD_LEAVES: [u64; 185] = [ 0xFFFFFFFFFFFFFFFF, 0x03FF000000000000, 0x07FFFFFE07FFFFFE, - 0x06AC040000000000, + 0x76AC040000000000, 0xFF7FFFFFFF7FFFFF, 0x0000501F0003FFC3, 0xBFDFFFFFFFFFFFFF, @@ -175,30 +175,30 @@ static ALLOWED_IN_WORD_LEAVES: [u64; 196] = [ 0xFFFFFFFFFFFCFEFF, 0xFFFFFFFBFFFFFFFF, 0xFFFEFFCFFFFFFFFF, - 0xD003FFFFFFFFFFFF, + 0xD3F3FFFFFFFFFFFF, 0xFFFCFFFFFFFFFFFF, - 0xFF02FFFFFFFFFFFF, - 0xF800FFFFFFFFFFFF, - 0x007FFFFFFFFFFFFF, + 0xFFFEFFFFFFFFFFFF, + 0xF807FFFFFFFFFFFF, + 0x7F7FFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFEF, - 0xFC00FFFF80FF7FFF, + 0xFDFFFFFFFFFF7FFF, 0xFFEFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFF3FF7FFF, - 0xC2A003FF03000801, + 0xC2AFFFFF03000801, 0x3FFFFFFFFFFFFFDF, 0xFFFFFFFFF8002040, 0xFFFFFFFFFFFF03FF, 0xFFFFFFFF3FFFFFFF, 0xF7FFFFFFFFFFFFFF, - 0xE003FE00FFFFFFFF, + 0xFFFFFE00FFFFFFFF, 0xFFFFFFFFFC00FFFF, 0xFFFFFFFFFFFFFFFE, 0xFFFF9FFFFFFFFFFF, 0xFFFFFFFFE7FFFFFE, 0xFFFFC7FFFFFFFFFF, 0xFF9FFFFFFFFFFFFF, - 0xFC00FFFFF08FFFFF, + 0xFFFFFFFFF08FFFFF, 0xFFFFFFFFFFFFB800, 0xFFFFFFFFFFFFFFCE, 0x000000003FFFFFFF, @@ -215,25 +215,30 @@ static ALLOWED_IN_WORD_LEAVES: [u64; 196] = [ 0x00000000FFFF9FFF, 0xFFFFFFE21FFFFFFE, 0xF3FFFD503F2FFC84, - 0xFFFFFFFF000043E0, - 0x000000000000F1FF, + 0xFFFFFFFFFFFF43E0, + 0x000000000000F3FF, 0x0000000000000000, 0x0000000040000084, 0xFFFFFF8000000000, - 0x00000000FFFFF800, + 0xFFFFFFFFFFFFF800, + 0xFFC000000FFFFFFF, + 0xFFC0000000000000, + 0x00000000000FFFFF, 0x0030000000000000, 0x0000000000400000, - 0x01FFF81FFFFFFFFF, - 0xFFFEFFFFFFFFFFFF, + 0x21FFF81FFFFFFFFF, + 0x0000800000000000, 0xFFFFFFFFC0000000, 0x0000000004000000, 0xFFF0000000000000, 0x0000FFFFFFC00000, 0x1F3EFFFE000000E0, 0xFFFFFFFEFFFFFFFF, - 0xFFFFFFFF0000FFFF, + 0xFFFFFFFF003CFFFF, 0xFFFF7FF000000000, - 0x0000000080000000, + 0x000003FF80000000, + 0x00000000FFFEFF00, + 0xFFFE0000000003FF, 0x000000000000FFFF, 0x3FFFFFFFFFFFFF80, 0xFFFFFFFFFFFF1FFF, @@ -241,7 +246,7 @@ static 
ALLOWED_IN_WORD_LEAVES: [u64; 196] = [ 0xFF03FFFFFFFFFFFF, 0xFFFFFFFCFF800000, 0xFFFFFFFFFFFFF9FF, - 0xFC00F0FFFFFFFFFF, + 0xFC3FF0FFFFFFFFFF, 0xFF0FFFFFFFFFFFFF, 0xE8FFFFFFFFFF3FFF, 0xFFFF3FFFFFFFFFFF, @@ -261,79 +266,64 @@ static ALLOWED_IN_WORD_LEAVES: [u64; 196] = [ 0x07FFFFFE03FF0001, 0xFFFFFFE007FFFFFE, 0xC1FF8080FFFFFFFF, - 0x0070000000000078, - 0x001FFFFFFFFFFFFF, - 0xFFFFFFFEE0008000, + 0x007FFFFFFFFFFFF8, + 0x01FFFFFFFFFFFFFF, + 0xFFFFFFFEE0008C00, 0xE00000000000FFFF, - 0xF0000001FFFFFFFF, - 0xFFFFFFF0FFFFFFFF, 0xFFFFFFFFFFFEFFFF, 0xFFFF7FFFFFFFFFFF, - 0x007FFFFF007FFFFF, - 0xFFFF007FFFFFFFFF, - 0x7FFFFFFF703FFFFF, - 0xCFFFFFFFFFFFFFFF, - 0x0000000000030000, - 0x1FFFFFFFFE00FE00, - 0xFFFFFFFF1FFFFFFF, - 0xFF8007FFFFFFFEFF, - 0x01FFFFFFFFFFFFFF, - 0x00FFFFFF00FFFFFF, - 0xFFFF01FFE1FFFFFF, - 0x03FFFFFFFFFFFFFF, - 0x80000000FFFFFFFF, + 0xFE7FFFFFFF7FFFFF, + 0x7FFFFFFF7FFFFFFF, + 0x7FFFFFFFFE00FFFF, + 0xFF80FFFFFFFFFEFF, + 0xFFFFFFFFE1FFFFFF, 0xFFFFDFFFFFFFFFFF, - 0xFFFFFF801FFFFFFF, - 0xFFFFFFFFFC01FFFF, + 0xFFFFFFFFFC1FFFFF, 0xFFFFFFFFFFFFFC3F, - 0xFFFFFFFFFFFFF01F, - 0xFFFFFFC00003C07F, + 0xFFFFFFFFFFFFC07F, 0xFFFFFFFFFFFFDFFC, 0xFFCFFFFFFFFFFFF0, - 0xFFE0000117FFDE1F, + 0xFFFFFFFF17FFDE1F, 0xC0FFFFFFFFFFFFFF, 0xFFFFFFFFD3FF07FF, 0xFFFFFFFFFFFFFFBF, 0xFFFFFFFFFF000001, 0xFFFFE000FFFFFFF1, 0xFDFFFFFFFFFFFFFF, - 0xFFF803FFFFFFFFFF, 0xFFFFFFFFFFFFFF8F, 0xFFFFFFFFFFFFFF80, 0xFFFFFFF823FFFFFF, 0xFFFFFFFFFFFFFC00, - 0xFFFCE00003FFFFC1, + 0xFFFCFFFFFFFFFFC1, 0xFE7FFFFFFFFFFFFF, 0xFFFFFFFFFFFF0007, - 0x7FFC000000000000, + 0x7FFC0000001FFFFF, 0xFFE0FFFFFFFFFFFF, 0xFFF9FFFFFFFFFFFF, 0x0000FFFFFFFFFFFF, 0xFFDFFFFFFFFFFFFF, - 0xFFFFFFFC07FFFFCF, - 0xFFFFFFFFF8000000, + 0x007FFFFFFFFFFFFF, + 0xFFFFFFFFF87FFFFF, 0xFFFFFFF06FFFFFFF, 0xFFFFFFFFFFFFFFF0, - 0xFFC0000000000000, 0x0000018000000000, 0xF807E3E000000000, 0x00003C0000000FE7, 0xFFFFF80000000000, 0xFFFFFFFFFFFFFFDC, - 0xFFF00000FFF00000, - 0xFE000000FF800000, + 0xFFFFFFFFFF800000, 0xF87FFFFFFFFFFFFF, 
0x00201FFFFFFFFFFF, 0xFFFFFFFFFFFFF010, 0xFFFFFFFFFFFF7FFF, - 0xFFFFFFFFFFFF007F, - 0x0001FFFFFFFFFFFF, - 0xFFE0000000000000, - 0xC000000000000001, + 0xFFFEEFFFFFFFFFFF, + 0xFFFFBFFFFFFFFFFF, 0x0000F00000000000, 0x00018000FFF00000, 0xFFC0000000010001, - 0xFFFFC00000000000, + 0xFFFF000000001FFF, + 0xFFFF03FFFFFF03FF, + 0xFFFFC000000003FF, 0x0000003FFFFFFFFF, 0xF00000000000FFF8, 0xFFFFFFC0FFFCFE00, @@ -347,7 +337,6 @@ static ALLOWED_IN_WORD_LEAVES: [u64; 196] = [ 0x400000000000FE00, 0xFE00FE00F0003FC0, 0x0000000000080000, - 0xFFFFFFFFFFFFF800, ]; #[cfg(test)] diff --git a/tables/src/allowed_in_word.rs b/tables/src/allowed_in_word.rs index 3c80be6..e89c420 100644 --- a/tables/src/allowed_in_word.rs +++ b/tables/src/allowed_in_word.rs @@ -35,7 +35,7 @@ fn unassigned_private_use(data: &DataFiles) -> CodepointBitArr { } /// `true` for all codepoints that can be part of a word: -/// `[\p{Unassigned}\p{Private_Use}\p{ID_Continue}\p{ID_Compat_Math_Continue}-[\p{Punctuation}-\p{Other_Punctuation}]]`, +/// `[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Cn}\p{Co}\p{Alphabetic}\p{N}-[\p{P}-\p{Po}]]`, /// plus the extra characters listed below. pub fn allowed_in_word(data: &DataFiles) -> CodepointBitArr { let mut word_component = unassigned_private_use(data); @@ -43,10 +43,12 @@ pub fn allowed_in_word(data: &DataFiles) -> CodepointBitArr { set_by_prop( &mut word_component, &data.derived_core_properties, - "ID_Continue", + "ID_Continue|Alphabetic", true, ); + set_by_general_category(&mut word_component, data, "Nd|Nl|No", true); + set_by_prop( &mut word_component, &data.prop_list,