withoutboats · Jules-Bertholet · Mar 15, 2024 · Mar 14, 2024 · Mar 15, 2024 · Mar 14, 2024
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -33,13 +33,16 @@ jobs:
       uses: actions-rs/cargo@v1
       with:
         command: test
+        args: --all
     - name: Check formatting
       uses: actions-rs/cargo@v1
       with:
         command: fmt
-        args: -- --check
+        args: --all --check
     - name: Catch common mistakes
       uses: actions-rs/cargo@v1
       with:
         command: clippy
-        args: --all-targets -- -D warnings
+        args: --all-targets --workspace -- -D warnings
+    - name: Regenerate Unicode tables
+      run: mv src/tables.rs tables.rs.bak && cd tables && cargo run && cd .. && diff src/tables.rs tables.rs.bak
diff --git a/Cargo.toml b/Cargo.toml
@@ -9,3 +9,6 @@ repository = "https://github.com/withoutboats/heck"
 keywords = ["string", "case", "camel", "snake", "unicode"]
 categories = ["no-std"]
 include = ["src/**/*", "LICENSE-*", "README.md", "CHANGELOG.md"]
+
+[workspace]
+members = ["tables"]
diff --git a/README.md b/README.md
@@ -8,24 +8,42 @@ consistent, and reasonably well performing.
 
 ## Definition of a word boundary
 
-Word boundaries are defined by non-alphanumeric characters, as well as
-within those words in this manner:
-
-1. If an uppercase character is followed by lowercase letters, a word
-boundary is considered to be just prior to that uppercase character.
-2. If multiple uppercase characters are consecutive, they are considered to
-be within a single word, except that the last will be part of the next word
-if it is followed by lowercase characters (see rule 1).
-
-That is, "HelloWorld" is segmented `Hello|World` whereas "XMLHttpRequest" is
-segmented `XML|Http|Request`.
-
-Characters not within words (such as spaces, punctuations, and underscores)
-are not included in the output string except as they are a part of the case
-being converted to. Multiple adjacent word boundaries (such as a series of
-underscores) are folded into one. ("hello__world" in snake case is therefore
-"hello_world", not the exact same string). Leading or trailing word boundary
-indicators are dropped, except insofar as CamelCase capitalizes the first word.
+The definition of a word boundary is based on the
+[identifier word boundary](https://www.unicode.org/reports/tr55/#Identifier-Chunks)
+in Unicode Technical Standard 55. The rules are as follows:
+
+- The set of characters that can be in a word is
+  [`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Cn}\p{Co}\p{Alphabetic}\p{N}-[\p{P}-\p{Po}]]`][1],
+  plus U+05F3, U+05F4, and U+0F0B. This notably includes
+  alphabetic and numeric characters, accents and other combining marks,
+  emoji, a few mathematical symbols, a few non-word-separating punctuation marks,
+  unassigned characters, and private-use characters.
+
+- Characters that cannot be in a word separate words.
+  For example, `foo_bar` is segmented `foo`|`bar`
+  because words cannout contain `_`.
+  These characters will be excluded from the output string.
+
+- Words cannot be empty. For example, `_foo__bar_` is segmented `foo`|`bar`,
+  and in snake_case becomes `foo_bar`.
+
+- There is a word boundary between a lowercase (or non-Greek titlecase)
+  and an uppercase (or titlecase) letter. For example, `fooBar` is segmented
+  `foo`|`Bar` because `oB` is a lowercase letter followed by an uppercase letter.
+
+- An uppercase letter followed by a lowercase letter
+  has a word boundary before it. For example, `XMLHttpRequest` is segmented
+  `XML`|`Http`|`Request`; the `Ht` in `HttpRequest` is an uppercase letter
+  followed by a lowercase letter, so there is a word boundary before it.
+
+ - There is always a word boundary before a non-Greek titlecase letter
+   (U+01C5 'ǅ', U+01C8 'ǈ', U+01CB 'ǋ', or U+01F2 'ǲ').
+
+ - For the purpose of the preceding three rules, a letter followed
+   by some number of nonspacing marks (like accents or other diacritics)
+   is treated as if it was the letter alone. For example, `áB` is segmented `á`|`B`.
+
+[1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BCn%7D%5Cp%7BCo%7D%5Cp%7BAlphabetic%7D%5Cp%7BN%7D-%5B%5Cp%7BP%7D-%5Cp%7BPo%7D%5D%5D&abb=on&g=&i=
 
 ## Cases contained in this library:
 

diff --git a/src/lib.rs b/src/lib.rs
@@ -6,25 +6,42 @@
 //!
 //! ## Definition of a word boundary
 //!
-//! Word boundaries are defined by non-alphanumeric characters, as well as
-//! within those words in this manner:
+//! The definition of a word boundary is based on the
+//! [identifier word boundary](https://www.unicode.org/reports/tr55/#Identifier-Chunks)
+//! in Unicode Technical Standard 55. The rules are as follows:
 //!
-//! 1. If an uppercase character is followed by lowercase letters, a word
-//! boundary is considered to be just prior to that uppercase character.
-//! 2. If multiple uppercase characters are consecutive, they are considered to
-//! be within a single word, except that the last will be part of the next word
-//! if it is followed by lowercase characters (see rule 1).
+//! - The set of characters that can be in a word is
+//!   [`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Cn}\p{Co}\p{Alphabetic}\p{N}-[\p{P}-\p{Po}]]`][1],
+//!   plus U+05F3, U+05F4, and U+0F0B. This notably includes
+//!   alphabetic and numeric characters, accents and other combining marks,
+//!   emoji, a few mathematical symbols, a few non-word-separating punctuation marks,
+//!   unassigned characters, and private-use characters.
 //!
-//! That is, "HelloWorld" is segmented `Hello|World` whereas "XMLHttpRequest" is
-//! segmented `XML|Http|Request`.
+//! - Characters that cannot be in a word separate words.
+//!   For example, `foo_bar` is segmented `foo`|`bar`
+//!   because words cannout contain `_`.
+//!   These characters will be excluded from the output string.
 //!
-//! Characters not within words (such as spaces, punctuations, and underscores)
-//! are not included in the output string except as they are a part of the case
-//! being converted to. Multiple adjacent word boundaries (such as a series of
-//! underscores) are folded into one. ("hello__world" in snake case is therefore
-//! "hello_world", not the exact same string). Leading or trailing word boundary
-//! indicators are dropped, except insofar as CamelCase capitalizes the first
-//! word.
+//! - Words cannot be empty. For example, `_foo__bar_` is segmented `foo`|`bar`,
+//!   and in snake_case becomes `foo_bar`.
+//!
+//! - There is a word boundary between a lowercase (or non-Greek titlecase)
+//!   and an uppercase (or titlecase) letter. For example, `fooBar` is segmented
+//!   `foo`|`Bar` because `oB` is a lowercase letter followed by an uppercase letter.
+//!
+//! - An uppercase letter followed by a lowercase letter
+//!   has a word boundary before it. For example, `XMLHttpRequest` is segmented
+//!   `XML`|`Http`|`Request`; the `Ht` in `HttpRequest` is an uppercase letter
+//!   followed by a lowercase letter, so there is a word boundary before it.
+//!
+//!  - There is always a word boundary before a non-Greek titlecase letter
+//!    (U+01C5 'ǅ', U+01C8 'ǈ', U+01CB 'ǋ', or U+01F2 'ǲ').
+//!
+//!  - For the purpose of the preceding three rules, a letter followed
+//!    by some number of nonspacing marks (like accents or other diacritics)
+//!    is treated as if it was the letter alone. For example, `áB` is segmented `á`|`B`.
+//!
+//! [1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BCn%7D%5Cp%7BCo%7D%5Cp%7BAlphabetic%7D%5Cp%7BN%7D-%5B%5Cp%7BP%7D-%5Cp%7BPo%7D%5D%5D&abb=on&g=&i=
 //!
 //! ### Cases contained in this library:
 //!
@@ -47,6 +64,7 @@ mod lower_camel;
 mod shouty_kebab;
 mod shouty_snake;
 mod snake;
+mod tables;
 mod title;
 mod train;
 mod upper_camel;
@@ -58,6 +76,7 @@ pub use shouty_snake::{
     AsShoutySnakeCase, AsShoutySnakeCase as AsShoutySnekCase, ToShoutySnakeCase, ToShoutySnekCase,
 };
 pub use snake::{AsSnakeCase, AsSnakeCase as AsSnekCase, ToSnakeCase, ToSnekCase};
+pub use tables::UNICODE_VERSION;
 pub use title::{AsTitleCase, ToTitleCase};
 pub use train::{AsTrainCase, ToTrainCase};
 pub use upper_camel::{
@@ -66,6 +85,8 @@ pub use upper_camel::{
 
 use core::fmt;
 
+use tables::{is_non_greek_titlecase, CasedLetterKind};
+
 fn transform<F, G>(
     s: &str,
     mut with_word: F,
@@ -76,82 +97,100 @@ where
     F: FnMut(&str, &mut fmt::Formatter) -> fmt::Result,
     G: FnMut(&mut fmt::Formatter) -> fmt::Result,
 {
-    /// Tracks the current 'mode' of the transformation algorithm as it scans
-    /// the input string.
-    ///
-    /// The mode is a tri-state which tracks the case of the last cased
-    /// character of the current word. If there is no cased character
-    /// (either lowercase or uppercase) since the previous word boundary,
-    /// than the mode is `Boundary`. If the last cased character is lowercase,
-    /// then the mode is `Lowercase`. Othertherwise, the mode is
-    /// `Uppercase`.
-    #[derive(Clone, Copy, PartialEq)]
-    enum WordMode {
-        /// There have been no lowercase or uppercase characters in the current
-        /// word.
-        Boundary,
-        /// The previous cased character in the current word is lowercase.
-        Lowercase,
-        /// The previous cased character in the current word is uppercase.
-        Uppercase,
-    }
-
     let mut first_word = true;
 
-    for word in s.split(|c: char| !c.is_alphanumeric()) {
-        let mut char_indices = word.char_indices().peekable();
-        let mut init = 0;
-        let mut mode = WordMode::Boundary;
-
-        while let Some((i, c)) = char_indices.next() {
-            if let Some(&(next_i, next)) = char_indices.peek() {
-                // The mode including the current character, assuming the
-                // current character does not result in a word boundary.
-                let next_mode = if c.is_lowercase() {
-                    WordMode::Lowercase
-                } else if c.is_uppercase() {
-                    WordMode::Uppercase
-                } else {
-                    mode
-                };
-
-                // Word boundary after if current is not uppercase and next
-                // is uppercase
-                if next_mode == WordMode::Lowercase && next.is_uppercase() {
-                    if !first_word {
-                        boundary(f)?;
+    for word in s.split(|c: char| !tables::allowed_in_word(c)) {
+        let mut start_of_word_idx = 0;
+        // Whether the previous character seen, ignoring nonspacing marks,
+        // was lowercase or non-Greek titlecase.
+        // Used for determining CamelBoundaries.
+        let mut prev_was_lowercase_or_non_greek_titlecase = false;
+        // If the previous character seen, ignoring nonspacing marks,
+        // was uppercase or titlecase, then this stores that character's index.
+        // Otherwise, it stores `None`.
+        // Used for determining HATBoundaries.
+        let mut index_of_preceding_uppercase_or_titlecase_letter: Option<usize> = None;
+
+        for (i, c) in word.char_indices() {
+            match tables::letter_casing(c) {
+                None => {
+                    // Nonspacing marks are ignored for the purpose of determining boundaries.
+                    if !tables::is_nonspacing_mark(c) {
+                        prev_was_lowercase_or_non_greek_titlecase = false;
+                        index_of_preceding_uppercase_or_titlecase_letter = None;
+                    }
+                }
+                Some(CasedLetterKind::Lowercase) => {
+                    prev_was_lowercase_or_non_greek_titlecase = true;
+                    // There is a HATBoundary before an uppercase or titlecase letter followed by a lowercase letter
+                    if let Some(preceding_idx) = index_of_preceding_uppercase_or_titlecase_letter {
+                        index_of_preceding_uppercase_or_titlecase_letter = None;
+                        if preceding_idx != start_of_word_idx {
+                            if !first_word {
+                                boundary(f)?;
+                            } else {
+                                first_word = false;
+                            }
+                            with_word(&word[start_of_word_idx..preceding_idx], f)?;
+                            start_of_word_idx = preceding_idx;
+                        }
                     }
-                    with_word(&word[init..next_i], f)?;
-                    first_word = false;
-                    init = next_i;
-                    mode = WordMode::Boundary;
-
-                // Otherwise if current and previous are uppercase and next
-                // is lowercase, word boundary before
-                } else if mode == WordMode::Uppercase && c.is_uppercase() && next.is_lowercase() {
-                    if !first_word {
-                        boundary(f)?;
+                }
+                Some(CasedLetterKind::Uppercase) => {
+                    index_of_preceding_uppercase_or_titlecase_letter = Some(i);
+                    // There is a CamelBoundary before an uppercase letter
+                    // that is preceded by a lowercase or non-Greek titlecase letter
+                    if prev_was_lowercase_or_non_greek_titlecase {
+                        prev_was_lowercase_or_non_greek_titlecase = false;
+                        if !first_word {
+                            boundary(f)?;
+                        } else {
+                            first_word = false;
+                        }
+                        with_word(&word[start_of_word_idx..i], f)?;
+                        start_of_word_idx = i;
+                    }
+                }
+                Some(CasedLetterKind::Titlecase) => {
+                    index_of_preceding_uppercase_or_titlecase_letter = Some(i);
+                    // There is always a HATBoundary before a non-Greek titlecase letter
+                    if is_non_greek_titlecase(c) {
+                        prev_was_lowercase_or_non_greek_titlecase = true;
+                        if i != start_of_word_idx {
+                            if !first_word {
+                                boundary(f)?;
+                            } else {
+                                first_word = false;
+                            }
+                            with_word(&word[start_of_word_idx..i], f)?;
+                            start_of_word_idx = i;
+                        }
                     } else {
-                        first_word = false;
+                        // There is a CamelBoundary before a titlecase letter
+                        // that is preceded by a lowercase or non-Greek titlecase letter
+                        if prev_was_lowercase_or_non_greek_titlecase {
+                            prev_was_lowercase_or_non_greek_titlecase = false;
+                            if !first_word {
+                                boundary(f)?;
+                            } else {
+                                first_word = false;
+                            }
+                            with_word(&word[start_of_word_idx..i], f)?;
+                            start_of_word_idx = i;
+                        }
                     }
-                    with_word(&word[init..i], f)?;
-                    init = i;
-                    mode = WordMode::Boundary;
-
-                // Otherwise no word boundary, just update the mode
-                } else {
-                    mode = next_mode;
                 }
+            }
+        }
+
+        if start_of_word_idx != word.len() {
+            // Collect trailing characters as a word
+            if !first_word {
+                boundary(f)?;
             } else {
-                // Collect trailing characters as a word
-                if !first_word {
-                    boundary(f)?;
-                } else {
-                    first_word = false;
-                }
-                with_word(&word[init..], f)?;
-                break;
+                first_word = false;
             }
+            with_word(&word[start_of_word_idx..], f)?;
         }
     }
 
@@ -179,13 +218,25 @@ fn uppercase(s: &str, f: &mut fmt::Formatter) -> fmt::Result {
     Ok(())
 }
 
-fn capitalize(s: &str, f: &mut fmt::Formatter) -> fmt::Result {
-    let mut char_indices = s.char_indices();
-    if let Some((_, c)) = char_indices.next() {
-        write!(f, "{}", c.to_uppercase())?;
-        if let Some((i, _)) = char_indices.next() {
-            lowercase(&s[i..], f)?;
+fn titlecase(s: &str, f: &mut fmt::Formatter) -> fmt::Result {
+    // Find the first cased character
+    if let Some(titlecase_idx) =
+        s.find(|c| tables::letter_casing(c).is_some() || c.is_lowercase() || c.is_uppercase())
+    {
+        // Everything before the first cased character is passed through unchanged.
+        f.write_str(&s[..titlecase_idx])?;
+
+        let rem = &s[titlecase_idx..];
+        let mut char_indices = rem.char_indices();
+        if let Some((_, c)) = char_indices.next() {
+            write!(f, "{}", tables::to_titlecase(c))?;
+            if let Some((i, _)) = char_indices.next() {
+                lowercase(&rem[i..], f)?;
+            }
         }
+    } else {
+        // If there are no cased characters, pass through the string unchanged
+        write!(f, "{}", s)?;
     }
 
     Ok(())

diff --git a/src/lower_camel.rs b/src/lower_camel.rs
@@ -5,7 +5,7 @@ use alloc::{
     string::{String, ToString},
 };
 
-use crate::{capitalize, lowercase, transform};
+use crate::{lowercase, titlecase, transform};
 
 /// This trait defines a lower camel case conversion.
 ///
@@ -53,7 +53,7 @@ impl<T: AsRef<str>> fmt::Display for AsLowerCamelCase<T> {
                     first = false;
                     lowercase(s, f)
                 } else {
-                    capitalize(s, f)
+                    titlecase(s, f)
                 }
             },
             |_| Ok(()),