From 2c5c8e58f6cc17c6966cee55dec65eecb73f79ff Mon Sep 17 00:00:00 2001 From: Alex Povel Date: Mon, 29 May 2023 20:27:32 +0200 Subject: [PATCH] refactor: Use `unicode_titlecase`'s `to_titlecase_lower_rest`, remove own implementation Another nice win. See also #13 --- Cargo.lock | 8 ++++ common/Cargo.toml | 1 + common/src/strings.rs | 40 ++----------------- core/Cargo.toml | 1 + core/src/stages/german/driver.rs | 7 ++-- .../snapshots/cli__german-001_german.snap | 2 +- 6 files changed, 19 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4506b491..d70f8280 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -122,6 +122,7 @@ dependencies = [ "paste", "rstest", "serde", + "unicode_titlecase", ] [[package]] @@ -289,6 +290,7 @@ dependencies = [ "paste", "rstest", "serde", + "unicode_titlecase", ] [[package]] @@ -1137,6 +1139,12 @@ version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" +[[package]] +name = "unicode_titlecase" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e18f7720914194b7b39299e3bb21acca39a52437f1e4bcc3ab24961dcb1b662" + [[package]] name = "utf8parse" version = "0.2.1" diff --git a/common/Cargo.toml b/common/Cargo.toml index 477a2caf..2823d086 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -11,6 +11,7 @@ log = "0.4.17" itertools = "0.10.5" paste = "1.0.12" serde = { version = "1.0.163", features = ["derive"] } +unicode_titlecase = "2.0.0" [dev-dependencies] rstest = "0.17.0" diff --git a/common/src/strings.rs b/common/src/strings.rs index db8731cc..b086117c 100644 --- a/common/src/strings.rs +++ b/common/src/strings.rs @@ -1,23 +1,5 @@ use log::trace; - -pub fn titlecase(word: &str) -> String { - let mut chars = word.chars(); - let mut result = String::with_capacity(word.len()); - - if let Some(c) = chars.next() { - for upper in c.to_uppercase() { - result.push(upper); - } - } - - for c in chars { - for lower in c.to_lowercase() { - result.push(lower); - } - } - - result -} +use unicode_titlecase::StrTitleCase; pub fn is_compound_word(word: &str, predicate: &impl Fn(&str) -> bool) -> bool { trace!("Checking if word is valid compound word: '{}'", word); @@ -47,9 +29,10 @@ pub fn is_compound_word(word: &str, predicate: &impl Fn(&str) -> bool) -> bool { suffix ); - predicate(&titlecase(suffix)) + let tc = suffix.to_titlecase_lower_rest(); + predicate(&tc) || predicate(suffix) - || is_compound_word(&titlecase(suffix), predicate) + || is_compound_word(&tc, predicate) || is_compound_word(suffix, predicate) } None => false, @@ -61,21 +44,6 @@ mod tests { use super::*; use rstest::rstest; - #[rstest] - #[case("hello", "Hello")] - #[case("bItTe", "Bitte")] - #[case("dANKE", "Danke")] - #[case("übel", "Übel")] - #[case("uebel", "Uebel")] - #[case("😀", "😀")] - #[case("ßuper", "SSuper")] - #[case("ẞuperduper", "ẞuperduper")] - #[case("WOW!!", "Wow!!")] - #[case("ẞß", "ẞß")] - fn test_titlecase(#[case] word: &str, #[case] expected: &str) { - assert_eq!(titlecase(word), expected); - } - const WORDS: &[&str] = &["Süßwasser", "schwimm", "Bäder", "Mauer", "Dübel", "Kübel"]; #[rstest] diff --git a/core/Cargo.toml b/core/Cargo.toml index ad688645..204268b7 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -24,6 +24,7 @@ env_logger = "0.10.0" itertools = "0.10.5" log = "0.4.17" common = { path = "../common" } +unicode_titlecase = "2.0.0" [features] default = ["all"] diff --git a/core/src/stages/german/driver.rs b/core/src/stages/german/driver.rs index 5760c7c2..918a16a8 100644 --- a/core/src/stages/german/driver.rs +++ b/core/src/stages/german/driver.rs @@ -9,9 +9,10 @@ use crate::stages::{ use cached::proc_macro::cached; use cached::SizedCache; use common::lookup::binary_search_uneven; -use common::strings::{is_compound_word, titlecase}; +use common::strings::is_compound_word; use itertools::Itertools; use log::{debug, trace}; +use unicode_titlecase::StrTitleCase; static VALID_GERMAN_WORDS: &str = include_str!(concat!(env!("OUT_DIR"), "/de.txt")); // Generated in `build.rs`. @@ -410,7 +411,7 @@ fn is_valid(word: &str, predicate: &impl Fn(&str) -> bool) -> bool { } Ok(WordCasing::AllUppercase) => { // Convert to something sensible before proceeding. - let tc = titlecase(word); + let tc = word.to_titlecase_lower_rest(); debug_assert!( WordCasing::try_from(tc.as_str()) == Ok(WordCasing::Titlecase), "Titlecased word, but isn't categorized correctly." @@ -423,7 +424,7 @@ fn is_valid(word: &str, predicate: &impl Fn(&str) -> bool) -> bool { // treatment. match word.chars().next() { Some(c) if c.is_uppercase() => { - let tc = titlecase(word); + let tc = word.to_titlecase_lower_rest(); debug_assert!( WordCasing::try_from(tc.as_str()) == Ok(WordCasing::Titlecase), "Titlecased word, but isn't categorized correctly." diff --git a/core/tests/snapshots/cli__german-001_german.snap b/core/tests/snapshots/cli__german-001_german.snap index 4a638f18..3bcf1ebb 100644 --- a/core/tests/snapshots/cli__german-001_german.snap +++ b/core/tests/snapshots/cli__german-001_german.snap @@ -9,7 +9,7 @@ Zwölf Boxkämpfer jagen Viktor quer über den großen Sylter Deich. Vogel Quax zwickt Johnys Pferd Bim. Sylvia wagt quick den Jux bei Pforzheim. Polyfon zwitschernd aßen Maexchens Vögel Rüben, Joghurt und Quark. -"Fix, Schwyz!" quäkt Jürgen blöd vom Paß. +"Fix, Schwyz!" quäkt Jürgen blöd vom Pass. Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich. Falsches Üben von Xylophonmusik quält jeden größeren Zwerg. Heizölrückstoßabdämpfung.