Improvements to grapheme truncation for Utf32Str(ing)

This commit corrects the internal handling of grapheme truncation. Most notably, it fixes two bugs with the previous implementation of Utf32Str(ing): 1. Fixes a bug where an Ascii variant could have been returned even though the original string was not ASCII. (The converse, where a Unicode variant consists only of ASCII, is totally fine). 2. Fixes the handling of windows-style newline (i.e. `\r\n`) since these are single graphemes. Moreover, the `\r\n` grapheme is now mapped to `\n` rather than `\r`. In particular, Utf32Str(ing)s constructed from text containing windows-style newlines will result in Unicode variants, even if the string is entirely valid Ascii.
helix-editor · Dec 7, 2024 · 36d3747 · 36d3747
1 parent 474edc7
commit 36d3747
Show file tree

Hide file tree

Showing 3 changed files with 154 additions and 30 deletions.
diff --git a/matcher/src/chars.rs b/matcher/src/chars.rs
@@ -189,10 +189,17 @@ pub(crate) enum CharClass {
 pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
     #[cfg(feature = "unicode-segmentation")]
     let res = text.graphemes(true).map(|grapheme| {
-        grapheme
-            .chars()
-            .next()
-            .expect("graphemes must be non-empty")
+        // we need to special-case this check since `\r\n` is a single grapheme and is
+        // therefore the exception to the rule that normalization of a grapheme should
+        // map to the first character.
+        if grapheme == "\r\n" {
+            '\n'
+        } else {
+            grapheme
+                .chars()
+                .next()
+                .expect("graphemes must be non-empty")
+        }
     });
     #[cfg(not(feature = "unicode-segmentation"))]
     let res = text.chars();

diff --git a/matcher/src/utf32_str.rs b/matcher/src/utf32_str.rs
@@ -1,56 +1,115 @@
+#[cfg(test)]
+mod tests;
+
 use std::borrow::Cow;
 use std::ops::{Bound, RangeBounds};
 use std::{fmt, slice};
 
+use memchr::memmem;
+
 use crate::chars;
 
-/// A UTF32 encoded (char array) string that is used as an input to (fuzzy) matching.
+/// Check if a given string can be represented internally as the `Ascii` variant in a
+/// [`Utf32String`] or a [`Utf32Str`].
+///
+/// This returns true if the string is ASCII and does not contain a windows-style newline
+/// `'\r'`.
+/// The additional carriage return check is required since even for strings consisting only
+/// of ASCII, the windows-style newline `\r\n` is treated as a single grapheme.
+#[inline]
+fn has_ascii_graphemes(string: &str) -> bool {
+    string.is_ascii() && memmem::find(string.as_bytes(), b"\r\n").is_none()
+}
+
+/// A UTF-32 encoded (char array) string that is used as an input to (fuzzy) matching.
+///
+/// This is mostly intended as an internal string type, but some methods are exposed for
+/// convenience. We make the following API guarantees for `Utf32Str(ing)`s produced from a string
+/// using one of its `From<T>` constructors for string types `T` or from the
+/// [`Utf32Str::new`] method.
+///
+/// 1. The `Ascii` variant contains a byte buffer which is guaranteed to be a valid string
+///    slice.
+/// 2. It is guaranteed that the string slice internal to the `Ascii` variant is identical
+///    to the original string.
+/// 3. The length of a `Utf32Str(ing)` is exactly the number of graphemes in the original string.
+///
+/// Since `Utf32Str(ing)`s variants may be constructed directly, you **must not** make these
+/// assumptions when handling `Utf32Str(ing)`s of unknown origin.
+///
+/// ## Caveats
+/// Despite the name, this type is quite far from being a true string type. Here are some
+/// examples demonstrating this.
 ///
-/// Usually rusts' utf8 encoded strings are great. However during fuzzy matching
-/// operates on codepoints (it should operate on graphemes but that's too much
-/// hassle to deal with). We want to quickly iterate these codepoints between
-/// (up to 5 times) during matching.
+/// ### String conversions are not round-trip
+/// In the presence of a multi-codepoint grapheme (e.g. `"u\u{0308}"` which is `u +
+/// COMBINING_DIARESIS`), the trailing codepoints are truncated.
+/// ```
+/// # use nucleo_matcher::Utf32String;
+/// assert_eq!(Utf32String::from("u\u{0308}").to_string(), "u");
+/// ```
+///
+/// ### Indexing is done by grapheme
+/// Indexing into a string is done by grapheme rather than by codepoint.
+/// ```
+/// # use nucleo_matcher::Utf32String;
+/// assert!(Utf32String::from("au\u{0308}").len() == 2);
+/// ```
+///
+/// ### A `Unicode` variant may be produced by all-ASCII characters.
+/// Since the windows-style newline `\r\n` is ASCII only but considered to be a single grapheme,
+/// strings containing `\r\n` will still result in a `Unicode` variant.
+/// ```
+/// # use nucleo_matcher::Utf32String;
+/// let s = Utf32String::from("\r\n");
+/// assert!(!s.slice(..).is_ascii());
+/// assert!(s.len() == 1);
+/// assert!(s.slice(..).get(0) == '\n');
+/// ```
+///
+/// ## Design rationale
+/// Usually Rust's UTF-8 encoded strings are great. However, since fuzzy matching
+/// operates on codepoints (ideally, it should operate on graphemes but that's too
+/// much hassle to deal with), we want to quickly iterate over codepoints (up to 5
+/// times) during matching.
 ///
 /// Doing codepoint segmentation on the fly not only blows trough the cache
-/// (lookuptables and Icache) but also has nontrivial runtime compared to the
-/// matching itself. Furthermore there are a lot of exta optimizations available
-/// for ascii only text (but checking during each match has too much overhead).
+/// (lookup tables and I-cache) but also has nontrivial runtime compared to the
+/// matching itself. Furthermore there are many extra optimizations available
+/// for ASCII only text, but checking each match has too much overhead.
 ///
-/// Ofcourse this comes at exta memory cost as we usually still need the ut8
-/// encoded variant for rendering. In the (dominant) case of ascii-only text
+/// Of course, this comes at extra memory cost as we usually still need the UTF-8
+/// encoded variant for rendering. In the (dominant) case of ASCII-only text
 /// we don't require a copy. Furthermore fuzzy matching usually is applied while
 /// the user is typing on the fly so the same item is potentially matched many
-/// times (making the the upfront cost more worth it). That means that its
-/// basically always worth it to presegment the string.
+/// times (making the the up-front cost more worth it). That means that its
+/// basically always worth it to pre-segment the string.
 ///
 /// For usecases that only match (a lot of) strings once its possible to keep
-/// char buffer around that is filled with the presegmented chars
+/// char buffer around that is filled with the presegmented chars.
 ///
 /// Another advantage of this approach is that the matcher will naturally
-/// produce char indices (instead of utf8 offsets) anyway. With a
+/// produce grapheme indices (instead of utf8 offsets) anyway. With a
 /// codepoint basic representation like this the indices can be used
 /// directly
 #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
 pub enum Utf32Str<'a> {
     /// A string represented as ASCII encoded bytes.
-    /// Correctness invariant: must only contain valid ASCII (<=127)
+    /// Correctness invariant: must only contain valid ASCII (`<= 127`)
     Ascii(&'a [u8]),
     /// A string represented as an array of unicode codepoints (basically UTF-32).
     Unicode(&'a [char]),
 }
 
 impl<'a> Utf32Str<'a> {
-    /// Convenience method to construct a `Utf32Str` from a normal utf8 str
+    /// Convenience method to construct a `Utf32Str` from a normal UTF-8 str
     pub fn new(str: &'a str, buf: &'a mut Vec<char>) -> Self {
-        if str.is_ascii() {
+        if has_ascii_graphemes(str) {
             Utf32Str::Ascii(str.as_bytes())
         } else {
             buf.clear();
             buf.extend(crate::chars::graphemes(str));
-            if buf.iter().all(|c| c.is_ascii()) {
-                return Utf32Str::Ascii(str.as_bytes());
-            }
-            Utf32Str::Unicode(&*buf)
+            Utf32Str::Unicode(buf)
         }
     }
 
@@ -107,7 +166,7 @@ impl<'a> Utf32Str<'a> {
         }
     }
 
-    /// Returns the number of leading whitespaces in this string
+    /// Returns the number of trailing whitespaces in this string
     #[inline]
     pub(crate) fn trailing_white_space(self) -> usize {
         match self {
@@ -144,25 +203,36 @@ impl<'a> Utf32Str<'a> {
         }
     }
 
-    /// Returns whether this string only contains ascii text.
+    /// Returns whether this string only contains graphemes which are single ASCII chars.
+    ///
+    /// This is almost equivalent to the string being ASCII, except with the additional requirement
+    /// that the string cannot contain a windows-style newline `\r\n` which is treated as a single
+    /// grapheme.
     pub fn is_ascii(self) -> bool {
         matches!(self, Utf32Str::Ascii(_))
     }
 
-    /// Returns the `n`th character in this string.
+    /// Returns the `n`th character in this string, zero-indexed
     pub fn get(self, n: u32) -> char {
         match self {
             Utf32Str::Ascii(bytes) => bytes[n as usize] as char,
             Utf32Str::Unicode(codepoints) => codepoints[n as usize],
         }
     }
+
+    /// Returns the last character in this string.
+    ///
+    /// Panics if the string is empty.
     pub(crate) fn last(self) -> char {
         match self {
             Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char,
             Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1],
         }
     }
 
+    /// Returns the first character in this string.
+    ///
+    /// Panics if the string is empty.
     pub(crate) fn first(self) -> char {
         match self {
             Utf32Str::Ascii(bytes) => bytes[0] as char,
@@ -204,6 +274,7 @@ pub enum Chars<'a> {
     Ascii(slice::Iter<'a, u8>),
     Unicode(slice::Iter<'a, char>),
 }
+
 impl Iterator for Chars<'_> {
     type Item = char;
 
@@ -226,6 +297,8 @@ impl DoubleEndedIterator for Chars<'_> {
 
 #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
 /// An owned version of [`Utf32Str`].
+///
+/// See the API documentation for [`Utf32Str`] for more detail.
 pub enum Utf32String {
     /// A string represented as ASCII encoded bytes.
     /// Correctness invariant: must only contain valid ASCII (<=127)
@@ -307,7 +380,7 @@ impl Utf32String {
 impl From<&str> for Utf32String {
     #[inline]
     fn from(value: &str) -> Self {
-        if value.is_ascii() {
+        if has_ascii_graphemes(value) {
             Self::Ascii(value.to_owned().into_boxed_str())
         } else {
             Self::Unicode(chars::graphemes(value).collect())
@@ -317,7 +390,7 @@ impl From<&str> for Utf32String {
 
 impl From<Box<str>> for Utf32String {
     fn from(value: Box<str>) -> Self {
-        if value.is_ascii() {
+        if has_ascii_graphemes(&value) {
             Self::Ascii(value)
         } else {
             Self::Unicode(chars::graphemes(&value).collect())

diff --git a/matcher/src/utf32_str/tests.rs b/matcher/src/utf32_str/tests.rs
@@ -0,0 +1,44 @@
+use crate::{Utf32Str, Utf32String};
+
+#[test]
+fn test_utf32str_ascii() {
+    /// Helper function for testing
+    fn expect_ascii(src: &str, is_ascii: bool) {
+        let mut buffer = Vec::new();
+        assert!(Utf32Str::new(src, &mut buffer).is_ascii() == is_ascii);
+        assert!(Utf32String::from(src).slice(..).is_ascii() == is_ascii);
+        assert!(Utf32String::from(src.to_owned()).slice(..).is_ascii() == is_ascii);
+    }
+
+    // ascii
+    expect_ascii("", true);
+    expect_ascii("a", true);
+    expect_ascii("a\nb", true);
+    expect_ascii("\n\r", true);
+
+    // not ascii
+    expect_ascii("aü", false);
+    expect_ascii("au\u{0308}", false);
+
+    // windows-style newline
+    expect_ascii("a\r\nb", false);
+    expect_ascii("ü\r\n", false);
+    expect_ascii("\r\n", false);
+}
+
+#[test]
+fn test_grapheme_truncation() {
+    // ascii is preserved
+    let s = Utf32String::from("ab");
+    assert_eq!(s.slice(..).get(0), 'a');
+    assert_eq!(s.slice(..).get(1), 'b');
+
+    // windows-style newline is truncated to '\n'
+    let s = Utf32String::from("\r\n");
+    assert_eq!(s.slice(..).get(0), '\n');
+
+    // normal graphemes are truncated to the first character
+    let s = Utf32String::from("u\u{0308}\r\n");
+    assert_eq!(s.slice(..).get(0), 'u');
+    assert_eq!(s.slice(..).get(1), '\n');
+}