Skip to content

Commit

Permalink
Improvements to grapheme truncation for Utf32Str(ing)
Browse files Browse the repository at this point in the history
This commit corrects the internal handling of grapheme truncation. Most
notably, it fixes two bugs with the previous implementation of
Utf32Str(ing):

1. Fixes a bug where an Ascii variant could have been returned even
   though the original string was not ASCII. (The converse, where a
   Unicode variant consists only of ASCII, is totally fine).
2. Fixes the handling of windows-style newline (i.e. `\r\n`)
   since these are single graphemes. Moreover, the `\r\n` grapheme is
   now mapped to `\n` rather than `\r`. In particular, Utf32Str(ing)s
   constructed from text containing windows-style newlines will result
   in Unicode variants, even if the string is entirely valid Ascii.
  • Loading branch information
alexrutar committed Dec 7, 2024
1 parent 474edc7 commit 36d3747
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 30 deletions.
15 changes: 11 additions & 4 deletions matcher/src/chars.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,10 +189,17 @@ pub(crate) enum CharClass {
pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
#[cfg(feature = "unicode-segmentation")]
let res = text.graphemes(true).map(|grapheme| {
grapheme
.chars()
.next()
.expect("graphemes must be non-empty")
// we need to special-case this check since `\r\n` is a single grapheme and is
// therefore the exception to the rule that normalization of a grapheme should
// map to the first character.
if grapheme == "\r\n" {
'\n'
} else {
grapheme
.chars()
.next()
.expect("graphemes must be non-empty")
}
});
#[cfg(not(feature = "unicode-segmentation"))]
let res = text.chars();
Expand Down
125 changes: 99 additions & 26 deletions matcher/src/utf32_str.rs
Original file line number Diff line number Diff line change
@@ -1,56 +1,115 @@
#[cfg(test)]
mod tests;

use std::borrow::Cow;
use std::ops::{Bound, RangeBounds};
use std::{fmt, slice};

use memchr::memmem;

use crate::chars;

/// A UTF32 encoded (char array) string that is used as an input to (fuzzy) matching.
/// Check if a given string can be represented internally as the `Ascii` variant in a
/// [`Utf32String`] or a [`Utf32Str`].
///
/// This returns true if the string is ASCII and does not contain a windows-style newline
/// `'\r'`.
/// The additional carriage return check is required since even for strings consisting only
/// of ASCII, the windows-style newline `\r\n` is treated as a single grapheme.
#[inline]
fn has_ascii_graphemes(string: &str) -> bool {
string.is_ascii() && memmem::find(string.as_bytes(), b"\r\n").is_none()
}

/// A UTF-32 encoded (char array) string that is used as an input to (fuzzy) matching.
///
/// This is mostly intended as an internal string type, but some methods are exposed for
/// convenience. We make the following API guarantees for `Utf32Str(ing)`s produced from a string
/// using one of its `From<T>` constructors for string types `T` or from the
/// [`Utf32Str::new`] method.
///
/// 1. The `Ascii` variant contains a byte buffer which is guaranteed to be a valid string
/// slice.
/// 2. It is guaranteed that the string slice internal to the `Ascii` variant is identical
/// to the original string.
/// 3. The length of a `Utf32Str(ing)` is exactly the number of graphemes in the original string.
///
/// Since `Utf32Str(ing)`s variants may be constructed directly, you **must not** make these
/// assumptions when handling `Utf32Str(ing)`s of unknown origin.
///
/// ## Caveats
/// Despite the name, this type is quite far from being a true string type. Here are some
/// examples demonstrating this.
///
/// Usually rusts' utf8 encoded strings are great. However during fuzzy matching
/// operates on codepoints (it should operate on graphemes but that's too much
/// hassle to deal with). We want to quickly iterate these codepoints between
/// (up to 5 times) during matching.
/// ### String conversions are not round-trip
/// In the presence of a multi-codepoint grapheme (e.g. `"u\u{0308}"` which is `u +
/// COMBINING_DIARESIS`), the trailing codepoints are truncated.

Check warning on line 46 in matcher/src/utf32_str.rs

View workflow job for this annotation

GitHub Actions / Typos

"DIARESIS" should be "DIAERESIS".
/// ```
/// # use nucleo_matcher::Utf32String;
/// assert_eq!(Utf32String::from("u\u{0308}").to_string(), "u");
/// ```
///
/// ### Indexing is done by grapheme
/// Indexing into a string is done by grapheme rather than by codepoint.
/// ```
/// # use nucleo_matcher::Utf32String;
/// assert!(Utf32String::from("au\u{0308}").len() == 2);
/// ```
///
/// ### A `Unicode` variant may be produced by all-ASCII characters.
/// Since the windows-style newline `\r\n` is ASCII only but considered to be a single grapheme,
/// strings containing `\r\n` will still result in a `Unicode` variant.
/// ```
/// # use nucleo_matcher::Utf32String;
/// let s = Utf32String::from("\r\n");
/// assert!(!s.slice(..).is_ascii());
/// assert!(s.len() == 1);
/// assert!(s.slice(..).get(0) == '\n');
/// ```
///
/// ## Design rationale
/// Usually Rust's UTF-8 encoded strings are great. However, since fuzzy matching
/// operates on codepoints (ideally, it should operate on graphemes but that's too
/// much hassle to deal with), we want to quickly iterate over codepoints (up to 5
/// times) during matching.
///
/// Doing codepoint segmentation on the fly not only blows trough the cache
/// (lookuptables and Icache) but also has nontrivial runtime compared to the
/// matching itself. Furthermore there are a lot of exta optimizations available
/// for ascii only text (but checking during each match has too much overhead).
/// (lookup tables and I-cache) but also has nontrivial runtime compared to the
/// matching itself. Furthermore there are many extra optimizations available
/// for ASCII only text, but checking each match has too much overhead.
///
/// Ofcourse this comes at exta memory cost as we usually still need the ut8
/// encoded variant for rendering. In the (dominant) case of ascii-only text
/// Of course, this comes at extra memory cost as we usually still need the UTF-8
/// encoded variant for rendering. In the (dominant) case of ASCII-only text
/// we don't require a copy. Furthermore fuzzy matching usually is applied while
/// the user is typing on the fly so the same item is potentially matched many
/// times (making the the upfront cost more worth it). That means that its
/// basically always worth it to presegment the string.
/// times (making the the up-front cost more worth it). That means that its
/// basically always worth it to pre-segment the string.
///
/// For usecases that only match (a lot of) strings once its possible to keep
/// char buffer around that is filled with the presegmented chars
/// char buffer around that is filled with the presegmented chars.
///
/// Another advantage of this approach is that the matcher will naturally
/// produce char indices (instead of utf8 offsets) anyway. With a
/// produce grapheme indices (instead of utf8 offsets) anyway. With a
/// codepoint basic representation like this the indices can be used
/// directly
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
pub enum Utf32Str<'a> {
/// A string represented as ASCII encoded bytes.
/// Correctness invariant: must only contain valid ASCII (<=127)
/// Correctness invariant: must only contain valid ASCII (`<= 127`)
Ascii(&'a [u8]),
/// A string represented as an array of unicode codepoints (basically UTF-32).
Unicode(&'a [char]),
}

impl<'a> Utf32Str<'a> {
/// Convenience method to construct a `Utf32Str` from a normal utf8 str
/// Convenience method to construct a `Utf32Str` from a normal UTF-8 str
pub fn new(str: &'a str, buf: &'a mut Vec<char>) -> Self {
if str.is_ascii() {
if has_ascii_graphemes(str) {
Utf32Str::Ascii(str.as_bytes())
} else {
buf.clear();
buf.extend(crate::chars::graphemes(str));
if buf.iter().all(|c| c.is_ascii()) {
return Utf32Str::Ascii(str.as_bytes());
}
Utf32Str::Unicode(&*buf)
Utf32Str::Unicode(buf)
}
}

Expand Down Expand Up @@ -107,7 +166,7 @@ impl<'a> Utf32Str<'a> {
}
}

/// Returns the number of leading whitespaces in this string
/// Returns the number of trailing whitespaces in this string
#[inline]
pub(crate) fn trailing_white_space(self) -> usize {
match self {
Expand Down Expand Up @@ -144,25 +203,36 @@ impl<'a> Utf32Str<'a> {
}
}

/// Returns whether this string only contains ascii text.
/// Returns whether this string only contains graphemes which are single ASCII chars.
///
/// This is almost equivalent to the string being ASCII, except with the additional requirement
/// that the string cannot contain a windows-style newline `\r\n` which is treated as a single
/// grapheme.
pub fn is_ascii(self) -> bool {
matches!(self, Utf32Str::Ascii(_))
}

/// Returns the `n`th character in this string.
/// Returns the `n`th character in this string, zero-indexed
pub fn get(self, n: u32) -> char {
match self {
Utf32Str::Ascii(bytes) => bytes[n as usize] as char,
Utf32Str::Unicode(codepoints) => codepoints[n as usize],
}
}

/// Returns the last character in this string.
///
/// Panics if the string is empty.
pub(crate) fn last(self) -> char {
match self {
Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char,
Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1],
}
}

/// Returns the first character in this string.
///
/// Panics if the string is empty.
pub(crate) fn first(self) -> char {
match self {
Utf32Str::Ascii(bytes) => bytes[0] as char,
Expand Down Expand Up @@ -204,6 +274,7 @@ pub enum Chars<'a> {
Ascii(slice::Iter<'a, u8>),
Unicode(slice::Iter<'a, char>),
}

impl Iterator for Chars<'_> {
type Item = char;

Expand All @@ -226,6 +297,8 @@ impl DoubleEndedIterator for Chars<'_> {

#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
/// An owned version of [`Utf32Str`].
///
/// See the API documentation for [`Utf32Str`] for more detail.
pub enum Utf32String {
/// A string represented as ASCII encoded bytes.
/// Correctness invariant: must only contain valid ASCII (<=127)
Expand Down Expand Up @@ -307,7 +380,7 @@ impl Utf32String {
impl From<&str> for Utf32String {
#[inline]
fn from(value: &str) -> Self {
if value.is_ascii() {
if has_ascii_graphemes(value) {
Self::Ascii(value.to_owned().into_boxed_str())
} else {
Self::Unicode(chars::graphemes(value).collect())
Expand All @@ -317,7 +390,7 @@ impl From<&str> for Utf32String {

impl From<Box<str>> for Utf32String {
fn from(value: Box<str>) -> Self {
if value.is_ascii() {
if has_ascii_graphemes(&value) {
Self::Ascii(value)
} else {
Self::Unicode(chars::graphemes(&value).collect())
Expand Down
44 changes: 44 additions & 0 deletions matcher/src/utf32_str/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
use crate::{Utf32Str, Utf32String};

#[test]
fn test_utf32str_ascii() {
/// Helper function for testing
fn expect_ascii(src: &str, is_ascii: bool) {
let mut buffer = Vec::new();
assert!(Utf32Str::new(src, &mut buffer).is_ascii() == is_ascii);
assert!(Utf32String::from(src).slice(..).is_ascii() == is_ascii);
assert!(Utf32String::from(src.to_owned()).slice(..).is_ascii() == is_ascii);
}

// ascii
expect_ascii("", true);
expect_ascii("a", true);
expect_ascii("a\nb", true);
expect_ascii("\n\r", true);

// not ascii
expect_ascii("aü", false);
expect_ascii("au\u{0308}", false);

// windows-style newline
expect_ascii("a\r\nb", false);
expect_ascii(\r\n", false);
expect_ascii("\r\n", false);
}

#[test]
fn test_grapheme_truncation() {
// ascii is preserved
let s = Utf32String::from("ab");
assert_eq!(s.slice(..).get(0), 'a');
assert_eq!(s.slice(..).get(1), 'b');

// windows-style newline is truncated to '\n'
let s = Utf32String::from("\r\n");
assert_eq!(s.slice(..).get(0), '\n');

// normal graphemes are truncated to the first character
let s = Utf32String::from("u\u{0308}\r\n");
assert_eq!(s.slice(..).get(0), 'u');
assert_eq!(s.slice(..).get(1), '\n');
}

0 comments on commit 36d3747

Please sign in to comment.