diff --git a/matcher/src/chars.rs b/matcher/src/chars.rs index d13a246..3c0e61c 100644 --- a/matcher/src/chars.rs +++ b/matcher/src/chars.rs @@ -1,4 +1,4 @@ -//! Utilities for working with (unicode) characters/codepoints +//! Utilities for working with (Unicode) characters and codepoints. use std::fmt::{self, Debug, Display}; @@ -6,7 +6,7 @@ use std::fmt::{self, Debug, Display}; use crate::chars::case_fold::CASE_FOLDING_SIMPLE; use crate::Config; -//autogenerated by generate-ucd +// autogenerated by generate-ucd #[allow(warnings)] #[rustfmt::skip] #[cfg(feature = "unicode-casefold")] @@ -82,6 +82,7 @@ impl Char for AsciiChar { self } } + fn char_class_non_ascii(c: char) -> CharClass { if c.is_lowercase() { CharClass::Lower @@ -97,6 +98,7 @@ fn char_class_non_ascii(c: char) -> CharClass { CharClass::NonWord } } + impl Char for char { const ASCII: bool = false; #[inline(always)] @@ -149,7 +151,7 @@ pub use normalize::normalize; #[cfg(feature = "unicode-segmentation")] use unicode_segmentation::UnicodeSegmentation; -/// Converts a character to lower case using simple unicode case folding +/// Converts a character to lower case using simple Unicode case folding. #[cfg(feature = "unicode-casefold")] #[inline(always)] pub fn to_lower_case(c: char) -> char { @@ -158,8 +160,9 @@ pub fn to_lower_case(c: char) -> char { .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) } -/// Checks if a character is upper case according to simple unicode case folding. -/// if the `unicode-casefold` feature is disable the equivalent std function is used +/// Checks if a character is upper case according to simple Unicode case folding. +/// +/// If the `unicode-casefold` feature is disabled, the equivalent std function is used instead. #[inline(always)] pub fn is_upper_case(c: char) -> bool { #[cfg(feature = "unicode-casefold")] @@ -182,10 +185,15 @@ pub(crate) enum CharClass { Number, } -/// Nucleo cannot match graphemes as single units. 
To work around -/// that we only use the first codepoint of each grapheme. This -/// iterator returns the first character of each unicode grapheme -/// in a string and is used for constructing `Utf32Str(ing)`. +/// Returns an iterator over single-codepoint representations of each grapheme in the provided +/// text. +/// +/// For the most part, this is simply the first `char` of a grapheme. The main exception is the +/// Windows-style newline `\r\n`, which is normalized to the char `'\n'`. +/// +/// This workaround mainly exists since Nucleo cannot match graphemes as single units, so we +/// must internally map each grapheme to a simpler in-memory representation. This function is used +/// when constructing `Utf32Str(ing)`. pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ { #[cfg(feature = "unicode-segmentation")] let res = text.graphemes(true).map(|grapheme| { diff --git a/matcher/src/chars/normalize.rs b/matcher/src/chars/normalize.rs index 3de501a..6b4bc9d 100644 --- a/matcher/src/chars/normalize.rs +++ b/matcher/src/chars/normalize.rs @@ -1,9 +1,9 @@ /// Normalize a Unicode character by converting Latin characters which are variants -/// of ASCII characters to their latin equivalent. +/// of ASCII characters to their Latin equivalents. /// /// Note that this method acts on single `char`s: if you want to perform full normalization, you /// should first split on graphemes, and then normalize each grapheme by normalizing the first -/// `char` in the grapheme. +/// `char` in each grapheme. See the [`graphemes`](super::graphemes) function for more detail. /// /// If a character does not normalize to a single ASCII character, no normalization is performed. /// @@ -15,7 +15,7 @@ /// - [Latin Extended Additional](https://en.wikipedia.org/wiki/Latin_Extended_Additional) /// - [Superscripts and Subscripts](https://en.wikipedia.org/wiki/Superscripts_and_Subscripts) /// -/// If the character does not fall in this block, it is not normalized. 
+/// If the character does not fall in any of these blocks, it is not normalized. /// /// # Example /// ``` diff --git a/matcher/src/config.rs b/matcher/src/config.rs index 39dc202..4712eb7 100644 --- a/matcher/src/config.rs +++ b/matcher/src/config.rs @@ -29,8 +29,8 @@ pub struct Config { } impl Config { - /// The default config for nucleo, implemented as a constant since - /// Default::default can not be called in a const context + /// The default configuration for nucleo, implemented as a constant since + /// [`Default::default`] cannot be called in a `const` context. pub const DEFAULT: Self = { Config { delimiter_chars: b"/,:;|", diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index 6623e82..1ca7c8c 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -183,9 +183,11 @@ impl Default for Matcher { } impl Matcher { - /// Creates a new matcher instance, note that this will eagerly allocate a - /// fairly large chunk of heap memory (around 135KB currently but subject to - /// change) so matchers should be reused if called often (like in a loop). + /// Creates a new matcher instance. + /// + /// This will eagerly allocate a fairly large chunk of heap memory (around 135KB + /// currently, but subject to change) so matchers should be reused if called often, + /// such as in a loop. pub fn new(config: Config) -> Self { Self { config, diff --git a/matcher/src/utf32_str.rs b/matcher/src/utf32_str.rs index 664dae7..a366cf8 100644 --- a/matcher/src/utf32_str.rs +++ b/matcher/src/utf32_str.rs @@ -95,6 +95,7 @@ fn has_ascii_graphemes(string: &str) -> bool { #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)] pub enum Utf32Str<'a> { /// A string represented as ASCII encoded bytes. + /// /// Correctness invariant: must only contain valid ASCII (`<= 127`) Ascii(&'a [u8]), /// A string represented as an array of unicode codepoints (basically UTF-32). 
@@ -301,7 +302,8 @@ impl DoubleEndedIterator for Chars<'_> { /// See the API documentation for [`Utf32Str`] for more detail. pub enum Utf32String { /// A string represented as ASCII encoded bytes. - /// Correctness invariant: must only contain valid ASCII (<=127) + /// + /// Correctness invariant: must only contain valid ASCII (`<= 127`) Ascii(Box<str>), /// A string represented as an array of unicode codepoints (basically UTF-32). Unicode(Box<[char]>),