Skip to content

Commit

Permalink
Minor doc fixes in nucleo_matcher
Browse files Browse the repository at this point in the history
  • Loading branch information
alexrutar committed Dec 13, 2024
1 parent d9d707d commit f16469b
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 18 deletions.
26 changes: 17 additions & 9 deletions matcher/src/chars.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
//! Utilities for working with (unicode) characters/codepoints
//! Utilities for working with (Unicode) characters and codepoints.
use std::fmt::{self, Debug, Display};

#[cfg(feature = "unicode-casefold")]
use crate::chars::case_fold::CASE_FOLDING_SIMPLE;
use crate::Config;

//autogenerated by generate-ucd
// autogenerated by generate-ucd
#[allow(warnings)]
#[rustfmt::skip]
#[cfg(feature = "unicode-casefold")]
Expand Down Expand Up @@ -82,6 +82,7 @@ impl Char for AsciiChar {
self
}
}

fn char_class_non_ascii(c: char) -> CharClass {
if c.is_lowercase() {
CharClass::Lower
Expand All @@ -97,6 +98,7 @@ fn char_class_non_ascii(c: char) -> CharClass {
CharClass::NonWord
}
}

impl Char for char {
const ASCII: bool = false;
#[inline(always)]
Expand Down Expand Up @@ -149,7 +151,7 @@ pub use normalize::normalize;
#[cfg(feature = "unicode-segmentation")]
use unicode_segmentation::UnicodeSegmentation;

/// Converts a character to lower case using simple unicode case folding
/// Converts a character to lower case using simple Unicode case folding.
#[cfg(feature = "unicode-casefold")]
#[inline(always)]
pub fn to_lower_case(c: char) -> char {
Expand All @@ -158,8 +160,9 @@ pub fn to_lower_case(c: char) -> char {
.map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1)
}

/// Checks if a character is upper case according to simple unicode case folding.
/// if the `unicode-casefold` feature is disable the equivalent std function is used
/// Checks if a character is upper case according to simple Unicode case folding.
///
/// If the `unicode-casefold` feature is disabled, the equivalent std function is used instead.
#[inline(always)]
pub fn is_upper_case(c: char) -> bool {
#[cfg(feature = "unicode-casefold")]
Expand All @@ -182,10 +185,15 @@ pub(crate) enum CharClass {
Number,
}

/// Nucleo cannot match graphemes as single units. To work around
/// that we only use the first codepoint of each grapheme. This
/// iterator returns the first character of each unicode grapheme
/// in a string and is used for constructing `Utf32Str(ing)`.
/// Returns an iterator over single-codepoint representations of each grapheme in the provided
/// text.
///
/// For the most part, this is simply the first `char` of a grapheme. The main exception is the
/// Windows-style newline `\r\n`, which is normalized to the char `'\n'`.
///
/// This workaround mainly exists since Nucleo cannot match graphemes as single units, so we
/// must internally map each grapheme to a simpler in-memory representation. This method is used
/// when constructing `Utf32Str(ing)`.
pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
#[cfg(feature = "unicode-segmentation")]
let res = text.graphemes(true).map(|grapheme| {
Expand Down
6 changes: 3 additions & 3 deletions matcher/src/chars/normalize.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
/// Normalize a Unicode character by converting Latin characters which are variants
/// of ASCII characters to their latin equivalent.
/// of ASCII characters to their Latin equivalents.
///
/// Note that this method acts on single `char`s: if you want to perform full normalization, you
/// should first split on graphemes, and then normalize each grapheme by normalizing the first
/// `char` in the grapheme.
/// `char` in each grapheme. See the [`graphemes`](super::graphemes) function for more detail.
///
/// If a character does not normalize to a single ASCII character, no normalization is performed.
///
Expand All @@ -15,7 +15,7 @@
/// - [Latin Extended Additional](https://en.wikipedia.org/wiki/Latin_Extended_Additional)
/// - [Superscripts and Subscripts](https://en.wikipedia.org/wiki/Superscripts_and_Subscripts)
///
/// If the character does not fall in this block, it is not normalized.
/// If the character does not fall in any of these blocks, it is not normalized.
///
/// # Example
/// ```
Expand Down
4 changes: 2 additions & 2 deletions matcher/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ pub struct Config {
}

impl Config {
/// The default config for nucleo, implemented as a constant since
/// Default::default can not be called in a const context
/// The default configuration for nucleo, implemented as a constant since
/// [`Default::default`] cannot be called in a `const` context.
pub const DEFAULT: Self = {
Config {
delimiter_chars: b"/,:;|",
Expand Down
8 changes: 5 additions & 3 deletions matcher/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,11 @@ impl Default for Matcher {
}

impl Matcher {
/// Creates a new matcher instance, note that this will eagerly allocate a
/// fairly large chunk of heap memory (around 135KB currently but subject to
/// change) so matchers should be reused if called often (like in a loop).
/// Creates a new matcher instance.
///
/// This will eagerly allocate a fairly large chunk of heap memory (around 135KB
/// currently, but subject to change), so a matcher should be created once and reused
/// rather than constructed repeatedly, such as in a loop.
pub fn new(config: Config) -> Self {
Self {
config,
Expand Down
4 changes: 3 additions & 1 deletion matcher/src/utf32_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ fn has_ascii_graphemes(string: &str) -> bool {
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
pub enum Utf32Str<'a> {
/// A string represented as ASCII encoded bytes.
///
/// Correctness invariant: must only contain valid ASCII (`<= 127`)
Ascii(&'a [u8]),
/// A string represented as an array of Unicode codepoints (basically UTF-32).
Expand Down Expand Up @@ -301,7 +302,8 @@ impl DoubleEndedIterator for Chars<'_> {
/// See the API documentation for [`Utf32Str`] for more detail.
pub enum Utf32String {
/// A string represented as ASCII encoded bytes.
/// Correctness invariant: must only contain valid ASCII (<=127)
///
/// Correctness invariant: must only contain valid ASCII (`<= 127`)
Ascii(Box<str>),
/// A string represented as an array of Unicode codepoints (basically UTF-32).
Unicode(Box<[char]>),
Expand Down

0 comments on commit f16469b

Please sign in to comment.