-
Notifications
You must be signed in to change notification settings - Fork 33
/
Copy pathpattern.rs
566 lines (538 loc) · 20.5 KB
/
pattern.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
//! This module provides a slightly higher level API for matching strings.
use std::cmp::Reverse;
use crate::{chars, Matcher, Utf32Str};
#[cfg(test)]
mod tests;
use crate::Utf32String;
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
#[non_exhaustive]
/// How to treat a case mismatch between two characters.
pub enum CaseMatching {
/// Characters never match their case folded version (`a != A`).
#[cfg_attr(not(feature = "unicode-casefold"), default)]
Respect,
/// Characters always match their case folded version (`a == A`).
#[cfg(feature = "unicode-casefold")]
Ignore,
/// Acts like [`Ignore`](CaseMatching::Ignore) if all characters in a pattern atom are
/// lowercase and like [`Respect`](CaseMatching::Respect) otherwise.
#[default]
#[cfg(feature = "unicode-casefold")]
Smart,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
#[non_exhaustive]
/// How to handle unicode normalization,
pub enum Normalization {
/// Characters never match their normalized version (`a != ä`).
#[cfg_attr(not(feature = "unicode-normalization"), default)]
Never,
/// Acts like [`Never`](Normalization::Never) if any character in a pattern atom
/// would need to be normalized. Otherwise normalization occurs (`a == ä` but `ä != a`).
#[default]
#[cfg(feature = "unicode-normalization")]
Smart,
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[non_exhaustive]
/// The kind of matching algorithm to run for an atom.
pub enum AtomKind {
/// Fuzzy matching where the needle must match any haystack characters
/// (match can contain gaps). This atom kind is used by default if no
/// special syntax is used. There is no negated fuzzy matching (too
/// many false positives).
///
/// See also [`Matcher::fuzzy_match`](crate::Matcher::fuzzy_match).
Fuzzy,
/// The needle must match a contiguous sequence of haystack characters
/// without gaps. This atom kind is parsed from the following syntax:
/// `'foo` and `!foo` (negated).
///
/// See also [`Matcher::substring_match`](crate::Matcher::substring_match).
Substring,
/// The needle must match all leading haystack characters without gaps or
/// prefix. This atom kind is parsed from the following syntax: `^foo` and
/// `!^foo` (negated).
///
/// See also [`Matcher::prefix_match`](crate::Matcher::prefix_match).
Prefix,
/// The needle must match all trailing haystack characters without gaps or
/// postfix. This atom kind is parsed from the following syntax: `foo$` and
/// `!foo$` (negated).
///
/// See also [`Matcher::postfix_match`](crate::Matcher::postfix_match).
Postfix,
/// The needle must match all haystack characters without gaps or prefix.
/// This atom kind is parsed from the following syntax: `^foo$` and `!^foo$`
/// (negated).
///
/// See also [`Matcher::exact_match`](crate::Matcher::exact_match).
Exact,
}
/// A single pattern component that is matched with a single [`Matcher`] function
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Atom {
/// Whether this pattern atom is a negative match.
/// A negative pattern atom will prevent haystacks matching it from
/// being matchend. It does not contribute to scoring/indices
pub negative: bool,
/// The kind of match that this pattern performs
pub kind: AtomKind,
needle: Utf32String,
ignore_case: bool,
normalize: bool,
}
impl Atom {
/// Creates a single [`Atom`] from a string by performing unicode
/// normalization and case folding (if necessary). Optionally `\ ` can
/// be escaped to ` `.
pub fn new(
needle: &str,
case: CaseMatching,
normalize: Normalization,
kind: AtomKind,
escape_whitespace: bool,
) -> Atom {
Atom::new_inner(needle, case, normalize, kind, escape_whitespace, false)
}
fn new_inner(
needle: &str,
case: CaseMatching,
normalization: Normalization,
kind: AtomKind,
escape_whitespace: bool,
append_dollar: bool,
) -> Atom {
let mut ignore_case;
let mut normalize;
#[cfg(feature = "unicode-normalization")]
{
normalize = matches!(normalization, Normalization::Smart);
}
#[cfg(not(feature = "unicode-normalization"))]
{
normalize = false;
}
let needle = if needle.is_ascii() {
let mut needle = if escape_whitespace {
if let Some((start, rem)) = needle.split_once("\\ ") {
let mut needle = start.to_owned();
for rem in rem.split("\\ ") {
needle.push(' ');
needle.push_str(rem);
}
needle
} else {
needle.to_owned()
}
} else {
needle.to_owned()
};
match case {
#[cfg(feature = "unicode-casefold")]
CaseMatching::Ignore => {
ignore_case = true;
needle.make_ascii_lowercase()
}
#[cfg(feature = "unicode-casefold")]
CaseMatching::Smart => {
ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
}
CaseMatching::Respect => ignore_case = false,
}
if append_dollar {
needle.push('$');
}
Utf32String::Ascii(needle.into_boxed_str())
} else {
let mut needle_ = Vec::with_capacity(needle.len());
#[cfg(feature = "unicode-casefold")]
{
ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart);
}
#[cfg(not(feature = "unicode-casefold"))]
{
ignore_case = false;
}
#[cfg(feature = "unicode-normalization")]
{
normalize = matches!(normalization, Normalization::Smart);
}
if escape_whitespace {
let mut saw_backslash = false;
for mut c in chars::graphemes(needle) {
if saw_backslash {
if c == ' ' {
needle_.push(' ');
saw_backslash = false;
continue;
} else {
needle_.push('\\');
}
}
saw_backslash = c == '\\';
match case {
#[cfg(feature = "unicode-casefold")]
CaseMatching::Ignore => c = chars::to_lower_case(c),
#[cfg(feature = "unicode-casefold")]
CaseMatching::Smart => {
ignore_case = ignore_case && !chars::is_upper_case(c)
}
CaseMatching::Respect => (),
}
match normalization {
#[cfg(feature = "unicode-normalization")]
Normalization::Smart => {
normalize = normalize && chars::normalize(c) == c;
}
Normalization::Never => (),
}
needle_.push(c);
}
} else {
let chars = chars::graphemes(needle).map(|mut c| {
match case {
#[cfg(feature = "unicode-casefold")]
CaseMatching::Ignore => c = chars::to_lower_case(c),
#[cfg(feature = "unicode-casefold")]
CaseMatching::Smart => {
ignore_case = ignore_case && !chars::is_upper_case(c);
}
CaseMatching::Respect => (),
}
match normalization {
#[cfg(feature = "unicode-normalization")]
Normalization::Smart => {
normalize = normalize && chars::normalize(c) == c;
}
Normalization::Never => (),
}
c
});
needle_.extend(chars);
};
if append_dollar {
needle_.push('$');
}
Utf32String::Unicode(needle_.into_boxed_slice())
};
Atom {
kind,
needle,
negative: false,
ignore_case,
normalize,
}
}
/// Parse a pattern atom from a string. Some special trailing and leading
/// characters can be used to control the atom kind. See [`AtomKind`] for
/// details.
pub fn parse(raw: &str, case: CaseMatching, normalize: Normalization) -> Atom {
let mut atom = raw;
let invert = match atom.as_bytes() {
[b'!', ..] => {
atom = &atom[1..];
true
}
[b'\\', b'!', ..] => {
atom = &atom[1..];
false
}
_ => false,
};
let mut kind = match atom.as_bytes() {
[b'^', ..] => {
atom = &atom[1..];
AtomKind::Prefix
}
[b'\'', ..] => {
atom = &atom[1..];
AtomKind::Substring
}
[b'\\', b'^' | b'\'', ..] => {
atom = &atom[1..];
AtomKind::Fuzzy
}
_ => AtomKind::Fuzzy,
};
let mut append_dollar = false;
match atom.as_bytes() {
[.., b'\\', b'$'] => {
append_dollar = true;
atom = &atom[..atom.len() - 2]
}
[.., b'$'] => {
kind = if kind == AtomKind::Fuzzy {
AtomKind::Postfix
} else {
AtomKind::Exact
};
atom = &atom[..atom.len() - 1]
}
_ => (),
}
if invert && kind == AtomKind::Fuzzy {
kind = AtomKind::Substring
}
let mut pattern = Atom::new_inner(atom, case, normalize, kind, true, append_dollar);
pattern.negative = invert;
pattern
}
/// Matches this pattern against `haystack` (using the allocation and configuration
/// from `matcher`) and calculates a ranking score. See the [`Matcher`].
/// Documentation for more details.
///
/// *Note:* The `ignore_case` setting is overwritten to match the casing of
/// each pattern atom.
pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u16> {
matcher.config.ignore_case = self.ignore_case;
matcher.config.normalize = self.normalize;
let pattern_score = match self.kind {
AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)),
AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)),
AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)),
};
if self.negative {
if pattern_score.is_some() {
return None;
}
Some(0)
} else {
pattern_score
}
}
/// Matches this pattern against `haystack` (using the allocation and
/// configuration from `matcher`), calculates a ranking score and the match
/// indices. See the [`Matcher`]. Documentation for more
/// details.
///
/// *Note:* The `ignore_case` setting is overwritten to match the casing of
/// each pattern atom.
///
/// *Note:* The `indices` vector is not cleared by this function.
pub fn indices(
&self,
haystack: Utf32Str<'_>,
matcher: &mut Matcher,
indices: &mut Vec<u32>,
) -> Option<u16> {
matcher.config.ignore_case = self.ignore_case;
matcher.config.normalize = self.normalize;
if self.negative {
let pattern_score = match self.kind {
AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)),
AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)),
AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)),
};
pattern_score.is_none().then_some(0)
} else {
match self.kind {
AtomKind::Exact => matcher.exact_indices(haystack, self.needle.slice(..), indices),
AtomKind::Fuzzy => matcher.fuzzy_indices(haystack, self.needle.slice(..), indices),
AtomKind::Substring => {
matcher.substring_indices(haystack, self.needle.slice(..), indices)
}
AtomKind::Prefix => {
matcher.prefix_indices(haystack, self.needle.slice(..), indices)
}
AtomKind::Postfix => {
matcher.postfix_indices(haystack, self.needle.slice(..), indices)
}
}
}
}
/// Returns the needle text that is passed to the matcher. All indices
/// produced by the `indices` functions produce char indices used to index
/// this text
pub fn needle_text(&self) -> Utf32Str<'_> {
self.needle.slice(..)
}
/// Convenience function to easily match (and sort) a (relatively small)
/// list of inputs.
///
/// *Note* This function is not recommended for building a full fuzzy
/// matching application that can match large numbers of matches (like all
/// files in a directory) as all matching is done on the current thread,
/// effectively blocking the UI. For such applications the high level
/// `nucleo` crate can be used instead.
pub fn match_list<T: AsRef<str>>(
&self,
items: impl IntoIterator<Item = T>,
matcher: &mut Matcher,
) -> Vec<(T, u16)> {
if self.needle.is_empty() {
return items.into_iter().map(|item| (item, 0)).collect();
}
let mut buf = Vec::new();
let mut items: Vec<_> = items
.into_iter()
.filter_map(|item| {
self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
.map(|score| (item, score))
})
.collect();
items.sort_by_key(|(_, score)| Reverse(*score));
items
}
}
fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ {
let mut saw_backslash = false;
pattern.split(move |c| {
saw_backslash = match c {
c if c.is_whitespace() && !saw_backslash => return true,
'\\' => true,
_ => false,
};
false
})
}
#[derive(Debug, Default)]
/// A text pattern made up of (potentially multiple) [atoms](crate::pattern::Atom).
#[non_exhaustive]
pub struct Pattern {
/// The individual pattern (words) in this pattern
pub atoms: Vec<Atom>,
}
impl Pattern {
/// Creates a pattern where each word is matched individually (whitespaces
/// can be escaped with `\`). Otherwise no parsing is performed (so `$`, `!`,
/// `'` and `^` don't receive special treatment). If you want to match the entire
/// pattern as a single needle use a single [`Atom`] instead.
pub fn new(
pattern: &str,
case_matching: CaseMatching,
normalize: Normalization,
kind: AtomKind,
) -> Pattern {
let atoms = pattern_atoms(pattern)
.filter_map(|pat| {
let pat = Atom::new(pat, case_matching, normalize, kind, true);
(!pat.needle.is_empty()).then_some(pat)
})
.collect();
Pattern { atoms }
}
/// Creates a pattern where each word is matched individually (whitespaces
/// can be escaped with `\`). And `$`, `!`, `'` and `^` at word boundaries will
/// cause different matching behaviour (see [`AtomKind`]). These can be
/// escaped with backslash.
pub fn parse(pattern: &str, case_matching: CaseMatching, normalize: Normalization) -> Pattern {
let atoms = pattern_atoms(pattern)
.filter_map(|pat| {
let pat = Atom::parse(pat, case_matching, normalize);
(!pat.needle.is_empty()).then_some(pat)
})
.collect();
Pattern { atoms }
}
/// Convenience function to easily match (and sort) a (relatively small)
/// list of inputs.
///
/// *Note* This function is not recommended for building a full fuzzy
/// matching application that can match large numbers of matches (like all
/// files in a directory) as all matching is done on the current thread,
/// effectively blocking the UI. For such applications the high level
/// `nucleo` crate can be used instead.
pub fn match_list<T: AsRef<str>>(
&self,
items: impl IntoIterator<Item = T>,
matcher: &mut Matcher,
) -> Vec<(T, u32)> {
if self.atoms.is_empty() {
return items.into_iter().map(|item| (item, 0)).collect();
}
let mut buf = Vec::new();
let mut items: Vec<_> = items
.into_iter()
.filter_map(|item| {
self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
.map(|score| (item, score))
})
.collect();
items.sort_by_key(|(_, score)| Reverse(*score));
items
}
/// Matches this pattern against `haystack` (using the allocation and configuration
/// from `matcher`) and calculates a ranking score. See the [`Matcher`]
/// documentation for more details.
///
/// *Note:* The `ignore_case` setting is overwritten to match the casing of
/// each pattern atom.
pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
if self.atoms.is_empty() {
return Some(0);
}
let mut score = 0;
for pattern in &self.atoms {
score += pattern.score(haystack, matcher)? as u32;
}
Some(score)
}
/// Matches this pattern against `haystack` (using the allocation and
/// configuration from `matcher`), calculates a ranking score and the match
/// indices. See the [`Matcher`] documentation for more
/// details.
///
/// *Note:* The `ignore_case` setting is overwritten to match the casing of
/// each pattern atom.
///
/// *Note:* The indices for each pattern are calculated individually
/// and simply appended to the `indices` vector and not deduplicated/sorted.
/// This allows associating the match indices to their source pattern. If
/// required (like for highlighting) unique/sorted indices can be obtained
/// as follows:
///
/// ```
/// # let mut indices: Vec<u32> = Vec::new();
/// indices.sort_unstable();
/// indices.dedup();
/// ```
pub fn indices(
&self,
haystack: Utf32Str<'_>,
matcher: &mut Matcher,
indices: &mut Vec<u32>,
) -> Option<u32> {
if self.atoms.is_empty() {
return Some(0);
}
let mut score = 0;
for pattern in &self.atoms {
score += pattern.indices(haystack, matcher, indices)? as u32;
}
Some(score)
}
/// Refreshes this pattern by reparsing it from a string. This is mostly
/// equivalent to just constructing a new pattern using [`Pattern::parse`]
/// but is slightly more efficient by reusing some allocations
pub fn reparse(
&mut self,
pattern: &str,
case_matching: CaseMatching,
normalize: Normalization,
) {
self.atoms.clear();
let atoms = pattern_atoms(pattern).filter_map(|atom| {
let atom = Atom::parse(atom, case_matching, normalize);
if atom.needle.is_empty() {
return None;
}
Some(atom)
});
self.atoms.extend(atoms);
}
}
impl Clone for Pattern {
fn clone(&self) -> Self {
Self {
atoms: self.atoms.clone(),
}
}
fn clone_from(&mut self, source: &Self) {
self.atoms.clone_from(&source.atoms);
}
}