From 918d4a0cddce47f9fd69cda2ad328f206597ba90 Mon Sep 17 00:00:00 2001 From: Ethan Pailes Date: Fri, 27 Oct 2017 10:50:06 -0400 Subject: [PATCH] search: skip dfa for anchored pats with captures The DFA can't produce captures, but is still faster than the Pike VM NFA, so the normal approach to finding capture groups is to look for the entire match with the DFA and then run the NFA on the substring of the input that matched. In cases where the regex is anchored, the match always starts at the beginning of the input, so there is never any point in trying the DFA first. The DFA can still be useful for rejecting inputs which are not in the language of the regular expression, but anchored regexes with capture groups are most commonly used in a parsing context, so it seems like a fair trade-off. Fixes #348 --- bench/src/bench.rs | 35 ++++++++++++++++++++ bench/src/misc.rs | 82 ++++++++++++++++++++++++++++++++++++++++++++++ src/exec.rs | 14 +++++--- 3 files changed, 126 insertions(+), 5 deletions(-) diff --git a/bench/src/bench.rs b/bench/src/bench.rs index 6ddadec8c9..319ea5f7a8 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -225,6 +225,41 @@ macro_rules! bench_find { } } +// USAGE: bench_captures!(name, pattern, groups, haystack); +// +// CONTRACT: +// Given: +// name : ident, the desired benchmarking function name +// pattern : ::Regex, the regular expression to be executed +// groups : usize, the number of capture groups +// haystack : String, the string to search +// bench_captures will benchmark how fast re.captures() produces +// the capture groups in question. +macro_rules! bench_captures { + ($name:ident, $pattern:expr, $count:expr, $haystack:expr) => { + + #[cfg(feature = "re-rust")] + #[bench] + fn $name(b: &mut Bencher) { + use std::sync::Mutex; + + lazy_static! 
{ + static ref RE: Mutex = Mutex::new($pattern); + static ref TEXT: Mutex = Mutex::new(text!($haystack)); + }; + let re = RE.lock().unwrap(); + let text = TEXT.lock().unwrap(); + b.bytes = text.len() as u64; + b.iter(|| { + match re.captures(&text) { + None => assert!(false, "no captures"), + Some(caps) => assert_eq!($count + 1, caps.len()), + } + }); + } + } +} + mod ffi; mod misc; mod regexdna; diff --git a/bench/src/misc.rs b/bench/src/misc.rs index edb274e9c7..08d07a30ad 100644 --- a/bench/src/misc.rs +++ b/bench/src/misc.rs @@ -190,3 +190,85 @@ macro_rules! reallyhard2 { () => (r"\w+\s+Holmes") } bench_match!(reallyhard2_1K, reallyhard2!(), get_text(TXT_1K, reallyhard2_suffix())); + + +// +// Benchmarks to justify the short-haystack NFA fallthrough optimization +// implemented by `read_captures_at` in regex/src/exec.rs. See github issue +// #348. +// +// The procedure used to try to determine the right hardcoded cutoff +// for the short-haystack optimization in issue #348 is as follows. +// +// ``` +// > cd bench +// > cargo bench --features re-rust short_hay | tee dfa-nfa.res +// > # modify the `MatchType::Dfa` branch in exec.rs:read_captures_at +// > # to just execute the nfa +// > cargo bench --features re-rust short_hay | tee nfa-only.res +// > cargo benchcmp dfa-nfa.res nfa-only.res +// ``` +// +// The expected result is that short inputs will go faster under +// the nfa-only mode, but at some turnover point the dfa-nfa mode +// will start to win again. Unfortunately, that is not what happened. +// Instead there was no noticeable change in the bench results, so +// I've opted to just do the more conservative anchor optimization. 
+// +bench_captures!(short_haystack_1x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + String::from("aaaabbbbccccbbbdddd")); +bench_captures!(short_haystack_2x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(2).collect::(), + repeat("dddd").take(2).collect::(), + )); +bench_captures!(short_haystack_3x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(3).collect::(), + repeat("dddd").take(3).collect::(), + )); +bench_captures!(short_haystack_4x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(4).collect::(), + repeat("dddd").take(4).collect::(), + )); +bench_captures!(short_haystack_10x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(10).collect::(), + repeat("dddd").take(10).collect::(), + )); +bench_captures!(short_haystack_100x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(100).collect::(), + repeat("dddd").take(100).collect::(), + )); +bench_captures!(short_haystack_1000x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(1000).collect::(), + repeat("dddd").take(1000).collect::(), + )); +bench_captures!(short_haystack_10000x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(10000).collect::(), + repeat("dddd").take(10000).collect::(), + )); +bench_captures!(short_haystack_100000x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(100000).collect::(), + repeat("dddd").take(100000).collect::(), + )); +bench_captures!(short_haystack_1000000x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(1000000).collect::(), + repeat("dddd").take(1000000).collect::(), + )); diff --git a/src/exec.rs b/src/exec.rs index 458e47d3b0..d12a725cf0 
100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -554,12 +554,16 @@ impl<'c> RegularExpression for ExecNoSync<'c> { }) } MatchType::Dfa => { - match self.find_dfa_forward(text, start) { - dfa::Result::Match((s, e)) => { - self.captures_nfa_with_match(slots, text, s, e) + if self.ro.nfa.is_anchored_start { + self.captures_nfa(slots, text, start) + } else { + match self.find_dfa_forward(text, start) { + dfa::Result::Match((s, e)) => { + self.captures_nfa_with_match(slots, text, s, e) + } + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.captures_nfa(slots, text, start), } - dfa::Result::NoMatch(_) => None, - dfa::Result::Quit => self.captures_nfa(slots, text, start), } } MatchType::DfaAnchoredReverse => {