Skip to content

Commit

Permalink
Don't use dfa for anchored strings with captures
Browse files Browse the repository at this point in the history
The DFA can't produce captures, but is still faster
than the Pike VM NFA, so the normal approach to finding
capture groups is to look for the entire match with the
DFA and then run the NFA on the substring of the input
that matched. In cases where the regex is anchored, the
match always starts at the beginning of the input, so
there is never any point to trying the DFA first.

The DFA can still be useful for rejecting inputs which
are not in the language of the regular expression, but
anchored regexes with capture groups are most commonly
used in a parsing context, so it seems like a fair trade-off.

For a more in-depth discussion, see GitHub issue #348.
  • Loading branch information
Ethan Pailes committed Oct 27, 2017
1 parent 57426f6 commit 526bc6b
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 5 deletions.
35 changes: 35 additions & 0 deletions bench/src/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,41 @@ macro_rules! bench_find {
}
}

// USAGE: bench_captures!(name, pattern, groups, haystack);
//
// CONTRACT:
//   Given:
//     name     : ident, the desired benchmarking function name
//     pattern  : ::Regex, the regular expression to be executed
//     groups   : usize, the number of capture groups
//     haystack : String, the string to search
//   bench_captures will benchmark how fast re.captures() produces
//   the capture groups in question.
//
// Every iteration panics if the regex fails to match, or if the match
// does not report exactly `groups + 1` groups. The `+ 1` accounts for
// the whole-match group that `Captures::len` counts in addition to the
// explicit groups in the pattern.
//
// The generated function is compiled out when the `re-pcre1` feature
// is enabled.
macro_rules! bench_captures {
    ($name:ident, $pattern:expr, $count:expr, $haystack:expr) => {

        #[cfg(not(feature = "re-pcre1"))]
        #[bench]
        fn $name(b: &mut Bencher) {
            use std::sync::Mutex;

            // The regex and the haystack are built once, on first use,
            // and not inside the timed closure below.
            lazy_static! {
                static ref RE: Mutex<Regex> = Mutex::new($pattern);
                static ref TEXT: Mutex<Text> = Mutex::new(text!($haystack));
            };
            let re = RE.lock().unwrap();
            let text = TEXT.lock().unwrap();
            // Tell the harness how many bytes each iteration processes.
            b.bytes = text.len() as u64;
            b.iter(|| {
                match re.captures(&text) {
                    // A miss means the benchmark itself is misconfigured,
                    // so fail loudly instead of timing a non-match.
                    None => panic!("no captures"),
                    Some(caps) => assert_eq!($count + 1, caps.len()),
                }
            });
        }
    }
}

mod ffi;
mod misc;
mod regexdna;
Expand Down
82 changes: 82 additions & 0 deletions bench/src/misc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,3 +191,85 @@ macro_rules! reallyhard2 { () => (r"\w+\s+Holmes") }

bench_match!(reallyhard2_1K, reallyhard2!(),
get_text(TXT_1K, reallyhard2_suffix()));


//
// Benchmarks to justify the short-haystack NFA fallthrough optimization
// implemented by `read_captures_at` in regex/src/exec.rs. See github issue
// #348.
//
// The procedure used to try to determine the right hardcoded cutoff
// for the short-haystack optimization in issue #348 is as follows.
//
// ```
// > cd bench
// > cargo bench --features re-rust short_hay | tee dfa-nfa.res
// > # modify the `MatchType::Dfa` branch in exec.rs:read_captures_at
// > # to just execute the nfa
// > cargo bench --features re-rust short_hay | tee nfa-only.res
// > cargo benchcmp dfa-nfa.res nfa-only.res
// ```
//
// The expected result is that short inputs will go faster under
// the nfa-only mode, but at some turnover point the dfa-nfa mode
// will start to win again. Unfortunately, that is not what happened.
// Instead there was no noticeable change in the bench results, so
// I've opted to just do the more conservative anchor optimization.
//
// Every benchmark below runs the same pattern, `(bbbb)cccc(bbb)`, which has
// two capture groups (hence the `2` argument). The haystack always has the
// shape
//
//     "aaaa" * N  +  "bbbbccccbbb"  +  "dddd" * N
//
// where N is the number in the benchmark's `_Nx` suffix, so the only thing
// that varies between benchmarks is how much non-matching text surrounds
// the single match.

// N = 1, written out as a literal.
bench_captures!(short_haystack_1x,
Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
String::from("aaaabbbbccccbbbdddd"));
// N = 2
bench_captures!(short_haystack_2x,
Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
format!("{}bbbbccccbbb{}",
repeat("aaaa").take(2).collect::<String>(),
repeat("dddd").take(2).collect::<String>(),
));
// N = 3
bench_captures!(short_haystack_3x,
Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
format!("{}bbbbccccbbb{}",
repeat("aaaa").take(3).collect::<String>(),
repeat("dddd").take(3).collect::<String>(),
));
// N = 4
bench_captures!(short_haystack_4x,
Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
format!("{}bbbbccccbbb{}",
repeat("aaaa").take(4).collect::<String>(),
repeat("dddd").take(4).collect::<String>(),
));
// From here on N grows by powers of ten.
// N = 10
bench_captures!(short_haystack_10x,
Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
format!("{}bbbbccccbbb{}",
repeat("aaaa").take(10).collect::<String>(),
repeat("dddd").take(10).collect::<String>(),
));
// N = 100
bench_captures!(short_haystack_100x,
Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
format!("{}bbbbccccbbb{}",
repeat("aaaa").take(100).collect::<String>(),
repeat("dddd").take(100).collect::<String>(),
));
// N = 1_000
bench_captures!(short_haystack_1000x,
Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
format!("{}bbbbccccbbb{}",
repeat("aaaa").take(1000).collect::<String>(),
repeat("dddd").take(1000).collect::<String>(),
));
// N = 10_000
bench_captures!(short_haystack_10000x,
Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
format!("{}bbbbccccbbb{}",
repeat("aaaa").take(10000).collect::<String>(),
repeat("dddd").take(10000).collect::<String>(),
));
// N = 100_000
bench_captures!(short_haystack_100000x,
Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
format!("{}bbbbccccbbb{}",
repeat("aaaa").take(100000).collect::<String>(),
repeat("dddd").take(100000).collect::<String>(),
));
// N = 1_000_000
bench_captures!(short_haystack_1000000x,
Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
format!("{}bbbbccccbbb{}",
repeat("aaaa").take(1000000).collect::<String>(),
repeat("dddd").take(1000000).collect::<String>(),
));
14 changes: 9 additions & 5 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -554,12 +554,16 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
})
}
MatchType::Dfa => {
match self.find_dfa_forward(text, start) {
dfa::Result::Match((s, e)) => {
self.captures_nfa_with_match(slots, text, s, e)
if self.ro.nfa.is_anchored_start {
self.captures_nfa(slots, text, start)
} else {
match self.find_dfa_forward(text, start) {
dfa::Result::Match((s, e)) => {
self.captures_nfa_with_match(slots, text, s, e)
}
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.captures_nfa(slots, text, start),
}
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.captures_nfa(slots, text, start),
}
}
MatchType::DfaAnchoredReverse => {
Expand Down

0 comments on commit 526bc6b

Please sign in to comment.