From a3723ac132540ef51a7e99d3598a6d6607ed3975 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 12 Sep 2016 16:16:23 -0400 Subject: [PATCH] Disable all literal optimizations when a pattern is partially anchored. When a pattern is partially anchored and literal prefixes are detected, we would scan for those prefixes like normal. On a match, we'd verify whether the text starting at that prefix matched the rest of the regex. This process doesn't work that well for partially anchored regexes, since the prefix match presupposes that the regex is allowed to match at that position. But if, say, a regex like `^z|a` finds a `z` in the middle of the string, then it has lost the fact that `z` needs to appear at the beginning of the string, and can therefore falsely report a match. We could spend some effort and make this case work, but the literal optimizer is already too complex. We need to simplify it to make future optimizations like this possible. Fixes #280. --- src/compile.rs | 6 ------ src/exec.rs | 16 +++++++++++++--- src/prog.rs | 8 -------- tests/macros.rs | 11 +++++++++++ tests/regression.rs | 4 ++++ 5 files changed, 28 insertions(+), 17 deletions(-) diff --git a/src/compile.rs b/src/compile.rs index 32d1f6ac9b..9db743f489 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -143,9 +143,7 @@ impl Compiler { // matching engine itself. let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; self.compiled.is_anchored_start = expr.is_anchored_start(); - self.compiled.has_anchored_start = expr.has_anchored_start(); self.compiled.is_anchored_end = expr.is_anchored_end(); - self.compiled.has_anchored_end = expr.has_anchored_end(); if self.compiled.needs_dotstar() { dotstar_patch = try!(self.c_dotstar()); self.compiled.start = dotstar_patch.entry; @@ -173,10 +171,6 @@ impl Compiler { exprs.iter().all(|e| e.is_anchored_start()); self.compiled.is_anchored_end = exprs.iter().all(|e| e.is_anchored_end()); - self.compiled.has_anchored_start = - exprs.iter().any(|e| e.has_anchored_start()); - self.compiled.has_anchored_end = - exprs.iter().any(|e| e.has_anchored_end()); let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; if self.compiled.needs_dotstar() { dotstar_patch = try!(self.c_dotstar()); diff --git a/src/exec.rs b/src/exec.rs index e46ecfb6c2..62b0f0e2a7 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -222,6 +222,12 @@ impl ExecBuilder { .allow_bytes(!self.only_utf8); let expr = try!(parser.parse(pat)); bytes = bytes || expr.has_bytes(); + + if !expr.is_anchored_start() && expr.has_anchored_start() { + // Partial anchors unfortunately make it hard to use prefixes, + // so disable them. + prefixes = None; + } prefixes = prefixes.and_then(|mut prefixes| { if !prefixes.union_prefixes(&expr) { None @@ -229,6 +235,12 @@ impl ExecBuilder { Some(prefixes) } }); + + if !expr.is_anchored_end() && expr.has_anchored_end() { + // Partial anchors unfortunately make it hard to use suffixes, + // so disable them. + suffixes = None; + } suffixes = suffixes.and_then(|mut suffixes| { if !suffixes.union_suffixes(&expr) { None @@ -1114,9 +1126,7 @@ impl ExecReadOnly { // create two sets of literals: all of them and then the subset that // aren't anchored. We would then only search for all of them when at // the beginning of the input and use the subset in all other cases. - if self.res.len() == 1 - && !self.nfa.has_anchored_start - && !self.nfa.has_anchored_end { + if self.res.len() == 1 { if self.nfa.prefixes.complete() { return if self.nfa.is_anchored_start { Literal(MatchLiteralType::AnchoredStart) diff --git a/src/prog.rs b/src/prog.rs index 41ebde009b..36f2aff879 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -52,12 +52,6 @@ pub struct Program { pub is_anchored_start: bool, /// Whether the regex must match at the end of the input. pub is_anchored_end: bool, - /// Whether the regex has at least one matchable sub-expression that must - /// match from the start of the input. - pub has_anchored_start: bool, - /// Whether the regex has at least one matchable sub-expression that must - /// match at the end of the input. - pub has_anchored_end: bool, /// Whether this program contains a Unicode word boundary instruction. pub has_unicode_word_boundary: bool, /// A possibly empty machine for very quickly matching prefix literals. @@ -97,8 +91,6 @@ impl Program { is_reverse: false, is_anchored_start: false, is_anchored_end: false, - has_anchored_start: false, - has_anchored_end: false, has_unicode_word_boundary: false, prefixes: LiteralSearcher::empty(), dfa_size_limit: 2 * (1<<20), diff --git a/tests/macros.rs b/tests/macros.rs index f9e8912630..34627cf260 100644 --- a/tests/macros.rs +++ b/tests/macros.rs @@ -8,6 +8,17 @@ macro_rules! findall { // Macros for automatically producing tests. +macro_rules! ismatch { + ($name:ident, $re:expr, $text:expr, $ismatch:expr) => { + #[test] + fn $name() { + let text = text!($text); + let re = regex!($re); + assert!($ismatch == re.is_match(text)); + } + }; +} + macro_rules! mat( ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => ( #[test] diff --git a/tests/regression.rs b/tests/regression.rs index 3b7a1fe917..68204d9f7c 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -64,3 +64,7 @@ matiter!(partial_anchor, u!(r"^a|b"), "ba", (0, 1)); // See: https://github.com/rust-lang-nursery/regex/issues/264 mat!(ascii_boundary_no_capture, u!(r"(?-u)\B"), "\u{28f3e}", Some((0, 0))); mat!(ascii_boundary_capture, u!(r"(?-u)(\B)"), "\u{28f3e}", Some((0, 0))); + +// See: https://github.com/rust-lang-nursery/regex/issues/280 +ismatch!(partial_anchor_alternate_begin, u!(r"^a|z"), "yyyyya", false); +ismatch!(partial_anchor_alternate_end, u!(r"a$|z"), "ayyyyy", false);