syntax: factor out common prefixes of alternations

It is generally quite subtle to reason clearly about how this actually helps things in a finite automata based regex engine, but this sort of factoring can lead to lots of improvements: * We do use a bounded backtracker, so "pushing branches" down will help things there, just like it would with a classical backtracker. * It may lead to better literal extraction due to the simpler regex. Whether prefix factoring is really to blame here is somewhat unclear, but some downstream optimizations are more brittle than others. For example, the "reverse inner" optimization requires examining a "top level" concatenation to find literals to search for. By factoring out a common prefix, we potentially expand the number of regexes that have a top-level concat. For example, `\wfoo|\wbar` has no top-level concat but `\w(?:foo|bar)` does. * It should lead to faster matching even in finite automata oriented engines like the PikeVM, and also faster construction of DFAs (lazy or not). Namely, by pushing the branches down, we make it so they are visited less frequently, and thus the constant state shuffling caused by branches is reduced. The prefix extraction could be better, as mentioned in the comments, but this is a good start.
rust-lang · Mar 15, 2023 · a200fe1 · a200fe1
1 parent e8c59f0
commit a200fe1
Show file tree

Hide file tree

Showing 2 changed files with 105 additions and 6 deletions.
diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
@@ -429,12 +429,14 @@ impl Hir {
             return new.pop().unwrap();
         }
         // Now that it's completely flattened, look for the special case of
-        // 'char1|char2|...|charN' and collapse that into a class. Note that we
-        // look for 'char' first and then bytes. The issue here is that if we
-        // find both non-ASCII codepoints and non-ASCII singleton bytes, then
-        // it isn't actually possible to smush them into a single class. So we
-        // look for all chars and then all bytes, and don't handle anything
-        // else.
+        // 'char1|char2|...|charN' and collapse that into a class. Note that
+        // we look for 'char' first and then bytes. The issue here is that if
+        // we find both non-ASCII codepoints and non-ASCII singleton bytes,
+        // then it isn't actually possible to smush them into a single class.
+        // (Because classes are either "all codepoints" or "all bytes." You
+        // can have a class that both matches non-ASCII but valid UTF-8 and
+        // invalid UTF-8.) So we look for all chars and then all bytes, and
+        // don't handle anything else.
         if let Some(singletons) = singleton_chars(&new) {
             let it = singletons
                 .into_iter()
@@ -455,6 +457,14 @@ impl Hir {
         if let Some(cls) = class_bytes(&new) {
             return Hir::class(cls);
         }
+        // Factor out a common prefix if we can, which might potentially
+        // simplify the expression and unlock other optimizations downstream.
+        // It also might generally make NFA matching and DFA construction
+        // faster by reducing the scope of branching in the regex.
+        new = match lift_common_prefix(new) {
+            Ok(hir) => return hir,
+            Err(unchanged) => unchanged,
+        };
         let props = Properties::alternation(&new);
         Hir { kind: HirKind::Alternation(new), props }
     }
@@ -2251,6 +2261,65 @@ fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
     Some(singletons)
 }
 
+/// Looks for a common prefix in the list of alternation branches given. If one
+/// is found, then an equivalent but (hopefully) simplified Hir is returned.
+/// Otherwise, the original given list of branches is returned unmodified.
+///
+/// This is not quite as good as it could be. Right now, it requires that
+/// all branches are 'Concat' expressions. It also doesn't do well with
+/// literals. For example, given 'foofoo|foobar', it will not refactor it to
+/// 'foo(?:foo|bar)' because literals are flattened into their own special
+/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
+/// instead of a string of bytes because of this. Otherwise, handling the
+/// current representation in this routine will be pretty gnarly. Sigh.)
+fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
+    if hirs.len() <= 1 {
+        return Err(hirs);
+    }
+    let mut prefix = match hirs[0].kind() {
+        HirKind::Concat(ref xs) => &**xs,
+        _ => return Err(hirs),
+    };
+    if prefix.is_empty() {
+        return Err(hirs);
+    }
+    for h in hirs.iter().skip(1) {
+        let concat = match h.kind() {
+            HirKind::Concat(ref xs) => xs,
+            _ => return Err(hirs),
+        };
+        let common_len = prefix
+            .iter()
+            .zip(concat.iter())
+            .take_while(|(x, y)| x == y)
+            .count();
+        prefix = &prefix[..common_len];
+        if prefix.is_empty() {
+            return Err(hirs);
+        }
+    }
+    let len = prefix.len();
+    assert_ne!(0, len);
+    let mut prefix_concat = vec![];
+    let mut suffix_alts = vec![];
+    for h in hirs {
+        let mut concat = match h.into_kind() {
+            HirKind::Concat(xs) => xs,
+            // We required all sub-expressions to be
+            // concats above, so we're only here if we
+            // have a concat.
+            _ => unreachable!(),
+        };
+        suffix_alts.push(Hir::concat(concat.split_off(len)));
+        if prefix_concat.is_empty() {
+            prefix_concat = concat;
+        }
+    }
+    let mut concat = prefix_concat;
+    concat.push(Hir::alternation(suffix_alts));
+    Ok(Hir::concat(concat))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
@@ -3428,5 +3428,35 @@ mod tests {
             t("a|b|c|d|e|f|x|y|z"),
             hir_uclass(&[('a', 'f'), ('x', 'z')]),
         );
+        // Tests that we lift common prefixes out of an alternation.
+        assert_eq!(
+            t("[A-Z]foo|[A-Z]quux"),
+            hir_cat(vec![
+                hir_uclass(&[('A', 'Z')]),
+                hir_alt(vec![hir_lit("foo"), hir_lit("quux")]),
+            ]),
+        );
+        assert_eq!(
+            t("[A-Z][A-Z]|[A-Z]quux"),
+            hir_cat(vec![
+                hir_uclass(&[('A', 'Z')]),
+                hir_alt(vec![hir_uclass(&[('A', 'Z')]), hir_lit("quux")]),
+            ]),
+        );
+        assert_eq!(
+            t("[A-Z][A-Z]|[A-Z][A-Z]quux"),
+            hir_cat(vec![
+                hir_uclass(&[('A', 'Z')]),
+                hir_uclass(&[('A', 'Z')]),
+                hir_alt(vec![Hir::empty(), hir_lit("quux")]),
+            ]),
+        );
+        assert_eq!(
+            t("[A-Z]foo|[A-Z]foobar"),
+            hir_cat(vec![
+                hir_uclass(&[('A', 'Z')]),
+                hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]),
+            ]),
+        );
     }
 }