From 4fab6c127eb5b9ee8ab6819983351736d2def0fa Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 16 Apr 2016 11:46:29 -0400 Subject: [PATCH] Add RE2 and TCL to the benchmark harness. This also adds a new utility, regex-run-one, to the benchmark suite. This utility is a CLI tool that lets one count the number of regex matches for any of the regex engines in the benchmark harness. e.g., regex-run-one tcl '\w{5}z\w{5}' my-file Will count the number of times the regex '\w{5}z\w{5}' matches in my-file. Supported engines are: pcre1, pcre2, onig, re2, rust, rust-bytes and tcl. --- .travis.yml | 3 +- benches/Cargo.toml | 62 +- benches/build.rs | 33 +- benches/compile | 6 + benches/log/.gitignore | 1 + benches/run | 42 + benches/src/bench.rs | 241 +++++ benches/src/bench_rust.rs | 30 - benches/src/bench_rust_bytes.rs | 28 - benches/src/data/.gitignore | 3 + benches/src/{ => data}/1K.txt | 0 benches/src/{ => data}/1MB.txt | 0 benches/src/{ => data}/32.txt | 0 benches/src/{ => data}/32K.txt | 0 .../sherlock.txt} | 0 .../src/{bench_rust_plugin.rs => ffi/mod.rs} | 24 +- benches/src/{bench_onig.rs => ffi/onig.rs} | 41 +- benches/src/{bench_pcre.rs => ffi/pcre1.rs} | 61 +- benches/src/{bench_pcre2.rs => ffi/pcre2.rs} | 218 ++--- benches/src/ffi/re2.cpp | 50 + benches/src/ffi/re2.h | 873 ++++++++++++++++++ benches/src/ffi/re2.rs | 155 ++++ benches/src/ffi/tcl.rs | 241 +++++ benches/src/main.rs | 157 ++++ benches/src/misc.rs | 178 ++-- benches/src/sherlock.rs | 175 ++-- run-bench | 60 -- src/exec.rs | 12 +- 28 files changed, 2131 insertions(+), 563 deletions(-) create mode 100755 benches/compile create mode 100644 benches/log/.gitignore create mode 100755 benches/run create mode 100644 benches/src/bench.rs delete mode 100644 benches/src/bench_rust.rs delete mode 100644 benches/src/bench_rust_bytes.rs create mode 100644 benches/src/data/.gitignore rename benches/src/{ => data}/1K.txt (100%) rename benches/src/{ => data}/1MB.txt (100%) rename benches/src/{ => data}/32.txt 
(100%) rename benches/src/{ => data}/32K.txt (100%) rename benches/src/{the-adventures-of-sherlock-holmes.txt => data/sherlock.txt} (100%) rename benches/src/{bench_rust_plugin.rs => ffi/mod.rs} (57%) rename benches/src/{bench_onig.rs => ffi/onig.rs} (54%) rename benches/src/{bench_pcre.rs => ffi/pcre1.rs} (80%) rename benches/src/{bench_pcre2.rs => ffi/pcre2.rs} (78%) create mode 100644 benches/src/ffi/re2.cpp create mode 100644 benches/src/ffi/re2.h create mode 100644 benches/src/ffi/re2.rs create mode 100644 benches/src/ffi/tcl.rs create mode 100644 benches/src/main.rs delete mode 100755 run-bench diff --git a/.travis.yml b/.travis.yml index 0a110a9452..36babbfaea 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,8 +20,7 @@ script: - if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then travis_wait ./run-bench rust; travis_wait ./run-bench rust-bytes --no-run; - travis_wait ./run-bench rust-plugin --no-run; - travis_wait ./run-bench pcre --no-run; + travis_wait ./run-bench pcre1 --no-run; travis_wait ./run-bench onig --no-run; travis_wait cargo test --verbose --manifest-path=regex_macros/Cargo.toml; fi diff --git a/benches/Cargo.toml b/benches/Cargo.toml index d69c9d9695..3155cd804c 100644 --- a/benches/Cargo.toml +++ b/benches/Cargo.toml @@ -8,74 +8,58 @@ repository = "https://github.com/rust-lang/regex" documentation = "http://doc.rust-lang.org/regex/regex_syntax/index.html" homepage = "https://github.com/rust-lang/regex" description = "Regex benchmarks for Rust's and other engines." - build = "build.rs" [dependencies] +docopt = "0.6" lazy_static = "0.1" libc = "0.2" onig = { version = "0.4", optional = true } libpcre-sys = { version = "0.2", optional = true } +memmap = "0.2" regex = { version = "0.1", path = ".." 
} regex_macros = { version = "0.1", path = "../regex_macros", optional = true } regex-syntax = { version = "0.3", path = "../regex-syntax" } +rustc-serialize = "0.3" [build-dependencies] +gcc = "0.3" pkg-config = "0.3" +[[bin]] +name = "regex-run-one" +path = "src/main.rs" +bench = false + # Use features to conditionally compile benchmarked regexes, since not every # regex works on every engine. Additionally, it is useful to be able to build # each benchmark individually, so that not all dependencies are required to # run only one benchmark. +# +# Note that when running benchmarks, only ONE feature should be set at a time. +# Doing anything else will probably result in weird "duplicate definition" +# compiler errors. +# +# Tip: use the run-bench script in the root of this repository to run +# benchmarks. [features] -re-pcre = ["libpcre-sys"] +re-pcre1 = ["libpcre-sys"] re-pcre2 = [] re-onig = ["onig"] +re-re2 = [] re-rust = [] re-rust-bytes = [] re-rust-plugin = ["regex_macros"] +re-tcl = [] -# Run the benchmarks on the default behavior of Regex::new. -[[bench]] -name = "rust" -path = "src/bench_rust.rs" -test = false -bench = true - -# Run the benchmarks on the default behavior of bytes::Regex::new. -[[bench]] -name = "rust-bytes" -path = "src/bench_rust_bytes.rs" -test = false -bench = true - -# Run the benchmarks on the default behavior of the `regex!` compiler plugin. -[[bench]] -name = "rust-plugin" -path = "src/bench_rust_plugin.rs" -test = false -bench = true - -# Run the benchmarks on PCRE. [[bench]] -name = "pcre" -path = "src/bench_pcre.rs" +name = "bench" +path = "src/bench.rs" test = false bench = true -# Run the benchmarks on PCRE2. -[[bench]] -name = "pcre2" -path = "src/bench_pcre2.rs" -test = false -bench = true - -# Run the benchmarks on Oniguruma. 
-[[bench]] -name = "onig" -path = "src/bench_onig.rs" -test = false -bench = true +[profile.release] +debug = true [profile.bench] debug = true diff --git a/benches/build.rs b/benches/build.rs index 9ad6852d46..628c2b25d6 100644 --- a/benches/build.rs +++ b/benches/build.rs @@ -8,13 +8,12 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +extern crate gcc; extern crate pkg_config; use std::env; use std::process; -use pkg_config::Config; - macro_rules! we { ($($tt:tt)*) => {{ use std::io::Write; @@ -23,11 +22,37 @@ macro_rules! we { } fn main() { + // We only need to look for PCRE2 and RE2 because we roll the FFI bindings + // for those libraries ourselves from scratch. For PCRE1 and Oniguruma, we + // rely on other crates that do something similar to the dance below for + // us. + let wants_pcre2 = env::var("CARGO_FEATURE_RE_PCRE2").is_ok(); - let has_pcre2 = - Config::new().atleast_version("10.21").find("libpcre2-8").is_ok(); + let has_pcre2 = pkg_config::Config::new().find("libpcre2-8").is_ok(); if wants_pcre2 && !has_pcre2 { we!("pcre2 cannot be found by pkg-config"); process::exit(1); } + + let wants_re2 = env::var("CARGO_FEATURE_RE_RE2").is_ok(); + let has_re2 = pkg_config::Config::new().find("re2").is_ok(); + if wants_re2 { + if !has_re2 { + we!("re2 cannot be found by pkg-config"); + process::exit(1); + } + gcc::Config::new() + .cpp(true) + .flag("-std=c++11") + .file("src/ffi/re2.cpp") + .compile("libcre2.a"); + println!("cargo:rustc-link-lib=re2"); + } + + let wants_tcl = env::var("CARGO_FEATURE_RE_TCL").is_ok(); + let has_tcl = pkg_config::Config::new().find("tcl").is_ok(); + if wants_tcl && !has_tcl { + we!("tcl cannot be found by pkg-config"); + process::exit(1); + } } diff --git a/benches/compile b/benches/compile new file mode 100755 index 0000000000..8825ad1955 --- /dev/null +++ b/benches/compile @@ -0,0 +1,6 @@ +#!/bin/sh + +exec cargo build \ + --release \ + --features 're-onig re-pcre1 re-pcre2 
re-re2 re-rust re-rust-bytes re-tcl' \ + "$@" diff --git a/benches/log/.gitignore b/benches/log/.gitignore new file mode 100644 index 0000000000..a9a5aecf42 --- /dev/null +++ b/benches/log/.gitignore @@ -0,0 +1 @@ +tmp diff --git a/benches/run b/benches/run new file mode 100755 index 0000000000..79feab05ce --- /dev/null +++ b/benches/run @@ -0,0 +1,42 @@ +#!/bin/bash + +usage() { + echo "Usage: $(basename $0) [rust | rust-bytes | rust-plugin | pcre1 | pcre2 | re2 | onig | tcl ]" >&2 + exit 1 +} + +if [ $# = 0 ] || [ $1 = '-h' ] || [ $1 = '--help' ]; then + usage +fi + +which="$1" +shift +case $which in + rust) + exec cargo bench --bench bench --features re-rust "$@" + ;; + rust-bytes) + exec cargo bench --bench bench --features re-rust-bytes "$@" + ;; + rust-plugin) + exec cargo bench --bench bench --features re-rust-plugin "$@" + ;; + re2) + exec cargo bench --bench bench --features re-re2 "$@" + ;; + pcre1) + exec cargo bench --bench bench --features re-pcre1 "$@" + ;; + pcre2) + exec cargo bench --bench bench --features re-pcre2 "$@" + ;; + onig) + exec cargo bench --bench bench --features re-onig "$@" + ;; + tcl) + exec cargo bench --bench bench --features re-tcl "$@" + ;; + *) + usage + ;; +esac diff --git a/benches/src/bench.rs b/benches/src/bench.rs new file mode 100644 index 0000000000..d0804d42bf --- /dev/null +++ b/benches/src/bench.rs @@ -0,0 +1,241 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// Enable the benchmarking harness. +#![feature(test)] + +// If we're benchmarking the Rust regex plugin, then pull that in. +// This will bring a `regex!` macro into scope. 
+#![cfg_attr(feature = "re-rust-plugin", feature(plugin))] +#![cfg_attr(feature = "re-rust-plugin", plugin(regex_macros))] + +#[macro_use] +extern crate lazy_static; +#[cfg(not(any(feature = "re-rust", feature = "re-rust-bytes")))] +extern crate libc; +#[cfg(feature = "re-pcre1")] +extern crate libpcre_sys; +#[cfg(feature = "re-onig")] +extern crate onig; +#[cfg(any( + feature = "re-rust", + feature = "re-rust-bytes", + feature = "re-rust-plugin", + ))] +extern crate regex; +#[cfg(feature = "re-rust")] +extern crate regex_syntax; +extern crate test; + + +#[cfg(feature = "re-onig")] +pub use ffi::onig::Regex; +#[cfg(feature = "re-pcre1")] +pub use ffi::pcre1::Regex; +#[cfg(feature = "re-pcre2")] +pub use ffi::pcre2::Regex; +#[cfg(feature = "re-re2")] +pub use ffi::re2::Regex; +#[cfg(any(feature = "re-rust", feature = "re-rust-plugin"))] +pub use regex::Regex; +#[cfg(feature = "re-rust-bytes")] +pub use regex::bytes::Regex; +#[cfg(feature = "re-tcl")] +pub use ffi::tcl::Regex; + +// Usage: regex!(pattern) +// +// Builds a ::Regex from a borrowed string. This is used in every regex +// engine except for the Rust plugin, because the plugin itself defines the +// same macro. +// +// Due to macro scoping rules, this definition only applies for the modules +// defined below. Effectively, it allows us to use the same tests for both +// native and dynamic regexes. +#[cfg(not(feature = "re-rust-plugin"))] +macro_rules! regex { + ($re:expr) => { ::Regex::new($re).unwrap() } +} + +// Usage: text!(haystack) +// +// Builds a ::Text from an owned string. +// +// This macro is called on every input searched in every benchmark. It is +// called exactly once per benchmark and its time is not included in the +// benchmark timing. +// +// The text given to the macro is always a String, which is guaranteed to be +// valid UTF-8. +// +// The return type should be an owned value that can deref to whatever the +// regex accepts in its `is_match` and `find_iter` methods. 
+#[cfg(feature = "re-tcl")] +macro_rules! text { + ($text:expr) => {{ + use ffi::tcl::Text; + Text::new($text) + }} +} + +#[cfg(feature = "re-rust-bytes")] +macro_rules! text { + ($text:expr) => {{ + let text: String = $text; + text.into_bytes() + }} +} + +#[cfg(any( + feature = "re-onig", + feature = "re-pcre1", + feature = "re-pcre2", + feature = "re-re2", + feature = "re-rust", + feature = "re-rust-plugin", + ))] +macro_rules! text { + ($text:expr) => { $text } +} + +// The type of the value yielded by the `text!` macro defined above. +#[cfg(feature = "re-tcl")] +type Text = ffi::tcl::Text; +#[cfg(feature = "re-rust-bytes")] +type Text = Vec; +#[cfg(any( + feature = "re-onig", + feature = "re-pcre1", + feature = "re-pcre2", + feature = "re-re2", + feature = "re-rust", + feature = "re-rust-plugin", + ))] +type Text = String; + +// Macros for writing benchmarks easily. We provide macros for benchmarking +// matches, non-matches and for finding all successive non-overlapping matches +// in a string (including a check that the count is correct). + +// USAGE: bench_match!(name, pattern, haystack) +// +// This benchmarks how fast a regular expression can report whether it matches +// a particular haystack. If the regex doesn't match, then the benchmark fails. +// Regexes are compiled exactly once. +// +// name is an identifier for the benchmark. +// +// pattern should be a &'static str representing the regular expression. +// +// haystack should be a String. +macro_rules! bench_match { + ($name:ident, $pattern:expr, $haystack:expr) => { + bench_is_match!($name, true, regex!($pattern), $haystack); + } +} + +// USAGE: bench_not_match!(name, pattern, haystack) +// +// This benchmarks how fast a regular expression can report whether it matches +// a particular haystack. If the regex matches, then the benchmark fails. +// Regexes are compiled exactly once. +// +// name is an identifier for the benchmark. 
+// +// pattern should be a &'static str representing the regular expression. +// +// haystack should be a String. +macro_rules! bench_not_match { + ($name:ident, $pattern:expr, $haystack:expr) => { + bench_is_match!($name, false, regex!($pattern), $haystack); + } +} + +// USAGE: bench_is_match!(name, is_match, regex, haystack) +// +// This benchmarks how fast a regular expression can report whether it matches +// a particular haystack. If the regex match status doesn't match is_match, +// then the benchmark fails. Regexes are compiled exactly once. +// +// name is an identifier for the benchmark. +// +// is_match reports whether the regex is expected to match the haystack or not. +// +// regex should be a ::Regex. +// +// haystack should be a String. +macro_rules! bench_is_match { + ($name:ident, $is_match:expr, $re:expr, $haystack:expr) => { + #[bench] + fn $name(b: &mut Bencher) { + use std::sync::Mutex; + + // Why do we use lazy_static here? It seems sensible to just + // compile a regex outside of the b.iter() call and be done with + // it. However, it seems like Rust's benchmark harness actually + // calls the entire benchmark function multiple times. This doesn't + // factor into the timings reported in the benchmarks, but it does + // make the benchmarks take substantially longer to run because + // they're spending a lot of time recompiling regexes. + lazy_static! { + static ref RE: Mutex = Mutex::new($re); + static ref TEXT: Mutex = Mutex::new(text!($haystack)); + }; + let re = RE.lock().unwrap(); + let text = TEXT.lock().unwrap(); + b.bytes = text.len() as u64; + b.iter(|| { + if re.is_match(&text) != $is_match { + if $is_match { + panic!("expected match, got not match"); + } else { + panic!("expected no match, got match"); + } + } + }); + } + } +} + +// USAGE: bench_find!(name, pattern, count, haystack) +// +// This benchmarks how fast a regular expression can count all successive +// non-overlapping matches in haystack. 
If the count reported does not match +// the count given, then the benchmark fails. +// +// name is an identifier for the benchmark. +// +// pattern should be a &'static str representing the regular expression. +// +// haystack should be a String. +macro_rules! bench_find { + ($name:ident, $pattern:expr, $count:expr, $haystack:expr) => { + #[bench] + fn $name(b: &mut Bencher) { + use std::sync::Mutex; + + lazy_static! { + static ref RE: Mutex = Mutex::new(regex!($pattern)); + static ref TEXT: Mutex = Mutex::new(text!($haystack)); + }; + let re = RE.lock().unwrap(); + let text = TEXT.lock().unwrap(); + b.bytes = text.len() as u64; + b.iter(|| { + let count = re.find_iter(&text).count(); + assert_eq!($count, count) + }); + } + } +} + +mod ffi; +mod misc; +mod sherlock; diff --git a/benches/src/bench_rust.rs b/benches/src/bench_rust.rs deleted file mode 100644 index 259c4a12b6..0000000000 --- a/benches/src/bench_rust.rs +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -#![feature(test)] - -#[macro_use] extern crate lazy_static; -extern crate regex; -extern crate regex_syntax; -extern crate test; - -pub use regex::Regex; - -// Due to macro scoping rules, this definition only applies for the modules -// defined below. Effectively, it allows us to use the same tests for both -// native and dynamic regexes. -macro_rules! 
regex( - ($re:expr) => {{ ::Regex::new($re).unwrap() }} -); - -mod misc; -mod rust_compile; -mod rust_parse; -mod sherlock; diff --git a/benches/src/bench_rust_bytes.rs b/benches/src/bench_rust_bytes.rs deleted file mode 100644 index 00b600ca73..0000000000 --- a/benches/src/bench_rust_bytes.rs +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -#![feature(test)] - -#[macro_use] extern crate lazy_static; -extern crate regex; -extern crate regex_syntax; -extern crate test; - -pub use regex::bytes::Regex; - -// Due to macro scoping rules, this definition only applies for the modules -// defined below. Effectively, it allows us to use the same tests for both -// native and dynamic regexes. -macro_rules! 
regex( - ($re:expr) => {{ ::Regex::new($re).unwrap() }} -); - -mod misc; -mod sherlock; diff --git a/benches/src/data/.gitignore b/benches/src/data/.gitignore new file mode 100644 index 0000000000..e7833ce8ff --- /dev/null +++ b/benches/src/data/.gitignore @@ -0,0 +1,3 @@ +10MB.txt +100MB.txt +1GB.txt diff --git a/benches/src/1K.txt b/benches/src/data/1K.txt similarity index 100% rename from benches/src/1K.txt rename to benches/src/data/1K.txt diff --git a/benches/src/1MB.txt b/benches/src/data/1MB.txt similarity index 100% rename from benches/src/1MB.txt rename to benches/src/data/1MB.txt diff --git a/benches/src/32.txt b/benches/src/data/32.txt similarity index 100% rename from benches/src/32.txt rename to benches/src/data/32.txt diff --git a/benches/src/32K.txt b/benches/src/data/32K.txt similarity index 100% rename from benches/src/32K.txt rename to benches/src/data/32K.txt diff --git a/benches/src/the-adventures-of-sherlock-holmes.txt b/benches/src/data/sherlock.txt similarity index 100% rename from benches/src/the-adventures-of-sherlock-holmes.txt rename to benches/src/data/sherlock.txt diff --git a/benches/src/bench_rust_plugin.rs b/benches/src/ffi/mod.rs similarity index 57% rename from benches/src/bench_rust_plugin.rs rename to benches/src/ffi/mod.rs index 5b428e76ae..7e14ea8bc0 100644 --- a/benches/src/bench_rust_plugin.rs +++ b/benches/src/ffi/mod.rs @@ -8,15 +8,17 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![feature(plugin, test)] -#![plugin(regex_macros)] +// We don't always use all of the method available to each regex engine because +// of conditional compilation and such. 
+#![allow(dead_code)] -#[macro_use] extern crate lazy_static; -extern crate regex; -extern crate regex_syntax; -extern crate test; - -pub use regex::Regex; - -mod misc; -mod sherlock; +#[cfg(feature = "re-onig")] +pub mod onig; +#[cfg(feature = "re-pcre1")] +pub mod pcre1; +#[cfg(feature = "re-pcre2")] +pub mod pcre2; +#[cfg(feature = "re-re2")] +pub mod re2; +#[cfg(feature = "re-tcl")] +pub mod tcl; diff --git a/benches/src/bench_onig.rs b/benches/src/ffi/onig.rs similarity index 54% rename from benches/src/bench_onig.rs rename to benches/src/ffi/onig.rs index 32a93caaf9..7d796d6178 100644 --- a/benches/src/bench_onig.rs +++ b/benches/src/ffi/onig.rs @@ -8,44 +8,27 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![feature(test)] - -#[macro_use] extern crate lazy_static; -extern crate onig; -extern crate test; - -use std::ops::Deref; +use onig; pub struct Regex(onig::Regex); unsafe impl Send for Regex {} -unsafe impl Sync for Regex {} - -impl Deref for Regex { - type Target = onig::Regex; - - fn deref(&self) -> &onig::Regex { - &self.0 - } -} impl Regex { - fn new(pattern: &str) -> Result { + pub fn new(pattern: &str) -> Result { onig::Regex::new(pattern).map(Regex) } - // Gah. onig's match function is anchored, but find is not. - fn is_match(&self, text: &str) -> bool { - self.search_with_options( + pub fn is_match(&self, text: &str) -> bool { + // Gah. onig's is_match function is anchored, but find is not. + self.0.search_with_options( text, 0, text.len(), onig::SEARCH_OPTION_NONE, None).is_some() } -} - -macro_rules! 
regex( - ($re:expr) => {{ - ::Regex::new($re).unwrap() - }} -); -mod misc; -mod sherlock; + pub fn find_iter<'r, 't>( + &'r self, + text: &'t str, + ) -> onig::FindMatches<'r, 't> { + self.0.find_iter(text) + } +} diff --git a/benches/src/bench_pcre.rs b/benches/src/ffi/pcre1.rs similarity index 80% rename from benches/src/bench_pcre.rs rename to benches/src/ffi/pcre1.rs index a430598274..6ad361e562 100644 --- a/benches/src/bench_pcre.rs +++ b/benches/src/ffi/pcre1.rs @@ -8,14 +8,8 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![feature(test)] #![allow(non_snake_case)] -#[macro_use] extern crate lazy_static; -extern crate libc; -extern crate libpcre_sys; -extern crate test; - use std::ffi::{CString, CStr}; use std::fmt; use std::ptr; @@ -33,16 +27,12 @@ const PCRE_STUDY_JIT_COMPLETE: c_int = 0x0001; // We use libpcre-sys directly because the pcre crate has unavoidable // performance problems in its core matching routines. (e.g., It always // allocates an ovector.) -struct Regex { +pub struct Regex { code: *mut pcre, extra: *mut pcre_extra, } -// Regex can't be used safely from multiple threads simultaneously, so this is -// a lie and therefore unsafe. It is, however, convenient and fine for the -// purposes of benchmarking where a Regex is only ever used in one thread. 
unsafe impl Send for Regex {} -unsafe impl Sync for Regex {} impl Drop for Regex { fn drop(&mut self) { @@ -53,19 +43,13 @@ impl Drop for Regex { } } -struct Error { +pub struct Error { msg: String, offset: c_int, } -struct FindMatches<'r, 't> { - re: &'r Regex, - text: &'t str, - last_match_end: usize, -} - impl Regex { - fn new(pattern: &str) -> Result { + pub fn new(pattern: &str) -> Result { let pattern = CString::new(pattern.to_owned()).unwrap(); let mut errptr: *const c_char = ptr::null(); let mut erroffset: c_int = 0; @@ -100,7 +84,19 @@ impl Regex { Ok(Regex { code: code, extra: extra }) } - fn _match(&self, text: &str, start: usize) -> Option<(usize, usize)> { + pub fn is_match(&self, text: &str) -> bool { + self.find_at(text, 0).is_some() + } + + pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { + FindMatches { + re: self, + text: text, + last_match_end: 0, + } + } + + fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { const OVEC_SIZE: usize = 15 * 3; // hopefully enough for benchmarks? let mut ovec: [c_int; OVEC_SIZE] = [0; OVEC_SIZE]; let err = unsafe { pcre_exec( @@ -121,25 +117,19 @@ impl Regex { Some((ovec[0] as usize, ovec[1] as usize)) } } +} - fn is_match(&mut self, text: &str) -> bool { - self._match(text, 0).is_some() - } - - fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { - FindMatches { - re: self, - text: text, - last_match_end: 0, - } - } +pub struct FindMatches<'r, 't> { + re: &'r Regex, + text: &'t str, + last_match_end: usize, } impl<'r, 't> Iterator for FindMatches<'r, 't> { type Item = (usize, usize); fn next(&mut self) -> Option<(usize, usize)> { - match self.re._match(self.text, self.last_match_end) { + match self.re.find_at(self.text, self.last_match_end) { None => None, Some((s, e)) => { self.last_match_end = e; @@ -154,10 +144,3 @@ impl fmt::Debug for Error { write!(f, "PCRE error at {:?}: {}", self.offset, self.msg) } } - -macro_rules! 
regex( - ($re:expr) => { ::Regex::new($re).unwrap() } -); - -mod misc; -mod sherlock; diff --git a/benches/src/bench_pcre2.rs b/benches/src/ffi/pcre2.rs similarity index 78% rename from benches/src/bench_pcre2.rs rename to benches/src/ffi/pcre2.rs index 04d605a12a..d54dcc7c80 100644 --- a/benches/src/bench_pcre2.rs +++ b/benches/src/ffi/pcre2.rs @@ -8,15 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![feature(test)] #![allow(non_camel_case_types)] -#![allow(non_snake_case)] -#![allow(dead_code)] - -#[macro_use] -extern crate lazy_static; -extern crate libc; -extern crate test; use std::fmt; use std::ptr; @@ -24,117 +16,39 @@ use std::str; use libc::{c_int, c_void, size_t, uint8_t, uint32_t}; -const PCRE2_UCP: uint32_t = 0x00020000; -const PCRE2_UTF: uint32_t = 0x00080000; -const PCRE2_NO_UTF_CHECK: uint32_t = 0x40000000; -const PCRE2_JIT_COMPLETE: uint32_t = 0x00000001; -const PCRE2_ERROR_NOMATCH: c_int = -1; - -type code = c_void; - -type match_data = c_void; - -type compile_context = c_void; // unused - -type general_context = c_void; // unused - -type match_context = c_void; // unused - -extern { - fn pcre2_compile_8( - pattern: *const uint8_t, - len: size_t, - options: uint32_t, - error_code: *mut c_int, - error_offset: *mut size_t, - context: *mut compile_context, - ) -> *mut code; - - fn pcre2_code_free_8( - code: *mut code, - ); - - fn pcre2_match_data_create_from_pattern_8( - code: *const code, - context: *mut general_context, - ) -> *mut match_data; - - fn pcre2_match_data_free_8( - match_data: *mut match_data, - ); - - fn pcre2_get_ovector_pointer_8( - match_data: *mut match_data, - ) -> *mut size_t; - - fn pcre2_match_8( - code: *const code, - subject: *const uint8_t, - length: size_t, - startoffset: size_t, - options: uint32_t, - match_data: *mut match_data, - match_context: *mut match_context, - ) -> c_int; - - fn pcre2_jit_compile_8( - code: *const code, - options: uint32_t, - ) -> 
c_int; - - fn pcre2_jit_match_8( - code: *const code, - subject: *const uint8_t, - length: size_t, - startoffset: size_t, - options: uint32_t, - match_data: *mut match_data, - match_context: *mut match_context, - ) -> c_int; - - fn pcre2_get_error_message_8( - error_code: c_int, - buf: *mut uint8_t, - buflen: size_t, - ) -> c_int; -} - -struct Regex { +pub struct Regex { code: *mut code, match_data: *mut match_data, ovector: *mut size_t, } -// Regex can't be used safely from multiple threads simultaneously, so this is -// a lie and therefore unsafe. It is, however, convenient and fine for the -// purposes of benchmarking where a Regex is only ever used in one thread. unsafe impl Send for Regex {} -unsafe impl Sync for Regex {} impl Drop for Regex { fn drop(&mut self) { - unsafe { pcre2_code_free_8(self.code); } + unsafe { + pcre2_match_data_free_8(self.match_data); + pcre2_code_free_8(self.code); + } } } -struct Error { +pub struct Error { code: c_int, offset: size_t, } -struct FindMatches<'r, 't> { - re: &'r Regex, - text: &'t str, - last_match_end: usize, -} - impl Regex { - fn new(pattern: &str) -> Result { + pub fn new(pattern: &str) -> Result { let mut error_code: c_int = 0; let mut error_offset: size_t = 0; let code = unsafe { pcre2_compile_8( pattern.as_ptr(), pattern.len(), + // PCRE2 can get significantly faster in some cases depending + // on the permutation of these options (in particular, dropping + // UCP). We should endeavor to have a separate "ASCII compatible" + // benchmark. 
PCRE2_UCP | PCRE2_UTF, &mut error_code, &mut error_offset, @@ -164,7 +78,23 @@ impl Regex { Ok(Regex { code: code, match_data: match_data, ovector: ovector }) } - fn _match(&self, text: &str, start: usize) -> Option<(usize, usize)> { + pub fn is_match(&self, text: &str) -> bool { + self.find_at(text, 0).is_some() + } + + pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { + FindMatches { + re: self, + text: text, + last_match_end: 0, + } + } + + fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { + // The man pages for PCRE2 say that pcre2_jit_match is the fastest + // way to execute a JIT match because it skips sanity checks. We also + // explicitly disable the UTF-8 validity check, but it's probably not + // necessary. let err = unsafe { pcre2_jit_match_8( self.code, text.as_ptr(), @@ -179,30 +109,22 @@ impl Regex { } else if err < 0 { panic!("unknown error code: {:?}", err) } else { - let s = unsafe { *self.ovector }; - let e = unsafe { *self.ovector.offset(1) }; - Some((s, e)) + Some(unsafe { (*self.ovector, *self.ovector.offset(1)) }) } } +} - fn is_match(&self, text: &str) -> bool { - self._match(text, 0).is_some() - } - - fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { - FindMatches { - re: self, - text: text, - last_match_end: 0, - } - } +pub struct FindMatches<'r, 't> { + re: &'r Regex, + text: &'t str, + last_match_end: usize, } impl<'r, 't> Iterator for FindMatches<'r, 't> { type Item = (usize, usize); fn next(&mut self) -> Option<(usize, usize)> { - match self.re._match(self.text, self.last_match_end) { + match self.re.find_at(self.text, self.last_match_end) { None => None, Some((s, e)) => { self.last_match_end = e; @@ -231,9 +153,69 @@ impl fmt::Debug for Error { } } -macro_rules! regex( - ($re:expr) => { ::Regex::new($re).unwrap() } -); +// PCRE2 FFI. We only wrap the bits we need. 
+ +const PCRE2_UCP: uint32_t = 0x00020000; +const PCRE2_UTF: uint32_t = 0x00080000; +const PCRE2_NO_UTF_CHECK: uint32_t = 0x40000000; +const PCRE2_JIT_COMPLETE: uint32_t = 0x00000001; +const PCRE2_ERROR_NOMATCH: c_int = -1; + +type code = c_void; -mod misc; -mod sherlock; +type match_data = c_void; + +type compile_context = c_void; // unused + +type general_context = c_void; // unused + +type match_context = c_void; // unused + +extern { + fn pcre2_compile_8( + pattern: *const uint8_t, + len: size_t, + options: uint32_t, + error_code: *mut c_int, + error_offset: *mut size_t, + context: *mut compile_context, + ) -> *mut code; + + fn pcre2_code_free_8( + code: *mut code, + ); + + fn pcre2_match_data_create_from_pattern_8( + code: *const code, + context: *mut general_context, + ) -> *mut match_data; + + fn pcre2_match_data_free_8( + match_data: *mut match_data, + ); + + fn pcre2_get_ovector_pointer_8( + match_data: *mut match_data, + ) -> *mut size_t; + + fn pcre2_jit_compile_8( + code: *const code, + options: uint32_t, + ) -> c_int; + + fn pcre2_jit_match_8( + code: *const code, + subject: *const uint8_t, + length: size_t, + startoffset: size_t, + options: uint32_t, + match_data: *mut match_data, + match_context: *mut match_context, + ) -> c_int; + + fn pcre2_get_error_message_8( + error_code: c_int, + buf: *mut uint8_t, + buflen: size_t, + ) -> c_int; +} diff --git a/benches/src/ffi/re2.cpp b/benches/src/ffi/re2.cpp new file mode 100644 index 0000000000..cc75b87bcd --- /dev/null +++ b/benches/src/ffi/re2.cpp @@ -0,0 +1,50 @@ +#include +#include + +#include "re2.h" + +using namespace re2; + +extern "C" { + typedef void re2_regexp; + + typedef struct re2_string { + const char *text; + int len; + } re2_string; + + re2_regexp* re2_regexp_new(re2_string pat) { + re2::StringPiece re2_pat(pat.text, pat.len); + return reinterpret_cast(new RE2(re2_pat)); + } + + void re2_regexp_free(re2_regexp *re) { + delete reinterpret_cast(re); + } + + bool re2_regexp_match(re2_regexp 
*re, re2_string text, + int startpos, int endpos) { + RE2 *cpp_re = reinterpret_cast(re); + re2::StringPiece cpp_text(text.text, text.len); + + return cpp_re->Match(cpp_text, startpos, endpos, RE2::UNANCHORED, + NULL, 0); + } + + bool re2_regexp_find(re2_regexp *re, re2_string text, + int startpos, int endpos, + int *match_start, int *match_end) { + RE2 *cpp_re = reinterpret_cast(re); + re2::StringPiece cpp_text(text.text, text.len); + re2::StringPiece result; + bool matched; + + matched = cpp_re->Match(cpp_text, startpos, endpos, RE2::UNANCHORED, + &result, 1); + if (matched) { + *match_start = result.data() - cpp_text.data(); + *match_end = *match_start + result.length(); + } + return matched; + } +} diff --git a/benches/src/ffi/re2.h b/benches/src/ffi/re2.h new file mode 100644 index 0000000000..4a8c5c8a7e --- /dev/null +++ b/benches/src/ffi/re2.h @@ -0,0 +1,873 @@ +// Copyright 2003-2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef RE2_RE2_H +#define RE2_RE2_H + +// C++ interface to the re2 regular-expression library. +// RE2 supports Perl-style regular expressions (with extensions like +// \d, \w, \s, ...). +// +// ----------------------------------------------------------------------- +// REGEXP SYNTAX: +// +// This module uses the re2 library and hence supports +// its syntax for regular expressions, which is similar to Perl's with +// some of the more complicated things thrown away. In particular, +// backreferences and generalized assertions are not available, nor is \Z. +// +// See https://github.com/google/re2/wiki/Syntax for the syntax +// supported by RE2, and a comparison with PCRE and PERL regexps. 
+// +// For those not familiar with Perl's regular expressions, +// here are some examples of the most commonly used extensions: +// +// "hello (\\w+) world" -- \w matches a "word" character +// "version (\\d+)" -- \d matches a digit +// "hello\\s+world" -- \s matches any whitespace character +// "\\b(\\w+)\\b" -- \b matches non-empty string at word boundary +// "(?i)hello" -- (?i) turns on case-insensitive matching +// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible +// +// ----------------------------------------------------------------------- +// MATCHING INTERFACE: +// +// The "FullMatch" operation checks that supplied text matches a +// supplied pattern exactly. +// +// Example: successful match +// CHECK(RE2::FullMatch("hello", "h.*o")); +// +// Example: unsuccessful match (requires full match): +// CHECK(!RE2::FullMatch("hello", "e")); +// +// ----------------------------------------------------------------------- +// UTF-8 AND THE MATCHING INTERFACE: +// +// By default, the pattern and input text are interpreted as UTF-8. +// The RE2::Latin1 option causes them to be interpreted as Latin-1. +// +// Example: +// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern))); +// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1))); +// +// ----------------------------------------------------------------------- +// MATCHING WITH SUB-STRING EXTRACTION: +// +// You can supply extra pointer arguments to extract matched subpieces. 
+// +// Example: extracts "ruby" into "s" and 1234 into "i" +// int i; +// string s; +// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); +// +// Example: fails because string cannot be stored in integer +// CHECK(!RE2::FullMatch("ruby", "(.*)", &i)); +// +// Example: fails because there aren't enough sub-patterns: +// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s)); +// +// Example: does not try to extract any extra sub-patterns +// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s)); +// +// Example: does not try to extract into NULL +// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i)); +// +// Example: integer overflow causes failure +// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); +// +// NOTE(rsc): Asking for substrings slows successful matches quite a bit. +// This may get a little faster in the future, but right now is slower +// than PCRE. On the other hand, failed matches run *very* fast (faster +// than PCRE), as do matches without substring extraction. +// +// ----------------------------------------------------------------------- +// PARTIAL MATCHES +// +// You can use the "PartialMatch" operation when you want the pattern +// to match any substring of the text. +// +// Example: simple search for a string: +// CHECK(RE2::PartialMatch("hello", "ell")); +// +// Example: find first number in a string +// int number; +// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number)); +// CHECK_EQ(number, 100); +// +// ----------------------------------------------------------------------- +// PRE-COMPILED REGULAR EXPRESSIONS +// +// RE2 makes it easy to use any string as a regular expression, without +// requiring a separate compilation step. +// +// If speed is of the essence, you can create a pre-compiled "RE2" +// object from the pattern and use it multiple times. If you do so, +// you can typically parse text faster than with sscanf. 
+// +// Example: precompile pattern for faster matching: +// RE2 pattern("h.*o"); +// while (ReadLine(&str)) { +// if (RE2::FullMatch(str, pattern)) ...; +// } +// +// ----------------------------------------------------------------------- +// SCANNING TEXT INCREMENTALLY +// +// The "Consume" operation may be useful if you want to repeatedly +// match regular expressions at the front of a string and skip over +// them as they match. This requires use of the "StringPiece" type, +// which represents a sub-range of a real string. +// +// Example: read lines of the form "var = value" from a string. +// string contents = ...; // Fill string somehow +// StringPiece input(contents); // Wrap a StringPiece around it +// +// string var; +// int value; +// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) { +// ...; +// } +// +// Each successful call to "Consume" will set "var/value", and also +// advance "input" so it points past the matched text. Note that if the +// regular expression matches an empty string, input will advance +// by 0 bytes. If the regular expression being used might match +// an empty string, the loop body must check for this case and either +// advance the string or break out of the loop. +// +// The "FindAndConsume" operation is similar to "Consume" but does not +// anchor your match at the beginning of the string. For example, you +// could extract all words from a string by repeatedly calling +// RE2::FindAndConsume(&input, "(\\w+)", &word) +// +// ----------------------------------------------------------------------- +// USING VARIABLE NUMBER OF ARGUMENTS +// +// The above operations require you to know the number of arguments +// when you write the code. This is not always possible or easy (for +// example, the regular expression may be calculated at run time). +// You can use the "N" version of the operations when the number of +// match arguments are determined at run time. 
+// +// Example: +// const RE2::Arg* args[10]; +// int n; +// // ... populate args with pointers to RE2::Arg values ... +// // ... set n to the number of RE2::Arg objects ... +// bool match = RE2::FullMatchN(input, pattern, args, n); +// +// The last statement is equivalent to +// +// bool match = RE2::FullMatch(input, pattern, +// *args[0], *args[1], ..., *args[n - 1]); +// +// ----------------------------------------------------------------------- +// PARSING HEX/OCTAL/C-RADIX NUMBERS +// +// By default, if you pass a pointer to a numeric value, the +// corresponding text is interpreted as a base-10 number. You can +// instead wrap the pointer with a call to one of the operators Hex(), +// Octal(), or CRadix() to interpret the text in another base. The +// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16) +// prefixes, but defaults to base-10. +// +// Example: +// int a, b, c, d; +// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)", +// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)); +// will leave 64 in a, b, c, and d. + +#include +#include +#include +#include +#include "re2/stringpiece.h" +#include "re2/variadic_function.h" + +#ifndef RE2_HAVE_LONGLONG +#define RE2_HAVE_LONGLONG 1 +#endif + +namespace re2 { + +using std::string; +using std::map; +class Prog; +class Regexp; + +// Interface for regular expression matching. Also corresponds to a +// pre-compiled regular expression. An "RE2" object is safe for +// concurrent use by multiple threads. +class RE2 { + public: + // We convert user-passed pointers into special Arg objects + class Arg; + class Options; + + // Defined in set.h. 
+ class Set; + + enum ErrorCode { + NoError = 0, + + // Unexpected error + ErrorInternal, + + // Parse errors + ErrorBadEscape, // bad escape sequence + ErrorBadCharClass, // bad character class + ErrorBadCharRange, // bad character class range + ErrorMissingBracket, // missing closing ] + ErrorMissingParen, // missing closing ) + ErrorTrailingBackslash, // trailing \ at end of regexp + ErrorRepeatArgument, // repeat argument missing, e.g. "*" + ErrorRepeatSize, // bad repetition argument + ErrorRepeatOp, // bad repetition operator + ErrorBadPerlOp, // bad perl operator + ErrorBadUTF8, // invalid UTF-8 in regexp + ErrorBadNamedCapture, // bad named capture group + ErrorPatternTooLarge // pattern too large (compile failed) + }; + + // Predefined common options. + // If you need more complicated things, instantiate + // an Option class, possibly passing one of these to + // the Option constructor, change the settings, and pass that + // Option class to the RE2 constructor. + enum CannedOptions { + DefaultOptions = 0, + Latin1, // treat input as Latin-1 (default UTF-8) + POSIX, // POSIX syntax, leftmost-longest match + Quiet // do not log about regexp parse errors + }; + + // Need to have the const char* and const string& forms for implicit + // conversions when passing string literals to FullMatch and PartialMatch. + // Otherwise the StringPiece form would be sufficient. +#ifndef SWIG + RE2(const char* pattern); + RE2(const string& pattern); +#endif + RE2(const StringPiece& pattern); + RE2(const StringPiece& pattern, const Options& option); + ~RE2(); + + // Returns whether RE2 was created properly. + bool ok() const { return error_code() == NoError; } + + // The string specification for this RE2. E.g. + // RE2 re("ab*c?d+"); + // re.pattern(); // "ab*c?d+" + const string& pattern() const { return pattern_; } + + // If RE2 could not be created properly, returns an error string. + // Else returns the empty string. 
+ const string& error() const { return *error_; } + + // If RE2 could not be created properly, returns an error code. + // Else returns RE2::NoError (== 0). + ErrorCode error_code() const { return error_code_; } + + // If RE2 could not be created properly, returns the offending + // portion of the regexp. + const string& error_arg() const { return error_arg_; } + + // Returns the program size, a very approximate measure of a regexp's "cost". + // Larger numbers are more expensive than smaller numbers. + int ProgramSize() const; + + // EXPERIMENTAL! SUBJECT TO CHANGE! + // Outputs the program fanout as a histogram bucketed by powers of 2. + // Returns the number of the largest non-empty bucket. + int ProgramFanout(map* histogram) const; + + // Returns the underlying Regexp; not for general use. + // Returns entire_regexp_ so that callers don't need + // to know about prefix_ and prefix_foldcase_. + re2::Regexp* Regexp() const { return entire_regexp_; } + + /***** The useful part: the matching interface *****/ + + // Matches "text" against "pattern". If pointer arguments are + // supplied, copies matched sub-patterns into them. + // + // You can pass in a "const char*" or a "string" for "text". + // You can pass in a "const char*" or a "string" or a "RE2" for "pattern". + // + // The provided pointer arguments can be pointers to any scalar numeric + // type, or one of: + // string (matched piece is copied to string) + // StringPiece (StringPiece is mutated to point to matched piece) + // T (where "bool T::ParseFrom(const char*, int)" exists) + // (void*)NULL (the corresponding matched sub-pattern is not copied) + // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "pattern" exactly + // b. The number of matched sub-patterns is >= number of supplied pointers + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. 
If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, "i"th captured sub-pattern is + // ignored. + // + // CAVEAT: An optional sub-pattern that does not exist in the + // matched string is assigned the empty string. Therefore, the + // following will return false (because the empty string is not a + // valid number): + // int number; + // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number); + static bool FullMatchN(const StringPiece& text, const RE2& re, + const Arg* const args[], int argc); + static const VariadicFunction2< + bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch; + + // Exactly like FullMatch(), except that "pattern" is allowed to match + // a substring of "text". + static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args + const Arg* const args[], int argc); + static const VariadicFunction2< + bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch; + + // Like FullMatch() and PartialMatch(), except that pattern has to + // match a prefix of "text", and "input" is advanced past the matched + // text. Note: "input" is modified iff this routine returns true. + static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args + const Arg* const args[], int argc); + static const VariadicFunction2< + bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume; + + // Like Consume(..), but does not anchor the match at the beginning of the + // string. That is, "pattern" need not start its match at the beginning of + // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next + // word in "s" and stores it in "word". + static bool FindAndConsumeN(StringPiece* input, const RE2& pattern, + const Arg* const args[], int argc); + static const VariadicFunction2< + bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume; + + // Replace the first match of "pattern" in "str" with "rewrite". 
+ // Within "rewrite", backslash-escaped digits (\1 to \9) can be + // used to insert text matching corresponding parenthesized group + // from the pattern. \0 in "rewrite" refers to the entire matching + // text. E.g., + // + // string s = "yabba dabba doo"; + // CHECK(RE2::Replace(&s, "b+", "d")); + // + // will leave "s" containing "yada dabba doo" + // + // Returns true if the pattern matches and a replacement occurs, + // false otherwise. + static bool Replace(string *str, + const RE2& pattern, + const StringPiece& rewrite); + + // Like Replace(), except replaces successive non-overlapping occurrences + // of the pattern in the string with the rewrite. E.g. + // + // string s = "yabba dabba doo"; + // CHECK(RE2::GlobalReplace(&s, "b+", "d")); + // + // will leave "s" containing "yada dada doo" + // Replacements are not subject to re-matching. + // + // Because GlobalReplace only replaces non-overlapping matches, + // replacing "ana" within "banana" makes only one replacement, not two. + // + // Returns the number of replacements made. + static int GlobalReplace(string *str, + const RE2& pattern, + const StringPiece& rewrite); + + // Like Replace, except that if the pattern matches, "rewrite" + // is copied into "out" with substitutions. The non-matching + // portions of "text" are ignored. + // + // Returns true iff a match occurred and the extraction happened + // successfully; if no match occurs, the string is left unaffected. + // + // REQUIRES: "text" must not alias any part of "*out". + static bool Extract(const StringPiece &text, + const RE2& pattern, + const StringPiece &rewrite, + string *out); + + // Escapes all potentially meaningful regexp characters in + // 'unquoted'. The returned string, used as a regular expression, + // will exactly match the original string. For example, + // 1.5-2.0? + // may become: + // 1\.5\-2\.0\? + static string QuoteMeta(const StringPiece& unquoted); + + // Computes range for any strings matching regexp. 
The min and max can in + // some cases be arbitrarily precise, so the caller gets to specify the + // maximum desired length of string returned. + // + // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any + // string s that is an anchored match for this regexp satisfies + // min <= s && s <= max. + // + // Note that PossibleMatchRange() will only consider the first copy of an + // infinitely repeated element (i.e., any regexp element followed by a '*' or + // '+' operator). Regexps with "{N}" constructions are not affected, as those + // do not compile down to infinite repetitions. + // + // Returns true on success, false on error. + bool PossibleMatchRange(string* min, string* max, int maxlen) const; + + // Generic matching interface + + // Type of match. + enum Anchor { + UNANCHORED, // No anchoring + ANCHOR_START, // Anchor at start only + ANCHOR_BOTH // Anchor at start and end + }; + + // Return the number of capturing subpatterns, or -1 if the + // regexp wasn't valid on construction. The overall match ($0) + // does not count: if the regexp is "(a)(b)", returns 2. + int NumberOfCapturingGroups() const; + + // Return a map from names to capturing indices. + // The map records the index of the leftmost group + // with the given name. + // Only valid until the re is deleted. + const map& NamedCapturingGroups() const; + + // Return a map from capturing indices to names. + // The map has no entries for unnamed groups. + // Only valid until the re is deleted. + const map& CapturingGroupNames() const; + + // General matching routine. + // Match against text starting at offset startpos + // and stopping the search at offset endpos. + // Returns true if match found, false if not. + // On a successful match, fills in match[] (up to nmatch entries) + // with information about submatches. + // I.e. 
matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, + // setting match[0] = "barbaz", match[1].data() = NULL, match[2] = "bar", + // match[3].data() = NULL, ..., up to match[nmatch-1].data() = NULL. + // + // Don't ask for more match information than you will use: + // runs much faster with nmatch == 1 than nmatch > 1, and + // runs even faster if nmatch == 0. + // Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(), + // but will be handled correctly. + // + // Passing text == StringPiece(NULL, 0) will be handled like any other + // empty string, but note that on return, it will not be possible to tell + // whether submatch i matched the empty string or did not match: + // either way, match[i].data() == NULL. + bool Match(const StringPiece& text, + int startpos, + int endpos, + Anchor anchor, + StringPiece *match, + int nmatch) const; + + // Check that the given rewrite string is suitable for use with this + // regular expression. It checks that: + // * The regular expression has enough parenthesized subexpressions + // to satisfy all of the \N tokens in rewrite + // * The rewrite string doesn't have any syntax errors. E.g., + // '\' followed by anything other than a digit or '\'. + // A true return value guarantees that Replace() and Extract() won't + // fail because of a bad rewrite string. + bool CheckRewriteString(const StringPiece& rewrite, string* error) const; + + // Returns the maximum submatch needed for the rewrite to be done by + // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2. + static int MaxSubmatch(const StringPiece& rewrite); + + // Append the "rewrite" string, with backslash substitutions from "vec", + // to string "out". + // Returns true on success. This method can fail because of a malformed + // rewrite string. CheckRewriteString guarantees that the rewrite will + // be successful.
+ bool Rewrite(string *out, + const StringPiece &rewrite, + const StringPiece* vec, + int veclen) const; + + // Constructor options + class Options { + public: + // The options are (defaults in parentheses): + // + // utf8 (true) text and pattern are UTF-8; otherwise Latin-1 + // posix_syntax (false) restrict regexps to POSIX egrep syntax + // longest_match (false) search for longest match, not first match + // log_errors (true) log syntax and execution errors to ERROR + // max_mem (see below) approx. max memory footprint of RE2 + // literal (false) interpret string as literal, not regexp + // never_nl (false) never match \n, even if it is in regexp + // dot_nl (false) dot matches everything including new line + // never_capture (false) parse all parens as non-capturing + // case_sensitive (true) match is case-sensitive (regexp can override + // with (?i) unless in posix_syntax mode) + // + // The following options are only consulted when posix_syntax == true. + // (When posix_syntax == false these features are always enabled and + // cannot be turned off.) + // perl_classes (false) allow Perl's \d \s \w \D \S \W + // word_boundary (false) allow Perl's \b \B (word boundary and not) + // one_line (false) ^ and $ only match beginning and end of text + // + // The max_mem option controls how much memory can be used + // to hold the compiled form of the regexp (the Prog) and + // its cached DFA graphs. Code Search placed limits on the number + // of Prog instructions and DFA states: 10,000 for both. + // In RE2, those limits would translate to about 240 KB per Prog + // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a + // better job of keeping them small than Code Search did). + // Each RE2 has two Progs (one forward, one reverse), and each Prog + // can have two DFAs (one first match, one longest match). 
+ // That makes 4 DFAs: + // + // forward, first-match - used for UNANCHORED or ANCHOR_START searches + // if opt.longest_match() == false + // forward, longest-match - used for all ANCHOR_BOTH searches, + // and the other two kinds if + // opt.longest_match() == true + // reverse, first-match - never used + // reverse, longest-match - used as second phase for unanchored searches + // + // The RE2 memory budget is statically divided between the two + // Progs and then the DFAs: two thirds to the forward Prog + // and one third to the reverse Prog. The forward Prog gives half + // of what it has left over to each of its DFAs. The reverse Prog + // gives it all to its longest-match DFA. + // + // Once a DFA fills its budget, it flushes its cache and starts over. + // If this happens too often, RE2 falls back on the NFA implementation. + + // For now, make the default budget something close to Code Search. + static const int kDefaultMaxMem = 8<<20; + + enum Encoding { + EncodingUTF8 = 1, + EncodingLatin1 + }; + + Options() : + encoding_(EncodingUTF8), + posix_syntax_(false), + longest_match_(false), + log_errors_(true), + max_mem_(kDefaultMaxMem), + literal_(false), + never_nl_(false), + dot_nl_(false), + never_capture_(false), + case_sensitive_(true), + perl_classes_(false), + word_boundary_(false), + one_line_(false) { + } + + /*implicit*/ Options(CannedOptions); + + Encoding encoding() const { return encoding_; } + void set_encoding(Encoding encoding) { encoding_ = encoding; } + + // Legacy interface to encoding. + // TODO(rsc): Remove once clients have been converted.
+ bool utf8() const { return encoding_ == EncodingUTF8; } + void set_utf8(bool b) { + if (b) { + encoding_ = EncodingUTF8; + } else { + encoding_ = EncodingLatin1; + } + } + + bool posix_syntax() const { return posix_syntax_; } + void set_posix_syntax(bool b) { posix_syntax_ = b; } + + bool longest_match() const { return longest_match_; } + void set_longest_match(bool b) { longest_match_ = b; } + + bool log_errors() const { return log_errors_; } + void set_log_errors(bool b) { log_errors_ = b; } + + int64_t max_mem() const { return max_mem_; } + void set_max_mem(int64_t m) { max_mem_ = m; } + + bool literal() const { return literal_; } + void set_literal(bool b) { literal_ = b; } + + bool never_nl() const { return never_nl_; } + void set_never_nl(bool b) { never_nl_ = b; } + + bool dot_nl() const { return dot_nl_; } + void set_dot_nl(bool b) { dot_nl_ = b; } + + bool never_capture() const { return never_capture_; } + void set_never_capture(bool b) { never_capture_ = b; } + + bool case_sensitive() const { return case_sensitive_; } + void set_case_sensitive(bool b) { case_sensitive_ = b; } + + bool perl_classes() const { return perl_classes_; } + void set_perl_classes(bool b) { perl_classes_ = b; } + + bool word_boundary() const { return word_boundary_; } + void set_word_boundary(bool b) { word_boundary_ = b; } + + bool one_line() const { return one_line_; } + void set_one_line(bool b) { one_line_ = b; } + + void Copy(const Options& src) { + encoding_ = src.encoding_; + posix_syntax_ = src.posix_syntax_; + longest_match_ = src.longest_match_; + log_errors_ = src.log_errors_; + max_mem_ = src.max_mem_; + literal_ = src.literal_; + never_nl_ = src.never_nl_; + dot_nl_ = src.dot_nl_; + never_capture_ = src.never_capture_; + case_sensitive_ = src.case_sensitive_; + perl_classes_ = src.perl_classes_; + word_boundary_ = src.word_boundary_; + one_line_ = src.one_line_; + } + + int ParseFlags() const; + + private: + Encoding encoding_; + bool posix_syntax_; + bool 
longest_match_; + bool log_errors_; + int64_t max_mem_; + bool literal_; + bool never_nl_; + bool dot_nl_; + bool never_capture_; + bool case_sensitive_; + bool perl_classes_; + bool word_boundary_; + bool one_line_; + + //DISALLOW_COPY_AND_ASSIGN(Options); + Options(const Options&); + void operator=(const Options&); + }; + + // Returns the options set in the constructor. + const Options& options() const { return options_; }; + + // Argument converters; see below. + static inline Arg CRadix(short* x); + static inline Arg CRadix(unsigned short* x); + static inline Arg CRadix(int* x); + static inline Arg CRadix(unsigned int* x); + static inline Arg CRadix(long* x); + static inline Arg CRadix(unsigned long* x); + #if RE2_HAVE_LONGLONG + static inline Arg CRadix(long long* x); + static inline Arg CRadix(unsigned long long* x); + #endif + + static inline Arg Hex(short* x); + static inline Arg Hex(unsigned short* x); + static inline Arg Hex(int* x); + static inline Arg Hex(unsigned int* x); + static inline Arg Hex(long* x); + static inline Arg Hex(unsigned long* x); + #if RE2_HAVE_LONGLONG + static inline Arg Hex(long long* x); + static inline Arg Hex(unsigned long long* x); + #endif + + static inline Arg Octal(short* x); + static inline Arg Octal(unsigned short* x); + static inline Arg Octal(int* x); + static inline Arg Octal(unsigned int* x); + static inline Arg Octal(long* x); + static inline Arg Octal(unsigned long* x); + #if RE2_HAVE_LONGLONG + static inline Arg Octal(long long* x); + static inline Arg Octal(unsigned long long* x); + #endif + + private: + void Init(const StringPiece& pattern, const Options& options); + + bool DoMatch(const StringPiece& text, + Anchor anchor, + int* consumed, + const Arg* const args[], + int n) const; + + re2::Prog* ReverseProg() const; + + string pattern_; // string regular expression + Options options_; // option flags + string prefix_; // required prefix (before regexp_) + bool prefix_foldcase_; // prefix is ASCII case-insensitive 
+ re2::Regexp* entire_regexp_; // parsed regular expression + re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed + re2::Prog* prog_; // compiled program for regexp + bool is_one_pass_; // can use prog_->SearchOnePass? + + mutable re2::Prog* rprog_; // reverse program for regexp + mutable const string* error_; // Error indicator + // (or points to empty string) + mutable ErrorCode error_code_; // Error code + mutable string error_arg_; // Fragment of regexp showing error + mutable int num_captures_; // Number of capturing groups + + // Map from capture names to indices + mutable const map* named_groups_; + + // Map from capture indices to names + mutable const map* group_names_; + + // Onces for lazy computations. + mutable std::once_flag rprog_once_; + mutable std::once_flag num_captures_once_; + mutable std::once_flag named_groups_once_; + mutable std::once_flag group_names_once_; + + //DISALLOW_COPY_AND_ASSIGN(RE2); + RE2(const RE2&); + void operator=(const RE2&); +}; + +/***** Implementation details *****/ + +// Hex/Octal/Binary? 
+ +// Special class for parsing into objects that define a ParseFrom() method +template +class _RE2_MatchObject { + public: + static inline bool Parse(const char* str, int n, void* dest) { + if (dest == NULL) return true; + T* object = reinterpret_cast(dest); + return object->ParseFrom(str, n); + } +}; + +class RE2::Arg { + public: + // Empty constructor so we can declare arrays of RE2::Arg + Arg(); + + // Constructor specially designed for NULL arguments + Arg(void*); + + typedef bool (*Parser)(const char* str, int n, void* dest); + +// Type-specific parsers +#define MAKE_PARSER(type,name) \ + Arg(type* p) : arg_(p), parser_(name) { } \ + Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \ + + + MAKE_PARSER(char, parse_char); + MAKE_PARSER(signed char, parse_char); + MAKE_PARSER(unsigned char, parse_uchar); + MAKE_PARSER(short, parse_short); + MAKE_PARSER(unsigned short, parse_ushort); + MAKE_PARSER(int, parse_int); + MAKE_PARSER(unsigned int, parse_uint); + MAKE_PARSER(long, parse_long); + MAKE_PARSER(unsigned long, parse_ulong); + #if RE2_HAVE_LONGLONG + MAKE_PARSER(long long, parse_longlong); + MAKE_PARSER(unsigned long long, parse_ulonglong); + #endif + MAKE_PARSER(float, parse_float); + MAKE_PARSER(double, parse_double); + MAKE_PARSER(string, parse_string); + MAKE_PARSER(StringPiece, parse_stringpiece); + +#undef MAKE_PARSER + + // Generic constructor templates + template Arg(T* p) + : arg_(p), parser_(_RE2_MatchObject::Parse) { } + template Arg(T* p, Parser parser) + : arg_(p), parser_(parser) { } + + // Parse the data + bool Parse(const char* str, int n) const; + + private: + void* arg_; + Parser parser_; + + static bool parse_null (const char* str, int n, void* dest); + static bool parse_char (const char* str, int n, void* dest); + static bool parse_uchar (const char* str, int n, void* dest); + static bool parse_float (const char* str, int n, void* dest); + static bool parse_double (const char* str, int n, void* dest); + static bool parse_string 
(const char* str, int n, void* dest); + static bool parse_stringpiece (const char* str, int n, void* dest); + +#define DECLARE_INTEGER_PARSER(name) \ + private: \ + static bool parse_ ## name(const char* str, int n, void* dest); \ + static bool parse_ ## name ## _radix( \ + const char* str, int n, void* dest, int radix); \ + public: \ + static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \ + static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \ + static bool parse_ ## name ## _cradix(const char* str, int n, void* dest) + + DECLARE_INTEGER_PARSER(short); + DECLARE_INTEGER_PARSER(ushort); + DECLARE_INTEGER_PARSER(int); + DECLARE_INTEGER_PARSER(uint); + DECLARE_INTEGER_PARSER(long); + DECLARE_INTEGER_PARSER(ulong); + #if RE2_HAVE_LONGLONG + DECLARE_INTEGER_PARSER(longlong); + DECLARE_INTEGER_PARSER(ulonglong); + #endif + +#undef DECLARE_INTEGER_PARSER +}; + +inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { } +inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } + +inline bool RE2::Arg::Parse(const char* str, int n) const { + return (*parser_)(str, n, arg_); +} + +// This part of the parser, appropriate only for ints, deals with bases +#define MAKE_INTEGER_PARSER(type, name) \ + inline RE2::Arg RE2::Hex(type* ptr) { \ + return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \ + inline RE2::Arg RE2::Octal(type* ptr) { \ + return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \ + inline RE2::Arg RE2::CRadix(type* ptr) { \ + return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); } + +MAKE_INTEGER_PARSER(short, short) +MAKE_INTEGER_PARSER(unsigned short, ushort) +MAKE_INTEGER_PARSER(int, int) +MAKE_INTEGER_PARSER(unsigned int, uint) +MAKE_INTEGER_PARSER(long, long) +MAKE_INTEGER_PARSER(unsigned long, ulong) +#if RE2_HAVE_LONGLONG +MAKE_INTEGER_PARSER(long long, longlong) +MAKE_INTEGER_PARSER(unsigned long long, ulonglong) +#endif + +#undef MAKE_INTEGER_PARSER + +} // namespace re2 + +using 
re2::RE2; + +#endif /* RE2_RE2_H */ diff --git a/benches/src/ffi/re2.rs b/benches/src/ffi/re2.rs new file mode 100644 index 0000000000..c608dae085 --- /dev/null +++ b/benches/src/ffi/re2.rs @@ -0,0 +1,155 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![allow(non_camel_case_types)] + +use libc::{c_uchar, c_int, c_void}; + +/// Regex wraps an RE2 regular expression. +/// +/// It cannot be used safely from multiple threads simultaneously. +pub struct Regex { + re: *mut re2_regexp, +} + +unsafe impl Send for Regex {} + +impl Drop for Regex { + fn drop(&mut self) { + unsafe { re2_regexp_free(self.re); } + } +} + +#[derive(Debug)] +pub struct Error(()); + +impl Regex { + pub fn new(pattern: &str) -> Result { + unsafe { Ok(Regex { re: re2_regexp_new(pattern.into()) }) } + } + + pub fn is_match(&self, text: &str) -> bool { + unsafe { + re2_regexp_match(self.re, text.into(), 0, text.len() as c_int) + } + } + + pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { + FindMatches { + re: self, + text: text, + last_end: 0, + last_match: None, + } + } + + fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { + let (mut s, mut e): (c_int, c_int) = (0, 0); + let matched = unsafe { + re2_regexp_find( + self.re, + text.into(), + start as c_int, + text.len() as c_int, + &mut s, + &mut e, + ) + }; + if matched { + Some((s as usize, e as usize)) + } else { + None + } + } +} + +pub struct FindMatches<'r, 't> { + re: &'r Regex, + text: &'t str, + last_end: usize, + last_match: Option, +} + +// This implementation is identical to the one Rust uses, since both Rust's +// regex engine and RE2 handle empty matches in the same way. 
+impl<'r, 't> Iterator for FindMatches<'r, 't> { + type Item = (usize, usize); + + fn next(&mut self) -> Option<(usize, usize)> { + fn next_after_empty(text: &str, i: usize) -> usize { + let b = text.as_bytes()[i]; + let inc = if b <= 0x7F { + 1 + } else if b <= 0b110_11111 { + 2 + } else if b <= 0b1110_1111 { + 3 + } else { + 4 + }; + i + inc + } + + if self.last_end > self.text.len() { + return None; + } + let (s, e) = match self.re.find_at(self.text, self.last_end) { + None => return None, + Some((s, e)) => (s, e), + }; + assert!(s >= self.last_end); + if e == s && Some(self.last_end) == self.last_match { + if self.last_end >= self.text.len() { + return None; + } + self.last_end = next_after_empty(self.text, self.last_end); + return self.next(); + } + self.last_end = e; + self.last_match = Some(self.last_end); + Some((s, e)) + } +} + +// RE2 FFI is below. Note that this uses a hand-rolled C API that is defined +// in re2.cpp. + +type re2_regexp = c_void; + +#[repr(C)] +struct re2_string { + text: *const c_uchar, + len: c_int, +} + +impl<'a> From<&'a str> for re2_string { + fn from(s: &'a str) -> re2_string { + re2_string { text: s.as_ptr(), len: s.len() as c_int } + } +} + +extern { + fn re2_regexp_new(pat: re2_string) -> *mut re2_regexp; + fn re2_regexp_free(re: *mut re2_regexp); + fn re2_regexp_match( + re: *mut re2_regexp, + text: re2_string, + startpos: c_int, + endpos: c_int, + ) -> bool; + fn re2_regexp_find( + re: *mut re2_regexp, + text: re2_string, + startpos: c_int, + endpos: c_int, + match_start: *mut c_int, + match_end: *mut c_int, + ) -> bool; +} diff --git a/benches/src/ffi/tcl.rs b/benches/src/ffi/tcl.rs new file mode 100644 index 0000000000..706678c243 --- /dev/null +++ b/benches/src/ffi/tcl.rs @@ -0,0 +1,241 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. 
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![allow(non_camel_case_types)] + +use std::mem; +use std::ptr; +use std::sync::{Once, ONCE_INIT}; + +use libc::{c_char, c_int, c_long, c_void}; + +// Used to initialize the TCL interpreter exactly once. +static ONCE: Once = ONCE_INIT; + +/// Text is a TCL string object backed by a Rust string. +/// +/// This is a special type that is created once per benchmark and is not +/// included in timings. In particular, all regex searches execute on values +/// of this type, so we're careful to avoid the overhead of creating such +/// objects on every search. +pub struct Text { + s: String, + obj: *mut tcl_obj, +} + +// TCL's objects are ref-counted in a thread-unsafe manner, which would +// normally disqualify a Send bound. However, we don't permit Text to be used +// in a way that can lead to unsafety. In particular, the ref count is always +// 1, until it is dropped, in which the ref count is decreased to zero and +// the underlying memory is freed. +unsafe impl Send for Text {} + +impl Drop for Text { + fn drop(&mut self) { + unsafe { + assert_eq!((*self.obj).ref_count, 1); + // This will drop the ref count to 0 and cause it to be freed. + (*self.obj).decr_ref_count(); + } + } +} + +impl Text { + pub fn new(text: String) -> Text { + let ptr = text.as_ptr() as *const c_char; + let len = text.len() as c_int; + let obj = unsafe { Tcl_NewStringObj(ptr, len) }; + unsafe { + (*obj).incr_ref_count(); + } + Text { s: text, obj: obj } + } + + pub fn len(&self) -> usize { + self.s.len() + } +} + +/// Regex wraps a TCL regex. It owns a TCL string object and a pointer to a +/// regexp object. The two share storage. +/// +/// There's no Drop impl for Regex because the memory for the regex will be +/// freed when `pat` is dropped. 
+pub struct Regex { + pat: Text, + re: *mut tcl_regexp, +} + +unsafe impl Send for Regex {} + +#[derive(Debug)] +pub struct Error(()); + +impl Regex { + pub fn new(pattern: &str) -> Result { + ONCE.call_once(|| { + unsafe { Tcl_CreateInterp(); } + }); + + let pat = Text::new(pattern.to_owned()); + let re = unsafe { + Tcl_GetRegExpFromObj(ptr::null_mut(), pat.obj, TCL_REG_ADVANCED) + }; + if re.is_null() { + return Err(Error(())); + } + Ok(Regex { + pat: pat, + re: re, + }) + } + + pub fn is_match(&self, text: &Text) -> bool { + let result = unsafe { Tcl_RegExpExecObj( + ptr::null_mut(), + self.re, + text.obj, + 0, + 1, + 0, + ) }; + if result == -1 { + panic!("Tcl_RegExpExecObj failed"); + } + result > 0 + } + + pub fn find_iter<'r, 't>(&'r self, text: &'t Text) -> FindMatches<'r, 't> { + FindMatches { + re: self, + text: text, + last_match: 0, + } + } + + fn find_at(&self, text: &Text, start: usize) -> Option<(usize, usize)> { + let result = unsafe { Tcl_RegExpExecObj( + ptr::null_mut(), + self.re, + text.obj, + start as c_int, + 1, + 0, + ) }; + if result == -1 { + panic!("Tcl_RegExpExecObj failed"); + } else if result == 0 { + return None; + } + let mut info: tcl_regexp_info = unsafe { mem::zeroed() }; + unsafe { + Tcl_RegExpGetInfo(self.re, &mut info); + let s = start as c_long + (*info.matches).start; + let e = start as c_long + (*info.matches).end; + Some((s as usize, e as usize)) + } + } +} + +pub struct FindMatches<'r, 't> { + re: &'r Regex, + text: &'t Text, + last_match: usize, +} + +impl<'r, 't> Iterator for FindMatches<'r, 't> { + type Item = (usize, usize); + + fn next(&mut self) -> Option<(usize, usize)> { + match self.re.find_at(self.text, self.last_match) { + None => None, + Some((s, e)) => { + self.last_match = e; + Some((s, e)) + } + } + } +} + +// TCL's FFI. We only wrap the bits we need. 
+ +const TCL_REG_ADVANCED: c_int = 3; + +type tcl_interp = c_void; +type tcl_regexp = c_void; + +#[repr(C)] +struct tcl_obj { + ref_count: c_int, + // There are more fields, but we don't care about them. + // We're careful to only access ref_count so we can increment/decrement it. + // This is necessary because Tcl_IncRefCount and Tcl_DecrRefCount are + // macros. +} + +impl tcl_obj { + unsafe fn incr_ref_count(&mut self) { + self.ref_count += 1; + } + + unsafe fn decr_ref_count(&mut self) { + self.ref_count -= 1; + if self.ref_count <= 0 { + TclFreeObj(self); + } + } +} + +#[repr(C)] +struct tcl_regexp_info { + nsubs: c_int, + matches: *mut tcl_regexp_indices, + extend_start: c_long, + reserved: c_long, +} + +#[repr(C)] +struct tcl_regexp_indices { + start: c_long, + end: c_long, +} + +extern { + fn Tcl_CreateInterp() -> *mut tcl_interp; + + fn Tcl_NewStringObj( + pat: *const c_char, + len: c_int, + ) -> *mut tcl_obj; + + fn TclFreeObj( + obj: *mut tcl_obj, + ); + + fn Tcl_GetRegExpFromObj( + int: *mut tcl_interp, + pat: *mut tcl_obj, + flags: c_int, + ) -> *mut tcl_regexp; + + fn Tcl_RegExpExecObj( + int: *mut tcl_interp, + re: *mut tcl_regexp, + text: *mut tcl_obj, + offset: c_int, + nmatches: c_int, + flags: c_int, + ) -> c_int; + + fn Tcl_RegExpGetInfo( + re: *mut tcl_regexp, + info: *mut tcl_regexp_info, + ); +} diff --git a/benches/src/main.rs b/benches/src/main.rs new file mode 100644 index 0000000000..af605124b4 --- /dev/null +++ b/benches/src/main.rs @@ -0,0 +1,157 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +extern crate docopt; +#[macro_use] +extern crate lazy_static; +extern crate libc; +#[cfg(feature = "re-pcre1")] +extern crate libpcre_sys; +extern crate memmap; +#[cfg(feature = "re-onig")] +extern crate onig; +#[cfg(any( + feature = "re-rust", + feature = "re-rust-bytes", + feature = "re-rust-plugin", + ))] +extern crate regex; +#[cfg(feature = "re-rust")] +extern crate regex_syntax; +extern crate rustc_serialize; + +use std::str; + +use docopt::Docopt; +use memmap::{Mmap, Protection}; + +mod ffi; + +const USAGE: &'static str = " +Count the number of matches of in . + +This compiles the pattern once and counts all successive non-overlapping +matches in . is memory mapped. Matching is done as if were +a single string (it is not line oriented). + +Since this tool includes compilation of the , sufficiently large +haystacks should be used to amortize the cost of compilation. (e.g., >1MB.) + +Usage: + regex-run-one [options] [onig | pcre1 | pcre2 | re2 | rust | rust-bytes | tcl] + +Options: + -h, --help Show this usage message. 
+"; + +#[derive(Debug, RustcDecodable)] +struct Args { + arg_pattern: String, + arg_file: String, + cmd_onig: bool, + cmd_pcre1: bool, + cmd_pcre2: bool, + cmd_re2: bool, + cmd_rust: bool, + cmd_rust_bytes: bool, + cmd_tcl: bool, +} + +fn main() { + let args: Args = Docopt::new(USAGE).and_then(|d| d.decode()) + .unwrap_or_else(|e| e.exit()); + + let mmap = Mmap::open_path(&args.arg_file, Protection::Read).unwrap(); + let haystack = unsafe { str::from_utf8(mmap.as_slice()).unwrap() }; + + println!("{}", args.count(&haystack)); +} + +impl Args { + fn count(&self, haystack: &str) -> usize { + let pat = &self.arg_pattern; + if self.cmd_onig { + count_onig(pat, haystack) + } else if self.cmd_pcre1 { + count_pcre1(pat, haystack) + } else if self.cmd_pcre2 { + count_pcre2(pat, haystack) + } else if self.cmd_re2 { + count_re2(pat, haystack) + } else if self.cmd_rust { + count_rust(pat, haystack) + } else if self.cmd_rust_bytes { + count_rust_bytes(pat, haystack) + } else if self.cmd_tcl { + count_tcl(pat, haystack) + } else { + panic!("unreachable") + } + } +} + +macro_rules! nada { + ($feature:expr, $name:ident) => { + #[cfg(not(feature = $feature))] + fn $name(_pat: &str, _haystack: &str) -> usize { + panic!("Support not enabled. 
Re-compile with '--features {}' \ + to enable.", $feature) + } + } +} + +nada!("re-onig", count_onig); +#[cfg(feature = "re-onig")] +fn count_onig(pat: &str, haystack: &str) -> usize { + use ffi::onig::Regex; + Regex::new(pat).unwrap().find_iter(haystack).count() +} + +nada!("re-pcre1", count_pcre1); +#[cfg(feature = "re-pcre1")] +fn count_pcre1(pat: &str, haystack: &str) -> usize { + use ffi::pcre1::Regex; + Regex::new(pat).unwrap().find_iter(haystack).count() +} + +nada!("re-pcre2", count_pcre2); +#[cfg(feature = "re-pcre2")] +fn count_pcre2(pat: &str, haystack: &str) -> usize { + use ffi::pcre2::Regex; + Regex::new(pat).unwrap().find_iter(haystack).count() +} + +nada!("re-re2", count_re2); +#[cfg(feature = "re-re2")] +fn count_re2(pat: &str, haystack: &str) -> usize { + use ffi::re2::Regex; + Regex::new(pat).unwrap().find_iter(haystack).count() +} + +nada!("re-rust", count_rust); +#[cfg(feature = "re-rust")] +fn count_rust(pat: &str, haystack: &str) -> usize { + use regex::Regex; + Regex::new(pat).unwrap().find_iter(haystack).count() +} + +nada!("re-rust-bytes", count_rust_bytes); +#[cfg(feature = "re-rust-bytes")] +fn count_rust_bytes(pat: &str, haystack: &str) -> usize { + use regex::bytes::Regex; + Regex::new(pat).unwrap().find_iter(haystack.as_bytes()).count() +} + +nada!("re-tcl", count_tcl); +#[cfg(feature = "re-tcl")] +fn count_tcl(pat: &str, haystack: &str) -> usize { + use ffi::tcl::{Regex, Text}; + Regex::new(pat).unwrap().find_iter(&Text::new(haystack.to_owned())).count() +} diff --git a/benches/src/misc.rs b/benches/src/misc.rs index 41445fc76d..27c6ab2d43 100644 --- a/benches/src/misc.rs +++ b/benches/src/misc.rs @@ -14,59 +14,11 @@ use std::iter::repeat; use test::Bencher; -use Regex; - -#[cfg(not(feature = "re-rust-bytes"))] -macro_rules! text { ($text:expr) => { $text } } -#[cfg(feature = "re-rust-bytes")] -macro_rules! text { ($text:expr) => { $text.as_bytes() } } - -macro_rules! 
bench_match { - ($name:ident, $re:expr, $text:expr) => { - #[bench] - fn $name(b: &mut Bencher) { - #![allow(unused_mut)] - use std::sync::Mutex; - - lazy_static! { - static ref RE: Mutex = Mutex::new($re); - static ref TEXT: String = $text; - }; - let mut re = RE.lock().unwrap(); - b.bytes = TEXT.len() as u64; - b.iter(|| { - if !re.is_match(text!(&TEXT)) { - panic!("expected match, got not match"); - } - }); - } - } -} - -macro_rules! bench_nomatch { - ($name:ident, $re:expr, $text:expr) => { - #[bench] - fn $name(b: &mut Bencher) { - #![allow(unused_mut)] - use std::sync::Mutex; - - lazy_static! { - static ref RE: Mutex = Mutex::new($re); - static ref TEXT: String = $text; - }; - let mut re = RE.lock().unwrap(); - b.bytes = TEXT.len() as u64; - b.iter(|| { - if re.is_match(text!(&TEXT)) { - panic!("match not expected"); - } - }); - } - } -} +use {Regex, Text}; +/* #[cfg(not(feature = "re-onig"))] -#[cfg(not(feature = "re-pcre"))] +#[cfg(not(feature = "re-pcre1"))] #[cfg(not(feature = "re-pcre2"))] #[cfg(not(feature = "re-rust-plugin"))] bench_match!(no_exponential, { @@ -74,68 +26,69 @@ bench_match!(no_exponential, { "{}{}", repeat("a?").take(100).collect::(), repeat("a").take(100).collect::()); - // We don't use the macro here since we're dynamically building the regex. 
- Regex::new(&re).unwrap() + regex!(&re) }, repeat("a").take(100).collect()); +*/ -bench_match!(literal, regex!("y"), { +bench_match!(literal, r"y", { format!("{}y", repeat("x").take(50).collect::()) }); -bench_match!(not_literal, regex!(".y"), { +bench_match!(not_literal, r".y", { format!("{}y", repeat("x").take(50).collect::()) }); -bench_match!(match_class, regex!("[abcdw]"), { +bench_match!(match_class, "[abcdw]", { format!("{}w", repeat("xxxx").take(20).collect::()) }); -bench_match!(match_class_in_range, regex!("[ac]"), { +bench_match!(match_class_in_range, "[ac]", { format!("{}c", repeat("bbbb").take(20).collect::()) }); #[cfg(not(feature = "re-rust-bytes"))] -bench_match!(match_class_unicode, regex!(r"\p{L}"), { +#[cfg(not(feature = "re-tcl"))] +bench_match!(match_class_unicode, r"\p{L}", { format!("{}a", repeat("☃5☃5").take(20).collect::()) }); -bench_nomatch!(anchored_literal_short_non_match, regex!("^zbc(d|e)"), { +bench_not_match!(anchored_literal_short_non_match, r"^zbc(d|e)", { "abcdefghijklmnopqrstuvwxyz".to_owned() }); -bench_nomatch!(anchored_literal_long_non_match, regex!("^zbc(d|e)"), { +bench_not_match!(anchored_literal_long_non_match, r"^zbc(d|e)", { repeat("abcdefghijklmnopqrstuvwxyz").take(15).collect::() }); -bench_match!(anchored_literal_short_match, regex!("^.bc(d|e)"), { +bench_match!(anchored_literal_short_match, r"^.bc(d|e)", { "abcdefghijklmnopqrstuvwxyz".to_owned() }); -bench_match!(anchored_literal_long_match, regex!("^.bc(d|e)"), { +bench_match!(anchored_literal_long_match, r"^.bc(d|e)", { repeat("abcdefghijklmnopqrstuvwxyz").take(15).collect::() }); -bench_match!(one_pass_short, regex!("^.bc(d|e)*$"), { +bench_match!(one_pass_short, r"^.bc(d|e)*$", { "abcddddddeeeededd".to_owned() }); -bench_match!(one_pass_short_not, regex!(".bc(d|e)*$"), { +bench_match!(one_pass_short_not, r".bc(d|e)*$", { "abcddddddeeeededd".to_owned() }); -bench_match!(one_pass_long_prefix, regex!("^abcdefghijklmnopqrstuvwxyz.*$"), { 
+bench_match!(one_pass_long_prefix, r"^abcdefghijklmnopqrstuvwxyz.*$", { "abcdefghijklmnopqrstuvwxyz".to_owned() }); -bench_match!(one_pass_long_prefix_not, regex!("^.bcdefghijklmnopqrstuvwxyz.*$"), { +bench_match!(one_pass_long_prefix_not, r"^.bcdefghijklmnopqrstuvwxyz.*$", { "abcdefghijklmnopqrstuvwxyz".to_owned() }); -bench_match!(long_needle1, regex!("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaab"), { +bench_match!(long_needle1, r"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaab", { repeat("a").take(100_000).collect::() + "b" }); -bench_match!(long_needle2, regex!("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbba"), { +bench_match!(long_needle2, r"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbba", { repeat("b").take(100_000).collect::() + "a" }); @@ -147,10 +100,10 @@ fn replace_all(b: &mut Bencher) { b.iter(|| re.replace_all(text, "")); } -const TXT_32: &'static str = include_str!("32.txt"); -const TXT_1K: &'static str = include_str!("1K.txt"); -const TXT_32K: &'static str = include_str!("32K.txt"); -const TXT_1MB: &'static str = include_str!("1MB.txt"); +const TXT_32: &'static str = include_str!("data/32.txt"); +const TXT_1K: &'static str = include_str!("data/1K.txt"); +const TXT_32K: &'static str = include_str!("data/32K.txt"); +const TXT_1MB: &'static str = include_str!("data/1MB.txt"); fn get_text(corpus: &str, suffix: String) -> String { let mut corpus = corpus.to_string(); @@ -158,73 +111,78 @@ fn get_text(corpus: &str, suffix: String) -> String { corpus } -fn easy0() -> Regex { - regex!("ABCDEFGHIJKLMNOPQRSTUVWXYZ$") -} - fn easy0_suffix() -> String { "ABCDEFGHIJKLMNOPQRSTUVWXYZ".to_string() } -bench_match!(easy0_32, easy0(), get_text(TXT_32, easy0_suffix())); -bench_match!(easy0_1K, easy0(), get_text(TXT_1K, easy0_suffix())); -bench_match!(easy0_32K, easy0(), get_text(TXT_32K, easy0_suffix())); -bench_match!(easy0_1MB, easy0(), get_text(TXT_1MB, easy0_suffix())); +macro_rules! 
easy0 { () => ("ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } -fn easy1() -> Regex { - regex!("A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$") -} +bench_match!(easy0_32, easy0!(), get_text(TXT_32, easy0_suffix())); +bench_match!(easy0_1K, easy0!(), get_text(TXT_1K, easy0_suffix())); +bench_match!(easy0_32K, easy0!(), get_text(TXT_32K, easy0_suffix())); +bench_match!(easy0_1MB, easy0!(), get_text(TXT_1MB, easy0_suffix())); fn easy1_suffix() -> String { "AABCCCDEEEFGGHHHIJJ".to_string() } -bench_match!(easy1_32, easy1(), get_text(TXT_32, easy1_suffix())); -bench_match!(easy1_1K, easy1(), get_text(TXT_1K, easy1_suffix())); -bench_match!(easy1_32K, easy1(), get_text(TXT_32K, easy1_suffix())); -bench_match!(easy1_1MB, easy1(), get_text(TXT_1MB, easy1_suffix())); - -fn medium() -> Regex { - regex!("[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$") +macro_rules! easy1 { + () => (r"A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$") } +bench_match!(easy1_32, easy1!(), get_text(TXT_32, easy1_suffix())); +bench_match!(easy1_1K, easy1!(), get_text(TXT_1K, easy1_suffix())); +bench_match!(easy1_32K, easy1!(), get_text(TXT_32K, easy1_suffix())); +bench_match!(easy1_1MB, easy1!(), get_text(TXT_1MB, easy1_suffix())); + fn medium_suffix() -> String { "XABCDEFGHIJKLMNOPQRSTUVWXYZ".to_string() } -bench_match!(medium_32, medium(), get_text(TXT_32, medium_suffix())); -bench_match!(medium_1K, medium(), get_text(TXT_1K, medium_suffix())); -bench_match!(medium_32K, medium(), get_text(TXT_32K, medium_suffix())); -bench_match!(medium_1MB, medium(), get_text(TXT_1MB, medium_suffix())); +macro_rules! 
medium { () => (r"[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } -fn hard() -> Regex { - regex!("[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$") -} +bench_match!(medium_32, medium!(), get_text(TXT_32, medium_suffix())); +bench_match!(medium_1K, medium!(), get_text(TXT_1K, medium_suffix())); +bench_match!(medium_32K, medium!(), get_text(TXT_32K, medium_suffix())); +bench_match!(medium_1MB, medium!(), get_text(TXT_1MB, medium_suffix())); fn hard_suffix() -> String { "ABCDEFGHIJKLMNOPQRSTUVWXYZ".to_string() } -bench_match!(hard_32, hard(), get_text(TXT_32, hard_suffix())); -bench_match!(hard_1K, hard(), get_text(TXT_1K, hard_suffix())); -bench_match!(hard_32K, hard(), get_text(TXT_32K, hard_suffix())); -bench_match!(hard_1MB, hard(), get_text(TXT_1MB, hard_suffix())); +macro_rules! hard { () => (r"[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } -fn reallyhard() -> Regex { - // The point of this being "really" hard is that it should completely - // thwart any prefix or suffix literal optimizations. - regex!("[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ.*") -} +bench_match!(hard_32, hard!(), get_text(TXT_32, hard_suffix())); +bench_match!(hard_1K, hard!(), get_text(TXT_1K, hard_suffix())); +bench_match!(hard_32K, hard!(), get_text(TXT_32K, hard_suffix())); +bench_match!(hard_1MB, hard!(), get_text(TXT_1MB, hard_suffix())); fn reallyhard_suffix() -> String { "ABCDEFGHIJKLMNOPQRSTUVWXYZ".to_string() } -bench_match!(reallyhard_32, reallyhard(), +macro_rules! reallyhard { + () => { + // The point of this being "really" hard is that it should completely + // thwart any prefix or suffix literal optimizations. 
+ r"[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ.*" + } +} + +bench_match!(reallyhard_32, reallyhard!(), get_text(TXT_32, reallyhard_suffix())); -bench_match!(reallyhard_1K, reallyhard(), +bench_match!(reallyhard_1K, reallyhard!(), get_text(TXT_1K, reallyhard_suffix())); -bench_match!(reallyhard_32K, reallyhard(), +bench_match!(reallyhard_32K, reallyhard!(), get_text(TXT_32K, reallyhard_suffix())); -bench_match!(reallyhard_1MB, reallyhard(), +bench_match!(reallyhard_1MB, reallyhard!(), get_text(TXT_1MB, reallyhard_suffix())); + +fn reallyhard2_suffix() -> String { + "Sherlock Holmes".to_string() +} + +macro_rules! reallyhard2 { () => (r"\w+\s+Holmes") } + +bench_match!(reallyhard2_1K, reallyhard2!(), + get_text(TXT_1K, reallyhard2_suffix())); diff --git a/benches/src/sherlock.rs b/benches/src/sherlock.rs index e84e6911a3..8c704ae07d 100644 --- a/benches/src/sherlock.rs +++ b/benches/src/sherlock.rs @@ -10,180 +10,167 @@ use test::Bencher; -use Regex; +use {Regex, Text}; -#[cfg(not(feature = "re-rust-bytes"))] -lazy_static! { - static ref SHERLOCK: String = { - include_str!("the-adventures-of-sherlock-holmes.txt").to_owned() - }; -} - -#[cfg(feature = "re-rust-bytes")] -lazy_static! { - static ref SHERLOCK: Vec = { - include_bytes!("the-adventures-of-sherlock-holmes.txt")[..].to_owned() - }; -} -macro_rules! bench_find { - ($name:ident, $re:expr, $count:expr) => { - #[bench] - fn $name(b: &mut Bencher) { - #![allow(unused_mut)] - - use std::sync::Mutex; - - lazy_static! { - static ref RE: Mutex = Mutex::new($re); - }; - let mut re = RE.lock().unwrap(); - b.bytes = SHERLOCK.len() as u64; - b.iter(|| { - let count = re.find_iter(&SHERLOCK).count(); - assert_eq!($count, count) - }); - } +// USAGE: sherlock!(name, pattern, count) +// +// This is same as bench_find, except it always uses the Sherlock haystack. +macro_rules! 
sherlock { + ($name:ident, $pattern:expr, $count:expr) => { + bench_find!( + $name, $pattern, $count, + include_str!("data/sherlock.txt").to_owned() + ); } } // These patterns are all single string literals that compile down to a variant // of Boyer-Moore w/ memchr. This also demonstrates the impact that the // frequency of a match has on performance. -bench_find!(name_sherlock, regex!("Sherlock"), 97); -bench_find!(name_holmes, regex!("Holmes"), 461); -bench_find!(name_sherlock_holmes, regex!("Sherlock Holmes"), 91); +sherlock!(name_sherlock, r"Sherlock", 97); +sherlock!(name_holmes, r"Holmes", 461); +sherlock!(name_sherlock_holmes, r"Sherlock Holmes", 91); // Like the above, except case insensitively. The prefix detector will extract // multiple *cut* prefix literals for each of the following before hitting its // limit. All of these should be able to use either memchr2 or memchr3. -bench_find!(name_sherlock_nocase, regex!("(?i)Sherlock"), 102); -bench_find!(name_holmes_nocase, regex!("(?i)Holmes"), 467); -bench_find!(name_sherlock_holmes_nocase, regex!("(?i)Sherlock Holmes"), 96); +sherlock!(name_sherlock_nocase, r"(?i)Sherlock", 102); +sherlock!(name_holmes_nocase, r"(?i)Holmes", 467); +sherlock!(name_sherlock_holmes_nocase, r"(?i)Sherlock Holmes", 96); // Will quickly find instances of 'Sherlock', but then needs to fall back to // the lazy DFA to process the Unicode aware `\s`. -bench_find!(name_whitespace, regex!(r"Sherlock\s+Holmes"), 97); +sherlock!(name_whitespace, r"Sherlock\s+Holmes", 97); // Now try more variations on name matching. // This one has two alternates that both start with 'S'. This should compile // to an Aho-Corasick automaton that uses memchr. Never enters lazy DFA. -bench_find!(name_alt1, regex!("Sherlock|Street"), 158); +sherlock!(name_alt1, r"Sherlock|Street", 158); // This one doesn't have a common byte, but should still use Aho-Corasick and // memchr2. // Never enters lazy DFA. 
-bench_find!(name_alt2, regex!("Sherlock|Holmes"), 558); +sherlock!(name_alt2, r"Sherlock|Holmes", 558); // Still using Aho-Corasick, but more patterns. Never enters lazy DFA but // also can't use any memchr variant. -bench_find!( - name_alt3, - regex!("Sherlock|Holmes|Watson|Irene|Adler|John|Baker"), 740); +sherlock!(name_alt3, r"Sherlock|Holmes|Watson|Irene|Adler|John|Baker", 740); // Still using Aho-Corasick, but needs the lazy DFA. -bench_find!( +sherlock!( name_alt3_nocase, - regex!("(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker"), 753); + r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker", + 753); // Should still use Aho-Corasick for the prefixes in each alternate, but // we need to use the lazy DFA to complete it. -bench_find!(name_alt4, regex!("Sher[a-z]+|Hol[a-z]+"), 582); -bench_find!(name_alt4_nocase, regex!("(?i)Sher[a-z]+|Hol[a-z]+"), 697); +sherlock!(name_alt4, r"Sher[a-z]+|Hol[a-z]+", 582); +sherlock!(name_alt4_nocase, r"(?i)Sher[a-z]+|Hol[a-z]+", 697); // Uses Aho-Corasick, but can use memchr3 (unlike name_alt3). -bench_find!(name_alt5, regex!("Sherlock|Holmes|Watson"), 639); -bench_find!(name_alt5_nocase, regex!("(?i)Sherlock|Holmes|Watson"), 650); +sherlock!(name_alt5, r"Sherlock|Holmes|Watson", 639); +sherlock!(name_alt5_nocase, r"(?i)Sherlock|Holmes|Watson", 650); // How long does it take to discover that there's no match? In the first two // cases, we detect the rarest byte in the literal to run memchr on. In the // first, it's 'z' and in the second it's 'j'. The third case only has common // letters, and is therefore slower. -bench_find!(no_match_uncommon, regex!("zqj"), 0); -bench_find!(no_match_common, regex!("aqj"), 0); -bench_find!(no_match_really_common, regex!("aei"), 0); +sherlock!(no_match_uncommon, r"zqj", 0); +sherlock!(no_match_common, r"aqj", 0); +sherlock!(no_match_really_common, r"aei", 0); // Various twiddling on very common words. This tends to stress the constant // overhead of actually reporting a match. 
(None of these actually enter any // matching engines.) -bench_find!(the_lower, regex!("the"), 7218); -bench_find!(the_upper, regex!("The"), 741); -bench_find!(the_nocase, regex!("(?i)the"), 7987); +sherlock!(the_lower, r"the", 7218); +sherlock!(the_upper, r"The", 741); +sherlock!(the_nocase, r"(?i)the", 7987); // Process whitespace after a very common word. // Uses Boyer-Moore to find `the` and the lazy DFA for the rest. -bench_find!(the_whitespace, regex!(r"the\s+\w+"), 5410); +sherlock!(the_whitespace, r"the\s+\w+", 5410); // How fast can we match everything? This essentially defeats any clever prefix // tricks and just executes the DFA across the entire input. -#[cfg(not(feature = "re-pcre"))] +#[cfg(not(feature = "re-pcre1"))] #[cfg(not(feature = "re-pcre2"))] -bench_find!(everything_greedy, regex!(".*"), 13053); +#[cfg(not(feature = "re-tcl"))] +sherlock!(everything_greedy, r".*", 13053); #[cfg(not(feature = "re-onig"))] -#[cfg(not(feature = "re-pcre"))] +#[cfg(not(feature = "re-pcre1"))] #[cfg(not(feature = "re-pcre2"))] -bench_find!(everything_greedy_nl, regex!("(?s).*"), 1); +#[cfg(not(feature = "re-tcl"))] +sherlock!(everything_greedy_nl, r"(?s).*", 1); // How fast can we match every letter? This also defeats any clever prefix -// tricks. Weird. Looks like PCRE2 diverges. Not clear who is right... +// tricks. 
+#[cfg(not(feature = "re-tcl"))] #[cfg(not(feature = "re-rust-bytes"))] -bench_find!(letters, regex!(r"\p{L}"), 447160); +sherlock!(letters, r"\p{L}", 447160); +#[cfg(not(feature = "re-tcl"))] #[cfg(feature = "re-rust-bytes")] -bench_find!(letters, regex!(r"(?u)\p{L}"), 447160); +sherlock!(letters, r"(?u)\p{L}", 447160); +#[cfg(not(feature = "re-tcl"))] #[cfg(not(feature = "re-rust-bytes"))] -bench_find!(letters_upper, regex!(r"\p{Lu}"), 14180); +sherlock!(letters_upper, r"\p{Lu}", 14180); +#[cfg(not(feature = "re-tcl"))] #[cfg(feature = "re-rust-bytes")] -bench_find!(letters_upper, regex!(r"(?u)\p{Lu}"), 14180); +sherlock!(letters_upper, r"(?u)\p{Lu}", 14180); +#[cfg(not(feature = "re-tcl"))] #[cfg(not(feature = "re-rust-bytes"))] -bench_find!(letters_lower, regex!(r"\p{Ll}"), 432980); +sherlock!(letters_lower, r"\p{Ll}", 432980); +#[cfg(not(feature = "re-tcl"))] #[cfg(feature = "re-rust-bytes")] -bench_find!(letters_lower, regex!(r"(?u)\p{Ll}"), 432980); +sherlock!(letters_lower, r"(?u)\p{Ll}", 432980); // Similarly, for words. #[cfg(not(feature = "re-rust-bytes"))] -bench_find!(words, regex!(r"\w+"), 109214); +#[cfg(not(feature = "re-re2"))] +sherlock!(words, r"\w+", 109214); #[cfg(feature = "re-rust-bytes")] -bench_find!(words, regex!(r"(?u)\w+"), 109214); +sherlock!(words, r"(?u)\w+", 109214); +#[cfg(feature = "re-re2")] +sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here? // Find complete words before Holmes. The `\w` defeats any prefix -// optimizations, but 'Holmes' triggers the reverse suffix optimization. -bench_find!(before_holmes, regex!(r"\w+\s+Holmes"), 319); +// optimizations. +sherlock!(before_holmes, r"\w+\s+Holmes", 319); // Find Holmes co-occuring with Watson in a particular window of characters. // This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for // the rest. 
-bench_find!( - holmes_cochar_watson, - regex!(r"Holmes.{0,25}Watson|Watson.{0,25}Holmes"), 7); +sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7); // Find Holmes co-occuring with Watson in a particular window of words. // This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for // the rest. #[cfg(not(feature = "re-onig"))] -#[cfg(not(feature = "re-pcre"))] +#[cfg(not(feature = "re-pcre1"))] #[cfg(not(feature = "re-pcre2"))] -bench_find!( +#[cfg(not(feature = "re-tcl"))] +sherlock!( holmes_coword_watson, - regex!(r"Holmes(?:\s*.+\s*){0,10}Watson|Watson(?:\s*.+\s*){0,10}Holmes"), + r"Holmes(?:\s*.+\s*){0,10}Watson|Watson(?:\s*.+\s*){0,10}Holmes", 51); // Find some subset of quotes in the text. // This does detect the `"` or `'` prefix literal and does a quick scan for // either byte before starting the lazy DFA. -bench_find!(quotes, regex!(r#"["'][^"']{0,30}[?!.]["']"#), 767); +sherlock!(quotes, r#"["'][^"']{0,30}[?!.]["']"#, 767); // Finds all occurrences of Sherlock Holmes at the beginning or end of a line. // The empty assertions defeat any detection of prefix literals, so it's the // lazy DFA the entire way. -bench_find!( +sherlock!( line_boundary_sherlock_holmes, - regex!(r"(?m)^Sherlock Holmes|Sherlock Holmes$"), 34); + r"(?m)^Sherlock Holmes|Sherlock Holmes$", + 34); -// All words ending in `n`. -// This uses word boundaries, which the lazy DFA cannot handle. Since the word -// boundary also defeats finding any literal prefixes, we have to use the -// NFA algorithm the whole way, which is quite slow. -// -// Unless we're using bytes::Regex, which will use an ASCII word boundary, -// which the DFA can indeed handle. -bench_find!(word_ending_n, regex!(r"\b\w+n\b"), 8366); +// All words ending in `n`. This uses Unicode word boundaries, which the DFA +// can speculatively handle. Since this benchmark is on mostly ASCII text, it +// performs well here. 
A different benchmark with non-Western text would be +// more revealing since the search would be forced to fall back to an NFA +// simulation. +#[cfg(not(feature = "re-tcl"))] +sherlock!(word_ending_n, r"\b\w+n\b", 8366); // This is a real bad one for Rust's engine. This particular expression // fills the state cache quite frequently, which results in a lot of churn. @@ -192,11 +179,15 @@ bench_find!(word_ending_n, regex!(r"\b\w+n\b"), 8366); // // Its only salvation is that the DFA realizes it's executing slowly, gives up // quickly and falls back to the NFA algorithm. -bench_find!(repeated_class_negation, regex!(r"[a-q][^u-z]{13}x"), 142); +// +// RE2 seems to do a worse job at this than Rust. So much so that it's slow +// enough to be annoying, so we disable it. +#[cfg(not(feature = "re-re2"))] +sherlock!(repeated_class_negation, r"[a-q][^u-z]{13}x", 142); // This defeats any prefix optimizations but triggers the reverse suffix // optimization. -bench_find!(ing_suffix, regex!(r"[a-zA-Z]+ing"), 2824); +sherlock!(ing_suffix, r"[a-zA-Z]+ing", 2824); // Similar to ing_suffix, but a little more complex by limiting the length // of the word and making sure it's surrounded by whitespace. The trailing @@ -208,4 +199,4 @@ bench_find!(ing_suffix, regex!(r"[a-zA-Z]+ing"), 2824); // Interestingly, this is slower in the rust-bytes benchmark, presumably // because scanning for one of the bytes in the Unicode *unaware* `\s` ends // up being slower than avoiding the prefix scan at all. 
-bench_find!(ing_suffix_limited_space, regex!(r"\s[a-zA-Z]{0,12}ing\s"), 2081);
+sherlock!(ing_suffix_limited_space, r"\s[a-zA-Z]{0,12}ing\s", 2081);
diff --git a/run-bench b/run-bench
deleted file mode 100755
index c3880a7fbf..0000000000
--- a/run-bench
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-
-usage() {
-    echo "Usage: $(basename $0) [rust | rust-bytes | rust-plugin | pcre | onig]" >&2
-    exit 1
-}
-
-if [ $# = 0 ] || [ $1 = '-h' ] || [ $1 = '--help' ]; then
-    usage
-fi
-
-which="$1"
-shift
-case $which in
-    rust)
-        exec cargo bench \
-            --manifest-path benches/Cargo.toml \
-            --bench rust \
-            --features re-rust \
-            "$@"
-        ;;
-    rust-bytes)
-        exec cargo bench \
-            --manifest-path benches/Cargo.toml \
-            --bench rust-bytes \
-            --features re-rust-bytes \
-            "$@"
-        ;;
-    rust-plugin)
-        exec cargo bench \
-            --manifest-path benches/Cargo.toml \
-            --bench rust-plugin \
-            --features re-rust-plugin \
-            "$@"
-        ;;
-    pcre)
-        exec cargo bench \
-            --manifest-path benches/Cargo.toml \
-            --bench pcre \
-            --features re-pcre \
-            "$@"
-        ;;
-    pcre2)
-        exec cargo bench \
-            --manifest-path benches/Cargo.toml \
-            --bench pcre2 \
-            --features re-pcre2 \
-            "$@"
-        ;;
-    onig|oniguruma)
-        exec cargo bench \
-            --manifest-path benches/Cargo.toml \
-            --bench onig \
-            --features re-onig \
-            "$@"
-        ;;
-    *)
-        usage
-        ;;
-esac
diff --git a/src/exec.rs b/src/exec.rs
index 37db773eb9..46d885c22f 100644
--- a/src/exec.rs
+++ b/src/exec.rs
@@ -253,7 +253,28 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> {
     fn slots_len(&self) -> usize { self.0.slots_len() }
 
     fn next_after_empty(&self, text: &str, i: usize) -> usize {
-        i + text[i..].chars().next().unwrap().len_utf8()
+        // Return the byte offset just past the UTF-8 sequence that starts at
+        // `i`. The sequence length is read off the leading byte instead of
+        // decoding the codepoint. The code this replaces sliced `text[i..]`,
+        // so `i` is assumed to sit on a char boundary and the byte at `i` is
+        // never a continuation byte.
+        let b = match text.as_bytes().get(i) {
+            // An empty match at the very end of the text has no following
+            // character. Step one past the end instead of panicking on an
+            // out-of-bounds index.
+            None => return i + 1,
+            Some(&b) => b,
+        };
+        let inc = if b <= 0x7F {
+            1 // 0xxxxxxx: ASCII
+        } else if b <= 0b110_11111 {
+            2 // 110xxxxx: two-byte lead
+        } else if b <= 0b1110_1111 {
+            3 // 1110xxxx: three-byte lead
+        } else {
+            4 // 11110xxx: four-byte lead
+        };
+        i + inc
     }
 
     #[inline(always)] // reduces constant overhead