diff --git a/.travis.yml b/.travis.yml index c3d5321e0c..f1a1d71c0f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,3 +26,7 @@ env: notifications: email: on_success: never +branches: + only: + - master + - auto diff --git a/Cargo.toml b/Cargo.toml index a77c1495a0..2745670ace 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ authors = ["The Rust Project Developers"] license = "MIT/Apache-2.0" readme = "README.md" repository = "https://github.com/rust-lang/regex" -documentation = "https://doc.rust-lang.org/regex" +documentation = "https://docs.rs/regex" homepage = "https://github.com/rust-lang/regex" description = """ An implementation of regular expressions for Rust. This implementation uses @@ -17,6 +17,9 @@ categories = ["text-processing"] travis-ci = { repository = "rust-lang/regex" } appveyor = { repository = "rust-lang-libs/regex" } +[workspace] +members = ["bench", "regex-capi", "regex-debug", "regex-syntax"] + [dependencies] # For very fast prefix literal matching. aho-corasick = "0.6.0" @@ -27,7 +30,7 @@ thread_local = "0.3.2" # For parsing regular expressions. regex-syntax = { path = "regex-syntax", version = "0.4.1" } # For accelerating text search. -simd = { version = "0.1.1", optional = true } +simd = { version = "0.2.1", optional = true } # For compiling UTF-8 decoding into automata. utf8-ranges = "1.0.0" @@ -35,9 +38,9 @@ utf8-ranges = "1.0.0" # For examples. lazy_static = "1" # For property based tests. -quickcheck = { version = "0.5", default-features = false } +quickcheck = { version = "0.6", default-features = false } # For generating random test data. -rand = "0.3.15" +rand = "0.4" [features] # Enable to use the unstable pattern traits defined in std. 
@@ -94,5 +97,11 @@ name = "backtrack-utf8bytes" path = "tests/test_backtrack_bytes.rs" name = "backtrack-bytes" +[profile.release] +debug = true + +[profile.bench] +debug = true + [profile.test] debug = true diff --git a/HACKING.md b/HACKING.md index 9556de6ecc..b216d0d1fc 100644 --- a/HACKING.md +++ b/HACKING.md @@ -185,37 +185,36 @@ A regular expression program is essentially a sequence of opcodes produced by the compiler plus various facts about the regular expression (such as whether it is anchored, its capture names, etc.). -### The regex! macro (or why `regex::internal` exists) - -The `regex!` macro is defined in the `regex_macros` crate as a compiler plugin, -which is maintained in this repository. The `regex!` macro compiles a regular -expression at compile time into specialized Rust code. - -The `regex!` macro was written when this library was first conceived and -unfortunately hasn't changed much since then. In particular, it encodes the -entire Pike VM into stack allocated space (no heap allocation is done). When -`regex!` was first written, this provided a substantial speed boost over -so-called "dynamic" regexes compiled at runtime, and in particular had much -lower overhead per match. This was because the only matching engine at the -time was the Pike VM. The addition of other matching engines has inverted -the relationship; the `regex!` macro is almost never faster than the dynamic -variant. (In fact, it is typically substantially slower.) - -In order to build the `regex!` macro this way, it must have access to some -internals of the regex library, which is in a distinct crate. (Compiler plugins -must be part of a distinct crate.) Namely, it must be able to compile a regular -expression and access its opcodes. The necessary internals are exported as part -of the top-level `internal` module in the regex library, but is hidden from -public documentation. 
In order to present a uniform API between programs build -by the `regex!` macro and their dynamic analoges, the `Regex` type is an enum -whose variants are hidden from public documentation. - -In the future, the `regex!` macro should probably work more like Ragel, but -it's not clear how hard this is. In particular, the `regex!` macro should be -able to support all the features of dynamic regexes, which may be hard to do -with a Ragel-style implementation approach. (Which somewhat suggests that the -`regex!` macro may also need to grow conditional execution logic like the -dynamic variants, which seems rather grotesque.) +### The regex! macro + +The `regex!` macro no longer exists. It was developed in a bygone era as a +compiler plugin during the infancy of the regex crate. Back then, the only +matching engine in the crate was the Pike VM. The `regex!` macro was, itself, +also a Pike VM. The only advantages it offered over the dynamic Pike VM that +was built at runtime were the following: + + 1. Syntax checking was done at compile time. Your Rust program wouldn't + compile if your regex didn't compile. + 2. Reduction of overhead that was proportional to the size of the regex. + For the most part, this overhead consisted of heap allocation, which + was nearly eliminated in the compiler plugin. + +The main takeaway here is that the compiler plugin was a marginally faster +version of a slow regex engine. As the regex crate evolved, it grew other regex +engines (DFA, bounded backtracker) and sophisticated literal optimizations. +The regex macro didn't keep pace, and it therefore became (dramatically) slower +than the dynamic engines. The only reason left to use it was for the compile +time guarantee that your regex is correct. Fortunately, Clippy (the Rust lint +tool) has a lint that checks your regular expression validity, which mostly +replaces that use case. + +Additionally, the regex compiler plugin stopped receiving maintenance. Nobody +complained.
At that point, it seemed prudent to just remove it. + +Will a compiler plugin be brought back? The future is murky, but there is +definitely an opportunity there to build something that is faster than the +dynamic engines in some cases. But it will be challenging! As of now, there +are no plans to work on this. ## Testing @@ -236,7 +235,6 @@ the AT&T test suite) and code generate tests for each matching engine. The approach we use in this library is to create a Cargo.toml entry point for each matching engine we want to test. The entry points are: -* `tests/test_plugin.rs` - tests the `regex!` macro * `tests/test_default.rs` - tests `Regex::new` * `tests/test_default_bytes.rs` - tests `bytes::Regex::new` * `tests/test_nfa.rs` - tests `Regex::new`, forced to use the NFA @@ -261,18 +259,14 @@ entry points, it can take a while to compile everything. To reduce compile times slightly, try using `cargo test --test default`, which will only use the `tests/test_default.rs` entry point. -N.B. To run tests for the `regex!` macro, use: - - cargo test --manifest-path regex_macros/Cargo.toml - ## Benchmarking The benchmarking in this crate is made up of many micro-benchmarks. Currently, there are two primary sets of benchmarks: the benchmarks that were adopted -at this library's inception (in `benches/src/misc.rs`) and a newer set of +at this library's inception (in `bench/src/misc.rs`) and a newer set of benchmarks meant to test various optimizations. Specifically, the latter set -contain some analysis and are in `benches/src/sherlock.rs`. Also, the latter +contain some analysis and are in `bench/src/sherlock.rs`. Also, the latter set are all executed on the same lengthy input whereas the former benchmarks are executed on strings of varying length. @@ -284,7 +278,6 @@ separately from the main regex crate. Benchmarking follows a similarly wonky setup as tests. 
There are multiple entry points: -* `bench_rust_plugin.rs` - benchmarks the `regex!` macro * `bench_rust.rs` - benchmarks `Regex::new` * `bench_rust_bytes.rs` benchmarks `bytes::Regex::new` * `bench_pcre.rs` - benchmarks PCRE @@ -299,20 +292,20 @@ library benchmarks (especially RE2). If you're hacking on one of the matching engines and just want to see benchmarks, then all you need to run is: - $ ./run-bench rust + $ ./bench/run rust If you want to compare your results with older benchmarks, then try: - $ ./run-bench rust | tee old + $ ./bench/run rust | tee old $ ... make it faster - $ ./run-bench rust | tee new - $ cargo-benchcmp old new --improvements + $ ./bench/run rust | tee new + $ cargo benchcmp old new --improvements The `cargo-benchcmp` utility is available here: https://github.com/BurntSushi/cargo-benchcmp -The `run-bench` utility can run benchmarks for PCRE and Oniguruma too. See -`./run-bench --help`. +The `./bench/run` utility can run benchmarks for PCRE and Oniguruma too. See +`./bench/run --help`. ## Dev Docs @@ -320,7 +313,7 @@ When digging your teeth into the codebase for the first time, the crate documentation can be a great resource. By default `rustdoc` will strip out all documentation of private crate members in an effort to help consumers of the crate focus on the *interface* -without having to concern themselves with the *implimentation*. +without having to concern themselves with the *implementation*. Normally this is a great thing, but if you want to start hacking on regex internals it is not what you want. Many of the private members of this crate are well documented with rustdoc style comments, and
You can generate the private docs with: ``` -> rustdoc --crate-name docs src/lib.rs -o target/doc -L target/debug/deps --no-defaults --passes collapse-docs --passes unindent-comments +$ rustdoc --crate-name docs src/lib.rs -o target/doc -L target/debug/deps --no-defaults --passes collapse-docs --passes unindent-comments ``` Then just point your browser at `target/doc/regex/index.html`. diff --git a/PERFORMANCE.md b/PERFORMANCE.md index 6b143262d4..b4aeb89c1b 100644 --- a/PERFORMANCE.md +++ b/PERFORMANCE.md @@ -2,7 +2,7 @@ Your friendly guide to understanding the performance characteristics of this crate. This guide assumes some familiarity with the public API of this crate, which -can be found here: http://doc.rust-lang.org/regex/regex/index.html +can be found here: https://docs.rs/regex ## Theory vs. Practice diff --git a/README.md b/README.md index ebffe39d2c..b4b6501886 100644 --- a/README.md +++ b/README.md @@ -14,13 +14,13 @@ by [RE2](https://github.com/google/re2). ### Documentation -[Module documentation with examples](https://doc.rust-lang.org/regex). +[Module documentation with examples](https://docs.rs/regex). The module documentation also include a comprehensive description of the syntax supported. Documentation with examples for the various matching functions and iterators can be found on the -[`Regex` type](https://doc.rust-lang.org/regex/regex/struct.Regex.html). +[`Regex` type](https://docs.rs/regex/*/regex/struct.Regex.html). ### Usage @@ -188,37 +188,6 @@ assert!(!matches.matched(5)); assert!(matches.matched(6)); ``` -### Usage: `regex!` compiler plugin - -**WARNING**: The `regex!` compiler plugin is orders of magnitude slower than -the normal `Regex::new(...)` usage. You should not use the compiler plugin -unless you have a very special reason for doing so. The performance difference -may be the temporary, but the path forward at this point isn't clear. - -The `regex!` compiler plugin will compile your regexes at compile time. 
**This -only works with a nightly compiler.** - -Here is a small example: - -```rust -#![feature(plugin)] - -#![plugin(regex_macros)] -extern crate regex; - -fn main() { - let re = regex!(r"(\d{4})-(\d{2})-(\d{2})"); - let caps = re.captures("2010-03-14").unwrap(); - - assert_eq!("2010", caps[1]); - assert_eq!("03", caps[2]); - assert_eq!("14", caps[3]); -} -``` - -Notice that we never `unwrap` the result of `regex!`. This is because your -*program* won't compile if the regex doesn't compile. (Try `regex!("(")`.) - ### Usage: a regular expression parser @@ -228,8 +197,7 @@ execution. This may be useful if you're implementing your own regex engine or otherwise need to do analysis on the syntax of a regular expression. It is otherwise not recommended for general use. -[Documentation for `regex-syntax` with -examples](https://doc.rust-lang.org/regex/regex_syntax/index.html). +[Documentation for `regex-syntax` with examples](https://docs.rs/regex-syntax). # License diff --git a/appveyor.yml b/appveyor.yml index 0933ad8cc3..c67ab0efda 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -10,8 +10,10 @@ install: - SET PATH=%PATH%;C:\MinGW\bin - rustc -V - cargo -V - build: false - test_script: - cargo test --verbose --jobs 4 +branches: + only: + - master + - auto diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 01544d2525..045de2f868 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -5,26 +5,27 @@ version = "0.1.0" authors = ["The Rust Project Developers"] license = "MIT/Apache-2.0" repository = "https://github.com/rust-lang/regex" -documentation = "http://doc.rust-lang.org/regex/regex/index.html" +documentation = "https://docs.rs/regex" homepage = "https://github.com/rust-lang/regex" description = "Regex benchmarks for Rust's and other engines." build = "build.rs" +workspace = ".." 
[dependencies] -docopt = "0.6" -lazy_static = "0.1" +docopt = "0.8" +lazy_static = "1" libc = "0.2" -onig = { version = "1.2", optional = true } +onig = { version = "3", optional = true } libpcre-sys = { version = "0.2", optional = true } memmap = "0.2" regex = { version = "0.2.0", path = "..", features = ["simd-accel"] } -regex_macros = { version = "0.2.0", path = "../regex_macros", optional = true } regex-syntax = { version = "0.4.0", path = "../regex-syntax" } -rustc-serialize = "0.3" +serde = "1" +serde_derive = "1" [build-dependencies] -gcc = "0.3" -pkg-config = "0.3" +cc = "1" +pkg-config = "0.3.9" [[bin]] name = "regex-run-one" @@ -40,8 +41,7 @@ bench = false # Doing anything else will probably result in weird "duplicate definition" # compiler errors. # -# Tip: use the run-bench script in the root of this repository to run -# benchmarks. +# Tip: use the `bench/run` script (in this directory) to run benchmarks. [features] re-pcre1 = ["libpcre-sys"] re-pcre2 = [] @@ -49,7 +49,6 @@ re-onig = ["onig"] re-re2 = [] re-rust = [] re-rust-bytes = [] -re-rust-plugin = ["regex_macros"] re-tcl = [] [[bench]] @@ -57,12 +56,3 @@ name = "bench" path = "src/bench.rs" test = false bench = true - -[profile.release] -debug = true - -[profile.bench] -debug = true - -[profile.test] -debug = true diff --git a/bench/build.rs b/bench/build.rs index 628c2b25d6..21e5714b4a 100644 --- a/bench/build.rs +++ b/bench/build.rs @@ -8,51 +8,26 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -extern crate gcc; +extern crate cc; extern crate pkg_config; use std::env; -use std::process; - -macro_rules! we { - ($($tt:tt)*) => {{ - use std::io::Write; - writeln!(&mut ::std::io::stderr(), $($tt),*).unwrap(); - }} -} fn main() { - // We only need to look for PCRE2 and RE2 because we roll the FFI bindings - // for those libraries ourselves from scratch. 
For PCRE1 and Oniguruma, we - // rely on other crates that do something similar to the dance below for - // us. - - let wants_pcre2 = env::var("CARGO_FEATURE_RE_PCRE2").is_ok(); - let has_pcre2 = pkg_config::Config::new().find("libpcre2-8").is_ok(); - if wants_pcre2 && !has_pcre2 { - we!("pcre2 cannot be found by pkg-config"); - process::exit(1); + if env::var("CARGO_FEATURE_RE_PCRE2").is_ok() { + pkg_config::probe_library("libpcre2-8").unwrap(); } - - let wants_re2 = env::var("CARGO_FEATURE_RE_RE2").is_ok(); - let has_re2 = pkg_config::Config::new().find("re2").is_ok(); - if wants_re2 { - if !has_re2 { - we!("re2 cannot be found by pkg-config"); - process::exit(1); - } - gcc::Config::new() + if env::var("CARGO_FEATURE_RE_RE2").is_ok() { + // RE2 is a C++ library, so we need to compile our shim layer. + cc::Build::new() .cpp(true) - .flag("-std=c++11") .file("src/ffi/re2.cpp") .compile("libcre2.a"); - println!("cargo:rustc-link-lib=re2"); + // It's important this comes after compiling the shim, which results + // in the correct order of arguments given to the linker. 
+ pkg_config::probe_library("re2").unwrap(); } - - let wants_tcl = env::var("CARGO_FEATURE_RE_TCL").is_ok(); - let has_tcl = pkg_config::Config::new().find("tcl").is_ok(); - if wants_tcl && !has_tcl { - we!("tcl cannot be found by pkg-config"); - process::exit(1); + if env::var("CARGO_FEATURE_RE_TCL").is_ok() { + pkg_config::probe_library("tcl").unwrap(); } } diff --git a/bench/run b/bench/run index 1d0321c1db..1147f4bb74 100755 --- a/bench/run +++ b/bench/run @@ -1,7 +1,7 @@ #!/bin/bash usage() { - echo "Usage: $(basename $0) [rust | rust-bytes | rust-plugin | pcre1 | pcre2 | re2 | onig | tcl ]" >&2 + echo "Usage: $(basename $0) [rust | rust-bytes | pcre1 | pcre2 | re2 | onig | tcl ]" >&2 exit 1 } @@ -22,11 +22,8 @@ case $which in rust-bytes) exec cargo bench --bench bench --features re-rust-bytes "$@" ;; - rust-plugin) - exec cargo bench --bench bench --features re-rust-plugin "$@" - ;; re2) - exec cargo bench --bench bench --features re-re2 "$@" + exec cargo bench --verbose --bench bench --features re-re2 "$@" ;; pcre1) exec cargo bench --bench bench --features re-pcre1 "$@" diff --git a/bench/src/bench.rs b/bench/src/bench.rs index a45079edc0..319ea5f7a8 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -11,11 +11,6 @@ // Enable the benchmarking harness. #![feature(test)] -// If we're benchmarking the Rust regex plugin, then pull that in. -// This will bring a `regex!` macro into scope. 
-#![cfg_attr(feature = "re-rust-plugin", feature(plugin))] -#![cfg_attr(feature = "re-rust-plugin", plugin(regex_macros))] - #[macro_use] extern crate lazy_static; #[cfg(not(any(feature = "re-rust", feature = "re-rust-bytes")))] @@ -27,7 +22,6 @@ extern crate onig; #[cfg(any( feature = "re-rust", feature = "re-rust-bytes", - feature = "re-rust-plugin", ))] extern crate regex; #[cfg(feature = "re-rust")] @@ -43,7 +37,7 @@ pub use ffi::pcre1::Regex; pub use ffi::pcre2::Regex; #[cfg(feature = "re-re2")] pub use ffi::re2::Regex; -#[cfg(any(feature = "re-rust", feature = "re-rust-plugin"))] +#[cfg(feature = "re-rust")] pub use regex::Regex; #[cfg(feature = "re-rust-bytes")] pub use regex::bytes::Regex; @@ -52,14 +46,11 @@ pub use ffi::tcl::Regex; // Usage: regex!(pattern) // -// Builds a ::Regex from a borrowed string. This is used in every regex -// engine except for the Rust plugin, because the plugin itself defines the -// same macro. +// Builds a ::Regex from a borrowed string. // // Due to macro scoping rules, this definition only applies for the modules // defined below. Effectively, it allows us to use the same tests for both // native and dynamic regexes. -#[cfg(not(feature = "re-rust-plugin"))] macro_rules! regex { ($re:expr) => { ::Regex::new(&$re.to_owned()).unwrap() } } @@ -99,7 +90,6 @@ macro_rules! text { feature = "re-pcre2", feature = "re-re2", feature = "re-rust", - feature = "re-rust-plugin", ))] macro_rules! text { ($text:expr) => { $text } @@ -116,7 +106,6 @@ type Text = Vec; feature = "re-pcre2", feature = "re-re2", feature = "re-rust", - feature = "re-rust-plugin", ))] type Text = String; @@ -236,6 +225,41 @@ macro_rules! 
bench_find { } } +// USAGE: bench_captures!(name, pattern, groups, haystack); +// +// CONTRACT: +// Given: +// ident, the desired benchmarking function name +// pattern : ::Regex, the regular expression to be executed +// groups : usize, the number of capture groups +// haystack : String, the string to search +// bench_captures will benchmark how fast re.captures() produces +// the capture groups in question. +macro_rules! bench_captures { + ($name:ident, $pattern:expr, $count:expr, $haystack:expr) => { + + #[cfg(feature = "re-rust")] + #[bench] + fn $name(b: &mut Bencher) { + use std::sync::Mutex; + + lazy_static! { + static ref RE: Mutex = Mutex::new($pattern); + static ref TEXT: Mutex = Mutex::new(text!($haystack)); + }; + let re = RE.lock().unwrap(); + let text = TEXT.lock().unwrap(); + b.bytes = text.len() as u64; + b.iter(|| { + match re.captures(&text) { + None => assert!(false, "no captures"), + Some(caps) => assert_eq!($count + 1, caps.len()), + } + }); + } + } +} + mod ffi; mod misc; mod regexdna; diff --git a/bench/src/ffi/onig.rs b/bench/src/ffi/onig.rs index 7d796d6178..258059cd32 100644 --- a/bench/src/ffi/onig.rs +++ b/bench/src/ffi/onig.rs @@ -22,7 +22,12 @@ impl Regex { pub fn is_match(&self, text: &str) -> bool { // Gah. onig's is_match function is anchored, but find is not. self.0.search_with_options( - text, 0, text.len(), onig::SEARCH_OPTION_NONE, None).is_some() + text, + 0, + text.len(), + onig::SearchOptions::SEARCH_OPTION_NONE, + None, + ).is_some() } pub fn find_iter<'r, 't>( diff --git a/bench/src/ffi/re2.cpp b/bench/src/ffi/re2.cpp index cc75b87bcd..2d7109d1a9 100644 --- a/bench/src/ffi/re2.cpp +++ b/bench/src/ffi/re2.cpp @@ -1,7 +1,7 @@ #include #include -#include "re2.h" +#include "re2/re2.h" using namespace re2; diff --git a/bench/src/ffi/re2.h b/bench/src/ffi/re2.h deleted file mode 100644 index 377a04d700..0000000000 --- a/bench/src/ffi/re2.h +++ /dev/null @@ -1,934 +0,0 @@ -// Copyright 2003-2009 The RE2 Authors. 
All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef RE2_RE2_H_ -#define RE2_RE2_H_ - -// C++ interface to the re2 regular-expression library. -// RE2 supports Perl-style regular expressions (with extensions like -// \d, \w, \s, ...). -// -// ----------------------------------------------------------------------- -// REGEXP SYNTAX: -// -// This module uses the re2 library and hence supports -// its syntax for regular expressions, which is similar to Perl's with -// some of the more complicated things thrown away. In particular, -// backreferences and generalized assertions are not available, nor is \Z. -// -// See https://github.com/google/re2/wiki/Syntax for the syntax -// supported by RE2, and a comparison with PCRE and PERL regexps. -// -// For those not familiar with Perl's regular expressions, -// here are some examples of the most commonly used extensions: -// -// "hello (\\w+) world" -- \w matches a "word" character -// "version (\\d+)" -- \d matches a digit -// "hello\\s+world" -- \s matches any whitespace character -// "\\b(\\w+)\\b" -- \b matches non-empty string at word boundary -// "(?i)hello" -- (?i) turns on case-insensitive matching -// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible -// -// ----------------------------------------------------------------------- -// MATCHING INTERFACE: -// -// The "FullMatch" operation checks that supplied text matches a -// supplied pattern exactly. -// -// Example: successful match -// CHECK(RE2::FullMatch("hello", "h.*o")); -// -// Example: unsuccessful match (requires full match): -// CHECK(!RE2::FullMatch("hello", "e")); -// -// ----------------------------------------------------------------------- -// UTF-8 AND THE MATCHING INTERFACE: -// -// By default, the pattern and input text are interpreted as UTF-8. -// The RE2::Latin1 option causes them to be interpreted as Latin-1. 
-// -// Example: -// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern))); -// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1))); -// -// ----------------------------------------------------------------------- -// MATCHING WITH SUB-STRING EXTRACTION: -// -// You can supply extra pointer arguments to extract matched subpieces. -// -// Example: extracts "ruby" into "s" and 1234 into "i" -// int i; -// string s; -// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); -// -// Example: fails because string cannot be stored in integer -// CHECK(!RE2::FullMatch("ruby", "(.*)", &i)); -// -// Example: fails because there aren't enough sub-patterns: -// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s)); -// -// Example: does not try to extract any extra sub-patterns -// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s)); -// -// Example: does not try to extract into NULL -// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i)); -// -// Example: integer overflow causes failure -// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); -// -// NOTE(rsc): Asking for substrings slows successful matches quite a bit. -// This may get a little faster in the future, but right now is slower -// than PCRE. On the other hand, failed matches run *very* fast (faster -// than PCRE), as do matches without substring extraction. -// -// ----------------------------------------------------------------------- -// PARTIAL MATCHES -// -// You can use the "PartialMatch" operation when you want the pattern -// to match any substring of the text. 
-// -// Example: simple search for a string: -// CHECK(RE2::PartialMatch("hello", "ell")); -// -// Example: find first number in a string -// int number; -// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number)); -// CHECK_EQ(number, 100); -// -// ----------------------------------------------------------------------- -// PRE-COMPILED REGULAR EXPRESSIONS -// -// RE2 makes it easy to use any string as a regular expression, without -// requiring a separate compilation step. -// -// If speed is of the essence, you can create a pre-compiled "RE2" -// object from the pattern and use it multiple times. If you do so, -// you can typically parse text faster than with sscanf. -// -// Example: precompile pattern for faster matching: -// RE2 pattern("h.*o"); -// while (ReadLine(&str)) { -// if (RE2::FullMatch(str, pattern)) ...; -// } -// -// ----------------------------------------------------------------------- -// SCANNING TEXT INCREMENTALLY -// -// The "Consume" operation may be useful if you want to repeatedly -// match regular expressions at the front of a string and skip over -// them as they match. This requires use of the "StringPiece" type, -// which represents a sub-range of a real string. -// -// Example: read lines of the form "var = value" from a string. -// string contents = ...; // Fill string somehow -// StringPiece input(contents); // Wrap a StringPiece around it -// -// string var; -// int value; -// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) { -// ...; -// } -// -// Each successful call to "Consume" will set "var/value", and also -// advance "input" so it points past the matched text. Note that if the -// regular expression matches an empty string, input will advance -// by 0 bytes. If the regular expression being used might match -// an empty string, the loop body must check for this case and either -// advance the string or break out of the loop. 
-// -// The "FindAndConsume" operation is similar to "Consume" but does not -// anchor your match at the beginning of the string. For example, you -// could extract all words from a string by repeatedly calling -// RE2::FindAndConsume(&input, "(\\w+)", &word) -// -// ----------------------------------------------------------------------- -// USING VARIABLE NUMBER OF ARGUMENTS -// -// The above operations require you to know the number of arguments -// when you write the code. This is not always possible or easy (for -// example, the regular expression may be calculated at run time). -// You can use the "N" version of the operations when the number of -// match arguments are determined at run time. -// -// Example: -// const RE2::Arg* args[10]; -// int n; -// // ... populate args with pointers to RE2::Arg values ... -// // ... set n to the number of RE2::Arg objects ... -// bool match = RE2::FullMatchN(input, pattern, args, n); -// -// The last statement is equivalent to -// -// bool match = RE2::FullMatch(input, pattern, -// *args[0], *args[1], ..., *args[n - 1]); -// -// ----------------------------------------------------------------------- -// PARSING HEX/OCTAL/C-RADIX NUMBERS -// -// By default, if you pass a pointer to a numeric value, the -// corresponding text is interpreted as a base-10 number. You can -// instead wrap the pointer with a call to one of the operators Hex(), -// Octal(), or CRadix() to interpret the text in another base. The -// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16) -// prefixes, but defaults to base-10. -// -// Example: -// int a, b, c, d; -// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)", -// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)); -// will leave 64 in a, b, c, and d. 
- -#include -#include -#include -#include -#include -#include -#include - -#include "re2/stringpiece.h" - -namespace re2 { -class Prog; -class Regexp; -} // namespace re2 - -namespace re2 { - -// TODO(junyer): Get rid of this. -using std::string; - -// Interface for regular expression matching. Also corresponds to a -// pre-compiled regular expression. An "RE2" object is safe for -// concurrent use by multiple threads. -class RE2 { - public: - // We convert user-passed pointers into special Arg objects - class Arg; - class Options; - - // Defined in set.h. - class Set; - - enum ErrorCode { - NoError = 0, - - // Unexpected error - ErrorInternal, - - // Parse errors - ErrorBadEscape, // bad escape sequence - ErrorBadCharClass, // bad character class - ErrorBadCharRange, // bad character class range - ErrorMissingBracket, // missing closing ] - ErrorMissingParen, // missing closing ) - ErrorTrailingBackslash, // trailing \ at end of regexp - ErrorRepeatArgument, // repeat argument missing, e.g. "*" - ErrorRepeatSize, // bad repetition argument - ErrorRepeatOp, // bad repetition operator - ErrorBadPerlOp, // bad perl operator - ErrorBadUTF8, // invalid UTF-8 in regexp - ErrorBadNamedCapture, // bad named capture group - ErrorPatternTooLarge // pattern too large (compile failed) - }; - - // Predefined common options. - // If you need more complicated things, instantiate - // an Option class, possibly passing one of these to - // the Option constructor, change the settings, and pass that - // Option class to the RE2 constructor. - enum CannedOptions { - DefaultOptions = 0, - Latin1, // treat input as Latin-1 (default UTF-8) - POSIX, // POSIX syntax, leftmost-longest match - Quiet // do not log about regexp parse errors - }; - - // Need to have the const char* and const string& forms for implicit - // conversions when passing string literals to FullMatch and PartialMatch. - // Otherwise the StringPiece form would be sufficient. 
-#ifndef SWIG - RE2(const char* pattern); - RE2(const string& pattern); -#endif - RE2(const StringPiece& pattern); - RE2(const StringPiece& pattern, const Options& options); - ~RE2(); - - // Returns whether RE2 was created properly. - bool ok() const { return error_code() == NoError; } - - // The string specification for this RE2. E.g. - // RE2 re("ab*c?d+"); - // re.pattern(); // "ab*c?d+" - const string& pattern() const { return pattern_; } - - // If RE2 could not be created properly, returns an error string. - // Else returns the empty string. - const string& error() const { return *error_; } - - // If RE2 could not be created properly, returns an error code. - // Else returns RE2::NoError (== 0). - ErrorCode error_code() const { return error_code_; } - - // If RE2 could not be created properly, returns the offending - // portion of the regexp. - const string& error_arg() const { return error_arg_; } - - // Returns the program size, a very approximate measure of a regexp's "cost". - // Larger numbers are more expensive than smaller numbers. - int ProgramSize() const; - - // EXPERIMENTAL! SUBJECT TO CHANGE! - // Outputs the program fanout as a histogram bucketed by powers of 2. - // Returns the number of the largest non-empty bucket. - int ProgramFanout(std::map* histogram) const; - - // Returns the underlying Regexp; not for general use. - // Returns entire_regexp_ so that callers don't need - // to know about prefix_ and prefix_foldcase_. - re2::Regexp* Regexp() const { return entire_regexp_; } - - /***** The useful part: the matching interface *****/ - - // Matches "text" against "re". If pointer arguments are - // supplied, copies matched sub-patterns into them. - // - // You can pass in a "const char*" or a "string" for "text". - // You can pass in a "const char*" or a "string" or a "RE2" for "re". 
- // - // The provided pointer arguments can be pointers to any scalar numeric - // type, or one of: - // string (matched piece is copied to string) - // StringPiece (StringPiece is mutated to point to matched piece) - // T (where "bool T::ParseFrom(const char*, size_t)" exists) - // (void*)NULL (the corresponding matched sub-pattern is not copied) - // - // Returns true iff all of the following conditions are satisfied: - // a. "text" matches "re" exactly - // b. The number of matched sub-patterns is >= number of supplied pointers - // c. The "i"th argument has a suitable type for holding the - // string captured as the "i"th sub-pattern. If you pass in - // NULL for the "i"th argument, or pass fewer arguments than - // number of sub-patterns, "i"th captured sub-pattern is - // ignored. - // - // CAVEAT: An optional sub-pattern that does not exist in the - // matched string is assigned the empty string. Therefore, the - // following will return false (because the empty string is not a - // valid number): - // int number; - // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number); - static bool FullMatchN(const StringPiece& text, const RE2& re, - const Arg* const args[], int argc); - - // Exactly like FullMatch(), except that "re" is allowed to match - // a substring of "text". - static bool PartialMatchN(const StringPiece& text, const RE2& re, - const Arg* const args[], int argc); - - // Like FullMatch() and PartialMatch(), except that "re" has to match - // a prefix of the text, and "input" is advanced past the matched - // text. Note: "input" is modified iff this routine returns true. - static bool ConsumeN(StringPiece* input, const RE2& re, - const Arg* const args[], int argc); - - // Like Consume(), but does not anchor the match at the beginning of - // the text. That is, "re" need not start its match at the beginning - // of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds - // the next word in "s" and stores it in "word". 
- static bool FindAndConsumeN(StringPiece* input, const RE2& re, - const Arg* const args[], int argc); - -#ifndef SWIG - private: - template - static inline bool Apply(F f, SP sp, const RE2& re) { - return f(sp, re, NULL, 0); - } - - template - static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) { - const Arg* const args[] = {&a...}; - const int argc = sizeof...(a); - return f(sp, re, args, argc); - } - - public: - // In order to allow FullMatch() et al. to be called with a varying number - // of arguments of varying types, we use two layers of variadic templates. - // The first layer constructs the temporary Arg objects. The second layer - // (above) constructs the array of pointers to the temporary Arg objects. - - template - static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) { - return Apply(FullMatchN, text, re, Arg(std::forward(a))...); - } - - template - static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) { - return Apply(PartialMatchN, text, re, Arg(std::forward(a))...); - } - - template - static bool Consume(StringPiece* input, const RE2& re, A&&... a) { - return Apply(ConsumeN, input, re, Arg(std::forward(a))...); - } - - template - static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) { - return Apply(FindAndConsumeN, input, re, Arg(std::forward(a))...); - } -#endif - - // Replace the first match of "pattern" in "str" with "rewrite". - // Within "rewrite", backslash-escaped digits (\1 to \9) can be - // used to insert text matching corresponding parenthesized group - // from the pattern. \0 in "rewrite" refers to the entire matching - // text. E.g., - // - // string s = "yabba dabba doo"; - // CHECK(RE2::Replace(&s, "b+", "d")); - // - // will leave "s" containing "yada dabba doo" - // - // Returns true if the pattern matches and a replacement occurs, - // false otherwise. 
- static bool Replace(string *str, - const RE2& pattern, - const StringPiece& rewrite); - - // Like Replace(), except replaces successive non-overlapping occurrences - // of the pattern in the string with the rewrite. E.g. - // - // string s = "yabba dabba doo"; - // CHECK(RE2::GlobalReplace(&s, "b+", "d")); - // - // will leave "s" containing "yada dada doo" - // Replacements are not subject to re-matching. - // - // Because GlobalReplace only replaces non-overlapping matches, - // replacing "ana" within "banana" makes only one replacement, not two. - // - // Returns the number of replacements made. - static int GlobalReplace(string *str, - const RE2& pattern, - const StringPiece& rewrite); - - // Like Replace, except that if the pattern matches, "rewrite" - // is copied into "out" with substitutions. The non-matching - // portions of "text" are ignored. - // - // Returns true iff a match occurred and the extraction happened - // successfully; if no match occurs, the string is left unaffected. - // - // REQUIRES: "text" must not alias any part of "*out". - static bool Extract(const StringPiece &text, - const RE2& pattern, - const StringPiece &rewrite, - string *out); - - // Escapes all potentially meaningful regexp characters in - // 'unquoted'. The returned string, used as a regular expression, - // will exactly match the original string. For example, - // 1.5-2.0? - // may become: - // 1\.5\-2\.0\? - static string QuoteMeta(const StringPiece& unquoted); - - // Computes range for any strings matching regexp. The min and max can in - // some cases be arbitrarily precise, so the caller gets to specify the - // maximum desired length of string returned. - // - // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any - // string s that is an anchored match for this regexp satisfies - // min <= s && s <= max. 
- // - // Note that PossibleMatchRange() will only consider the first copy of an - // infinitely repeated element (i.e., any regexp element followed by a '*' or - // '+' operator). Regexps with "{N}" constructions are not affected, as those - // do not compile down to infinite repetitions. - // - // Returns true on success, false on error. - bool PossibleMatchRange(string* min, string* max, int maxlen) const; - - // Generic matching interface - - // Type of match. - enum Anchor { - UNANCHORED, // No anchoring - ANCHOR_START, // Anchor at start only - ANCHOR_BOTH // Anchor at start and end - }; - - // Return the number of capturing subpatterns, or -1 if the - // regexp wasn't valid on construction. The overall match ($0) - // does not count: if the regexp is "(a)(b)", returns 2. - int NumberOfCapturingGroups() const; - - // Return a map from names to capturing indices. - // The map records the index of the leftmost group - // with the given name. - // Only valid until the re is deleted. - const std::map& NamedCapturingGroups() const; - - // Return a map from capturing indices to names. - // The map has no entries for unnamed groups. - // Only valid until the re is deleted. - const std::map& CapturingGroupNames() const; - - // General matching routine. - // Match against text starting at offset startpos - // and stopping the search at offset endpos. - // Returns true if match found, false if not. - // On a successful match, fills in match[] (up to nmatch entries) - // with information about submatches. - // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, - // setting match[0] = "barbaz", match[1].data() = NULL, match[2] = "bar", - // match[3].data() = NULL, ..., up to match[nmatch-1].data() = NULL. - // - // Don't ask for more match information than you will use: - // runs much faster with nmatch == 1 than nmatch > 1, and - // runs even faster if nmatch == 0. 
- // Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(), - // but will be handled correctly. - // - // Passing text == StringPiece(NULL, 0) will be handled like any other - // empty string, but note that on return, it will not be possible to tell - // whether submatch i matched the empty string or did not match: - // either way, match[i].data() == NULL. - bool Match(const StringPiece& text, - size_t startpos, - size_t endpos, - Anchor anchor, - StringPiece *match, - int nmatch) const; - - // Check that the given rewrite string is suitable for use with this - // regular expression. It checks that: - // * The regular expression has enough parenthesized subexpressions - // to satisfy all of the \N tokens in rewrite - // * The rewrite string doesn't have any syntax errors. E.g., - // '\' followed by anything other than a digit or '\'. - // A true return value guarantees that Replace() and Extract() won't - // fail because of a bad rewrite string. - bool CheckRewriteString(const StringPiece& rewrite, string* error) const; - - // Returns the maximum submatch needed for the rewrite to be done by - // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2. - static int MaxSubmatch(const StringPiece& rewrite); - - // Append the "rewrite" string, with backslash subsitutions from "vec", - // to string "out". - // Returns true on success. This method can fail because of a malformed - // rewrite string. CheckRewriteString guarantees that the rewrite will - // be sucessful. 
- bool Rewrite(string *out, - const StringPiece &rewrite, - const StringPiece* vec, - int veclen) const; - - // Constructor options - class Options { - public: - // The options are (defaults in parentheses): - // - // utf8 (true) text and pattern are UTF-8; otherwise Latin-1 - // posix_syntax (false) restrict regexps to POSIX egrep syntax - // longest_match (false) search for longest match, not first match - // log_errors (true) log syntax and execution errors to ERROR - // max_mem (see below) approx. max memory footprint of RE2 - // literal (false) interpret string as literal, not regexp - // never_nl (false) never match \n, even if it is in regexp - // dot_nl (false) dot matches everything including new line - // never_capture (false) parse all parens as non-capturing - // case_sensitive (true) match is case-sensitive (regexp can override - // with (?i) unless in posix_syntax mode) - // - // The following options are only consulted when posix_syntax == true. - // (When posix_syntax == false these features are always enabled and - // cannot be turned off.) - // perl_classes (false) allow Perl's \d \s \w \D \S \W - // word_boundary (false) allow Perl's \b \B (word boundary and not) - // one_line (false) ^ and $ only match beginning and end of text - // - // The max_mem option controls how much memory can be used - // to hold the compiled form of the regexp (the Prog) and - // its cached DFA graphs. Code Search placed limits on the number - // of Prog instructions and DFA states: 10,000 for both. - // In RE2, those limits would translate to about 240 KB per Prog - // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a - // better job of keeping them small than Code Search did). - // Each RE2 has two Progs (one forward, one reverse), and each Prog - // can have two DFAs (one first match, one longest match). 
- // That makes 4 DFAs: - // - // forward, first-match - used for UNANCHORED or ANCHOR_LEFT searches - // if opt.longest_match() == false - // forward, longest-match - used for all ANCHOR_BOTH searches, - // and the other two kinds if - // opt.longest_match() == true - // reverse, first-match - never used - // reverse, longest-match - used as second phase for unanchored searches - // - // The RE2 memory budget is statically divided between the two - // Progs and then the DFAs: two thirds to the forward Prog - // and one third to the reverse Prog. The forward Prog gives half - // of what it has left over to each of its DFAs. The reverse Prog - // gives it all to its longest-match DFA. - // - // Once a DFA fills its budget, it flushes its cache and starts over. - // If this happens too often, RE2 falls back on the NFA implementation. - - // For now, make the default budget something close to Code Search. - static const int kDefaultMaxMem = 8<<20; - - enum Encoding { - EncodingUTF8 = 1, - EncodingLatin1 - }; - - Options() : - encoding_(EncodingUTF8), - posix_syntax_(false), - longest_match_(false), - log_errors_(true), - max_mem_(kDefaultMaxMem), - literal_(false), - never_nl_(false), - dot_nl_(false), - never_capture_(false), - case_sensitive_(true), - perl_classes_(false), - word_boundary_(false), - one_line_(false) { - } - - /*implicit*/ Options(CannedOptions); - - Encoding encoding() const { return encoding_; } - void set_encoding(Encoding encoding) { encoding_ = encoding; } - - // Legacy interface to encoding. - // TODO(rsc): Remove once clients have been converted. 
- bool utf8() const { return encoding_ == EncodingUTF8; } - void set_utf8(bool b) { - if (b) { - encoding_ = EncodingUTF8; - } else { - encoding_ = EncodingLatin1; - } - } - - bool posix_syntax() const { return posix_syntax_; } - void set_posix_syntax(bool b) { posix_syntax_ = b; } - - bool longest_match() const { return longest_match_; } - void set_longest_match(bool b) { longest_match_ = b; } - - bool log_errors() const { return log_errors_; } - void set_log_errors(bool b) { log_errors_ = b; } - - int64_t max_mem() const { return max_mem_; } - void set_max_mem(int64_t m) { max_mem_ = m; } - - bool literal() const { return literal_; } - void set_literal(bool b) { literal_ = b; } - - bool never_nl() const { return never_nl_; } - void set_never_nl(bool b) { never_nl_ = b; } - - bool dot_nl() const { return dot_nl_; } - void set_dot_nl(bool b) { dot_nl_ = b; } - - bool never_capture() const { return never_capture_; } - void set_never_capture(bool b) { never_capture_ = b; } - - bool case_sensitive() const { return case_sensitive_; } - void set_case_sensitive(bool b) { case_sensitive_ = b; } - - bool perl_classes() const { return perl_classes_; } - void set_perl_classes(bool b) { perl_classes_ = b; } - - bool word_boundary() const { return word_boundary_; } - void set_word_boundary(bool b) { word_boundary_ = b; } - - bool one_line() const { return one_line_; } - void set_one_line(bool b) { one_line_ = b; } - - void Copy(const Options& src) { - *this = src; - } - - int ParseFlags() const; - - private: - Encoding encoding_; - bool posix_syntax_; - bool longest_match_; - bool log_errors_; - int64_t max_mem_; - bool literal_; - bool never_nl_; - bool dot_nl_; - bool never_capture_; - bool case_sensitive_; - bool perl_classes_; - bool word_boundary_; - bool one_line_; - }; - - // Returns the options set in the constructor. - const Options& options() const { return options_; }; - - // Argument converters; see below. 
- static inline Arg CRadix(short* x); - static inline Arg CRadix(unsigned short* x); - static inline Arg CRadix(int* x); - static inline Arg CRadix(unsigned int* x); - static inline Arg CRadix(long* x); - static inline Arg CRadix(unsigned long* x); - static inline Arg CRadix(long long* x); - static inline Arg CRadix(unsigned long long* x); - - static inline Arg Hex(short* x); - static inline Arg Hex(unsigned short* x); - static inline Arg Hex(int* x); - static inline Arg Hex(unsigned int* x); - static inline Arg Hex(long* x); - static inline Arg Hex(unsigned long* x); - static inline Arg Hex(long long* x); - static inline Arg Hex(unsigned long long* x); - - static inline Arg Octal(short* x); - static inline Arg Octal(unsigned short* x); - static inline Arg Octal(int* x); - static inline Arg Octal(unsigned int* x); - static inline Arg Octal(long* x); - static inline Arg Octal(unsigned long* x); - static inline Arg Octal(long long* x); - static inline Arg Octal(unsigned long long* x); - - private: - void Init(const StringPiece& pattern, const Options& options); - - bool DoMatch(const StringPiece& text, - Anchor anchor, - size_t* consumed, - const Arg* const args[], - int n) const; - - re2::Prog* ReverseProg() const; - - string pattern_; // string regular expression - Options options_; // option flags - string prefix_; // required prefix (before regexp_) - bool prefix_foldcase_; // prefix is ASCII case-insensitive - re2::Regexp* entire_regexp_; // parsed regular expression - re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed - re2::Prog* prog_; // compiled program for regexp - bool is_one_pass_; // can use prog_->SearchOnePass? 
- - mutable re2::Prog* rprog_; // reverse program for regexp - mutable const string* error_; // Error indicator - // (or points to empty string) - mutable ErrorCode error_code_; // Error code - mutable string error_arg_; // Fragment of regexp showing error - mutable int num_captures_; // Number of capturing groups - - // Map from capture names to indices - mutable const std::map* named_groups_; - - // Map from capture indices to names - mutable const std::map* group_names_; - - // Onces for lazy computations. - mutable std::once_flag rprog_once_; - mutable std::once_flag num_captures_once_; - mutable std::once_flag named_groups_once_; - mutable std::once_flag group_names_once_; - - RE2(const RE2&) = delete; - RE2& operator=(const RE2&) = delete; -}; - -/***** Implementation details *****/ - -// Hex/Octal/Binary? - -// Special class for parsing into objects that define a ParseFrom() method -template -class _RE2_MatchObject { - public: - static inline bool Parse(const char* str, size_t n, void* dest) { - if (dest == NULL) return true; - T* object = reinterpret_cast(dest); - return object->ParseFrom(str, n); - } -}; - -class RE2::Arg { - public: - // Empty constructor so we can declare arrays of RE2::Arg - Arg(); - - // Constructor specially designed for NULL arguments - Arg(void*); - - typedef bool (*Parser)(const char* str, size_t n, void* dest); - -// Type-specific parsers -#define MAKE_PARSER(type, name) \ - Arg(type* p) : arg_(p), parser_(name) {} \ - Arg(type* p, Parser parser) : arg_(p), parser_(parser) {} - - MAKE_PARSER(char, parse_char); - MAKE_PARSER(signed char, parse_schar); - MAKE_PARSER(unsigned char, parse_uchar); - MAKE_PARSER(float, parse_float); - MAKE_PARSER(double, parse_double); - MAKE_PARSER(string, parse_string); - MAKE_PARSER(StringPiece, parse_stringpiece); - - MAKE_PARSER(short, parse_short); - MAKE_PARSER(unsigned short, parse_ushort); - MAKE_PARSER(int, parse_int); - MAKE_PARSER(unsigned int, parse_uint); - MAKE_PARSER(long, parse_long); - 
MAKE_PARSER(unsigned long, parse_ulong); - MAKE_PARSER(long long, parse_longlong); - MAKE_PARSER(unsigned long long, parse_ulonglong); - -#undef MAKE_PARSER - - // Generic constructor templates - template Arg(T* p) - : arg_(p), parser_(_RE2_MatchObject::Parse) { } - template Arg(T* p, Parser parser) - : arg_(p), parser_(parser) { } - - // Parse the data - bool Parse(const char* str, size_t n) const; - - private: - void* arg_; - Parser parser_; - - static bool parse_null (const char* str, size_t n, void* dest); - static bool parse_char (const char* str, size_t n, void* dest); - static bool parse_schar (const char* str, size_t n, void* dest); - static bool parse_uchar (const char* str, size_t n, void* dest); - static bool parse_float (const char* str, size_t n, void* dest); - static bool parse_double (const char* str, size_t n, void* dest); - static bool parse_string (const char* str, size_t n, void* dest); - static bool parse_stringpiece (const char* str, size_t n, void* dest); - -#define DECLARE_INTEGER_PARSER(name) \ - private: \ - static bool parse_##name(const char* str, size_t n, void* dest); \ - static bool parse_##name##_radix(const char* str, size_t n, void* dest, \ - int radix); \ - \ - public: \ - static bool parse_##name##_hex(const char* str, size_t n, void* dest); \ - static bool parse_##name##_octal(const char* str, size_t n, void* dest); \ - static bool parse_##name##_cradix(const char* str, size_t n, void* dest) - - DECLARE_INTEGER_PARSER(short); - DECLARE_INTEGER_PARSER(ushort); - DECLARE_INTEGER_PARSER(int); - DECLARE_INTEGER_PARSER(uint); - DECLARE_INTEGER_PARSER(long); - DECLARE_INTEGER_PARSER(ulong); - DECLARE_INTEGER_PARSER(longlong); - DECLARE_INTEGER_PARSER(ulonglong); - -#undef DECLARE_INTEGER_PARSER - -}; - -inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { } -inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } - -inline bool RE2::Arg::Parse(const char* str, size_t n) const { - return (*parser_)(str, n, arg_); -} - -// 
This part of the parser, appropriate only for ints, deals with bases -#define MAKE_INTEGER_PARSER(type, name) \ - inline RE2::Arg RE2::Hex(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \ - } \ - inline RE2::Arg RE2::Octal(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \ - } \ - inline RE2::Arg RE2::CRadix(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \ - } - -MAKE_INTEGER_PARSER(short, short) -MAKE_INTEGER_PARSER(unsigned short, ushort) -MAKE_INTEGER_PARSER(int, int) -MAKE_INTEGER_PARSER(unsigned int, uint) -MAKE_INTEGER_PARSER(long, long) -MAKE_INTEGER_PARSER(unsigned long, ulong) -MAKE_INTEGER_PARSER(long long, longlong) -MAKE_INTEGER_PARSER(unsigned long long, ulonglong) - -#undef MAKE_INTEGER_PARSER - -#ifndef SWIG -// Helper for writing global or static RE2s safely. -// Write -// static LazyRE2 re = {".*"}; -// and then use *re instead of writing -// static RE2 re(".*"); -// The former is more careful about multithreaded -// situations than the latter. -// -// N.B. This class never deletes the RE2 object that -// it constructs: that's a feature, so that it can be used -// for global and function static variables. -class LazyRE2 { - private: - struct NoArg {}; - - public: - typedef RE2 element_type; // support std::pointer_traits - - // Constructor omitted to preserve braced initialization in C++98. - - // Pretend to be a pointer to Type (never NULL due to on-demand creation): - RE2& operator*() const { return *get(); } - RE2* operator->() const { return get(); } - - // Named accessor/initializer: - RE2* get() const { - std::call_once(once_, [this]() { LazyRE2::Init(this); }); - return ptr_; - } - - // All data fields must be public to support {"foo"} initialization. 
- const char* pattern_; - RE2::CannedOptions options_; - NoArg barrier_against_excess_initializers_; - - mutable RE2* ptr_; - mutable std::once_flag once_; - - private: - static void Init(const LazyRE2* lazy_re2) { - lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_); - } - - void operator=(const LazyRE2&); // disallowed -}; -#endif // SWIG - -} // namespace re2 - -using re2::RE2; -using re2::LazyRE2; - -#endif // RE2_RE2_H_ diff --git a/bench/src/main.rs b/bench/src/main.rs index 7b9abb437f..97fa2440a4 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -18,12 +18,13 @@ extern crate onig; #[cfg(any( feature = "re-rust", feature = "re-rust-bytes", - feature = "re-rust-plugin", ))] extern crate regex; #[cfg(feature = "re-rust")] extern crate regex_syntax; -extern crate rustc_serialize; +extern crate serde; +#[macro_use] +extern crate serde_derive; use std::str; @@ -50,7 +51,7 @@ Options: -h, --help Show this usage message. "; -#[derive(Debug, RustcDecodable)] +#[derive(Debug, Deserialize)] struct Args { arg_pattern: String, arg_file: String, @@ -64,8 +65,9 @@ struct Args { } fn main() { - let args: Args = Docopt::new(USAGE).and_then(|d| d.decode()) - .unwrap_or_else(|e| e.exit()); + let args: Args = Docopt::new(USAGE) + .and_then(|d| d.deserialize()) + .unwrap_or_else(|e| e.exit()); let mmap = Mmap::open_path(&args.arg_file, Protection::Read).unwrap(); let haystack = unsafe { str::from_utf8_unchecked(mmap.as_slice()) }; diff --git a/bench/src/misc.rs b/bench/src/misc.rs index 86f93c4878..08d07a30ad 100644 --- a/bench/src/misc.rs +++ b/bench/src/misc.rs @@ -19,7 +19,6 @@ use {Regex, Text}; #[cfg(not(feature = "re-onig"))] #[cfg(not(feature = "re-pcre1"))] #[cfg(not(feature = "re-pcre2"))] -#[cfg(not(feature = "re-rust-plugin"))] bench_match!(no_exponential, { format!( "{}{}", @@ -191,3 +190,85 @@ macro_rules! 
reallyhard2 { () => (r"\w+\s+Holmes") } bench_match!(reallyhard2_1K, reallyhard2!(), get_text(TXT_1K, reallyhard2_suffix())); + + +// +// Benchmarks to justify the short-haystack NFA fallthrough optimization +// implemented by `read_captures_at` in regex/src/exec.rs. See github issue +// #348. +// +// The procedure used to try to determine the right hardcoded cutoff +// for the short-haystack optimization in issue #348 is as follows. +// +// ``` +// > cd bench +// > cargo bench --features re-rust short_hay | tee dfa-nfa.res +// > # modify the `MatchType::Dfa` branch in exec.rs:read_captures_at +// > # to just execute the nfa +// > cargo bench --features re-rust short_hay | tee nfa-only.res +// > cargo benchcmp dfa-nfa.res nfa-only.res +// ``` +// +// The expected result is that short inputs will go faster under +// the nfa-only mode, but at some turnover point the dfa-nfa mode +// will start to win again. Unfortunately, that is not what happened. +// Instead there was no noticeable change in the bench results, so +// I've opted to just do the more conservative anchor optimization. 
+// +bench_captures!(short_haystack_1x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + String::from("aaaabbbbccccbbbdddd")); +bench_captures!(short_haystack_2x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(2).collect::(), + repeat("dddd").take(2).collect::(), + )); +bench_captures!(short_haystack_3x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(3).collect::(), + repeat("dddd").take(3).collect::(), + )); +bench_captures!(short_haystack_4x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(4).collect::(), + repeat("dddd").take(4).collect::(), + )); +bench_captures!(short_haystack_10x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(10).collect::(), + repeat("dddd").take(10).collect::(), + )); +bench_captures!(short_haystack_100x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(100).collect::(), + repeat("dddd").take(100).collect::(), + )); +bench_captures!(short_haystack_1000x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(1000).collect::(), + repeat("dddd").take(1000).collect::(), + )); +bench_captures!(short_haystack_10000x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(10000).collect::(), + repeat("dddd").take(10000).collect::(), + )); +bench_captures!(short_haystack_100000x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(100000).collect::(), + repeat("dddd").take(100000).collect::(), + )); +bench_captures!(short_haystack_1000000x, + Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2, + format!("{}bbbbccccbbb{}", + repeat("aaaa").take(1000000).collect::(), + repeat("dddd").take(1000000).collect::(), + )); diff --git a/ci/run-kcov b/ci/run-kcov index 092123775d..0ef842c319 
100755 --- a/ci/run-kcov +++ b/ci/run-kcov @@ -14,15 +14,10 @@ tests=( regex ) tmpdir=$(mktemp -d) -with_plugin= coveralls_id= while true; do case "$1" in - --with-plugin) - with_plugin=yes - shift - ;; --coveralls-id) coveralls_id="$2" shift 2 @@ -33,15 +28,6 @@ while true; do esac done -if [ -n "$with_plugin" ]; then - cargo test --manifest-path regex_macros/Cargo.toml --no-run --verbose - kcov \ - --verify \ - --include-pattern '/regex/src/' \ - "$tmpdir/plugin" \ - $(ls -t ./regex_macros/target/debug/plugin-* | head -n1) -fi - cargo test --no-run --verbose --jobs 4 for t in ${tests[@]}; do kcov \ diff --git a/ci/script.sh b/ci/script.sh index 2b48aa5d7f..a0c049bdcf 100755 --- a/ci/script.sh +++ b/ci/script.sh @@ -7,6 +7,16 @@ set -e # Builds the regex crate and runs tests. cargo build --verbose cargo doc --verbose + +# If we're testing on an older version of Rust, then only check that we +# can build the crate. This is because the dev dependencies might be updated +# more frequently, and therefore might require a newer version of Rust. +# +# This isn't ideal. It's a compromise. +if [ "$TRAVIS_RUST_VERSION" = "1.12.0" ]; then + exit +fi + if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then cargo build --verbose --manifest-path regex-debug/Cargo.toml RUSTFLAGS="-C target-feature=+ssse3" cargo test --verbose --features 'simd-accel pattern' --jobs 4 @@ -23,8 +33,8 @@ cargo doc --verbose --manifest-path regex-syntax/Cargo.toml # Run tests on regex-capi crate. cargo build --verbose --manifest-path regex-capi/Cargo.toml -(cd regex-capi/ctest && ./compile && LD_LIBRARY_PATH=../target/debug ./test) -(cd regex-capi/examples && ./compile && LD_LIBRARY_PATH=../target/debug ./iter) +(cd regex-capi/ctest && ./compile && LD_LIBRARY_PATH=../../target/debug ./test) +(cd regex-capi/examples && ./compile && LD_LIBRARY_PATH=../../target/debug ./iter) # Make sure benchmarks compile. Don't run them though because they take a # very long time. 
diff --git a/regex-capi/Cargo.toml b/regex-capi/Cargo.toml index 6dd59b6f65..efbb20cc7f 100644 --- a/regex-capi/Cargo.toml +++ b/regex-capi/Cargo.toml @@ -10,6 +10,7 @@ homepage = "https://github.com/rust-lang/regex" description = """ A C API for Rust's regular expression library. """ +workspace = ".." [lib] name = "rure" diff --git a/regex-capi/README.md b/regex-capi/README.md index c934895fbe..af59979773 100644 --- a/regex-capi/README.md +++ b/regex-capi/README.md @@ -12,7 +12,7 @@ The header file (`includes/rure.h`) serves as the primary API documentation of this library. Types and flags are documented first, and functions follow. The syntax and possibly other useful things are documented in the Rust -API documentation: http://doc.rust-lang.org/regex/regex/index.html +API documentation: https://docs.rs/regex Examples diff --git a/regex-capi/ctest/compile b/regex-capi/ctest/compile index a2e4d5129e..6bbf6aed89 100755 --- a/regex-capi/ctest/compile +++ b/regex-capi/ctest/compile @@ -3,6 +3,6 @@ set -ex cargo build --manifest-path ../Cargo.toml -gcc -DDEBUG -o test test.c -ansi -Wall -I../include -L../target/debug -lrure +gcc -DDEBUG -o test test.c -ansi -Wall -I../include -L../../target/debug -lrure # If you're using librure.a, then you'll need to link other stuff: # -lutil -ldl -lpthread -lgcc_s -lc -lm -lrt -lutil -lrure diff --git a/regex-capi/examples/compile b/regex-capi/examples/compile index 6a05bb6ab4..a7566b791d 100755 --- a/regex-capi/examples/compile +++ b/regex-capi/examples/compile @@ -4,6 +4,6 @@ set -ex # N.B. Add `--release` flag to `cargo build` to make the example run faster. 
cargo build --manifest-path ../Cargo.toml -gcc -O3 -DDEBUG -o iter iter.c -ansi -Wall -I../include -L../target/debug -lrure +gcc -O3 -DDEBUG -o iter iter.c -ansi -Wall -I../include -L../../target/debug -lrure # If you're using librure.a, then you'll need to link other stuff: # -lutil -ldl -lpthread -lgcc_s -lc -lm -lrt -lutil -lrure diff --git a/regex-debug/Cargo.toml b/regex-debug/Cargo.toml index cebad7ca4b..54b2bb511d 100644 --- a/regex-debug/Cargo.toml +++ b/regex-debug/Cargo.toml @@ -5,15 +5,14 @@ version = "0.1.0" authors = ["The Rust Project Developers"] license = "MIT/Apache-2.0" repository = "https://github.com/rust-lang/regex" -documentation = "http://doc.rust-lang.org/regex" +documentation = "https://docs.rs/regex" homepage = "https://github.com/rust-lang/regex" description = "A tool useful for debugging regular expressions." +workspace = ".." [dependencies] -docopt = "0.6" +docopt = "0.8" regex = { version = "0.2", path = ".." } regex-syntax = { version = "0.4.0", path = "../regex-syntax" } -rustc-serialize = "0.3" - -[profile.release] -debug = true +serde = "1" +serde_derive = "1" diff --git a/regex-debug/src/main.rs b/regex-debug/src/main.rs index d19a6c7a99..f31dc22a9c 100644 --- a/regex-debug/src/main.rs +++ b/regex-debug/src/main.rs @@ -1,7 +1,9 @@ extern crate docopt; extern crate regex; extern crate regex_syntax as syntax; -extern crate rustc_serialize; +extern crate serde; +#[macro_use] +extern crate serde_derive; use std::error; use std::io::{self, Write}; @@ -46,7 +48,7 @@ Options: constructed by the literals found. 
"; -#[derive(RustcDecodable)] +#[derive(Deserialize)] struct Args { cmd_ast: bool, cmd_prefixes: bool, @@ -74,7 +76,7 @@ type Result = result::Result>; fn main() { let mut args: Args = Docopt::new(USAGE) - .and_then(|d| d.decode()) + .and_then(|d| d.deserialize()) .unwrap_or_else(|e| e.exit()); if args.flag_dfa_reverse { args.flag_dfa = true; diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index 4ff62f546b..f038786b99 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -4,10 +4,11 @@ version = "0.4.1" #:version authors = ["The Rust Project Developers"] license = "MIT/Apache-2.0" repository = "https://github.com/rust-lang/regex" -documentation = "http://doc.rust-lang.org/regex/regex_syntax/index.html" +documentation = "https://docs.rs/regex-syntax" homepage = "https://github.com/rust-lang/regex" description = "A regular expression parser." +workspace = ".." [dev-dependencies] -quickcheck = { version = "0.5", default-features = false } -rand = "0.3.15" +quickcheck = { version = "0.6", default-features = false } +rand = "0.4" diff --git a/regex_macros/Cargo.toml b/regex_macros/Cargo.toml deleted file mode 100644 index 7fd4ecd7a5..0000000000 --- a/regex_macros/Cargo.toml +++ /dev/null @@ -1,35 +0,0 @@ -[package] -name = "regex_macros" -version = "0.2.0" -authors = ["The Rust Project Developers"] -license = "MIT/Apache-2.0" -repository = "https://github.com/rust-lang/regex" -homepage = "https://github.com/rust-lang/regex" -description = """ -An implementation of statically compiled regular expressions for Rust. - -Unless you specifically need compile time regular expressions or a matching -engine that is guaranteed not to allocate, you should temporarily prefer using -the plain regex crate (since it is almost always faster). -""" - -[lib] -name = "regex_macros" -plugin = true - -[dependencies.regex] -path = ".." 
-version = "0.2.0" -features = ["pattern"] - -[dependencies.regex-syntax] -path = "../regex-syntax" -version = "0.4.0" - -[dev-dependencies] -# For generating random test data. -rand = "0.3.15" - -[[test]] -path = "../tests/test_plugin.rs" -name = "plugin" diff --git a/regex_macros/src/lib.rs b/regex_macros/src/lib.rs deleted file mode 100644 index c8353f1780..0000000000 --- a/regex_macros/src/lib.rs +++ /dev/null @@ -1,600 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -//! This crate provides the `regex!` macro. Its use is documented in the -//! `regex` crate. - -#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", - html_favicon_url = "http://www.rust-lang.org/favicon.ico", - html_root_url = "http://doc.rust-lang.org/nightly/")] - -#![feature(plugin_registrar, quote, rustc_private)] - -extern crate regex; -extern crate regex_syntax; -extern crate rustc_plugin; -extern crate syntax; - -use std::collections::BTreeMap; -use std::usize; - -use syntax::ast; -use syntax::codemap; -use syntax::tokenstream; -use syntax::ext::build::AstBuilder; -use syntax::ext::base::{ExtCtxt, MacResult, MacEager, DummyResult}; -use syntax::parse::token; -use syntax::print::pprust; -use syntax::fold::Folder; -use syntax::ptr::P; - -use rustc_plugin::Registry; - -use regex::internal::{Compiler, EmptyLook, Inst, Program}; -use regex_syntax::Expr; - -/// For the `regex!` syntax extension. Do not use. -#[plugin_registrar] -#[doc(hidden)] -pub fn plugin_registrar(reg: &mut Registry) { - reg.register_macro("regex", native); -} - -/// Generates specialized code for the Pike VM for a particular regular -/// expression. 
-/// -/// There are two primary differences between the code generated here and the -/// general code in vm.rs. -/// -/// 1. All heap allocation is removed. Sized vector types are used instead. -/// Care must be taken to make sure that these vectors are not copied -/// gratuitously. (If you're not sure, run the benchmarks. They will yell -/// at you if you do.) -/// 2. The main `match instruction { ... }` expressions are replaced with more -/// direct `match pc { ... }`. The generators can be found in -/// `step_insts` and `add_insts`. -/// -/// It is strongly recommended to read the dynamic implementation in vm.rs -/// first before trying to understand the code generator. The implementation -/// strategy is identical and vm.rs has comments and will be easier to follow. -fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[tokenstream::TokenTree]) - -> Box { - let regex = match parse(cx, tts) { - Some(r) => r, - // error is logged in 'parse' with cx.span_err - None => return DummyResult::any(sp), - }; - // We use the largest possible size limit because this is happening at - // compile time. We trust the programmer. - let expr = match Expr::parse(®ex) { - Ok(expr) => expr, - Err(err) => { - cx.span_err(sp, &err.to_string()); - return DummyResult::any(sp) - } - }; - let prog = match Compiler::new().size_limit(usize::MAX).compile(&[expr]) { - Ok(re) => re, - Err(err) => { - cx.span_err(sp, &err.to_string()); - return DummyResult::any(sp) - } - }; - let names = prog.captures.iter().cloned().collect(); - let mut gen = NfaGen { - cx: cx, - sp: sp, - prog: prog, - names: names, - original: regex, - }; - MacEager::expr(gen.code()) -} - -struct NfaGen<'cx, 'a: 'cx> { - cx: &'cx ExtCtxt<'a>, - sp: codemap::Span, - prog: Program, - names: Vec>, - original: String, -} - -impl<'a, 'cx> NfaGen<'a, 'cx> { - fn code(&mut self) -> P { - // Most or all of the following things are used in the quasiquoted - // expression returned. 
- let num_cap_locs = 2 * self.prog.captures.len(); - let num_insts = self.prog.len(); - let cap_names = self.vec_expr(self.names.iter(), - &mut |cx, name| match *name { - Some(ref name) => { - let name = &**name; - quote_expr!(cx, Some($name)) - } - None => cx.expr_none(self.sp), - } - ); - let capture_name_idx = { - let mut capture_name_idx = BTreeMap::new(); - for (i, name) in self.names.iter().enumerate() { - if let Some(ref name) = *name { - capture_name_idx.insert(name.to_owned(), i); - } - } - self.vec_expr(capture_name_idx.iter(), - &mut |cx, (name, group_idx)| - quote_expr!(cx, ($name, $group_idx)) - ) - }; - - let is_anchored_start = self.prog.is_anchored_start; - let step_insts = self.step_insts(); - let add_insts = self.add_insts(); - let regex = &*self.original; - - quote_expr!(self.cx, { -// When `regex!` is bound to a name that is not used, we have to make sure -// that dead_code warnings don't bubble up to the user from the generated -// code. Therefore, we suppress them by allowing dead_code. The effect is that -// the user is only warned about *their* unused variable/code, and not the -// unused code generated by regex!. See #14185 for an example. 
-#[allow(dead_code)] -static CAPTURES: &'static [Option<&'static str>] = &$cap_names; -#[allow(dead_code)] -static CAPTURE_NAME_IDX: &'static [(&'static str, usize)] = &$capture_name_idx; - -#[allow(dead_code)] -fn exec<'t>( - mut caps: &mut [Option], - input: &'t str, - start: usize, -) -> bool { - #![allow(unused_imports)] - #![allow(unused_mut)] - - use regex::internal::{Char, CharInput, InputAt, Input, Inst}; - - let input = CharInput::new(input.as_bytes()); - let at = input.at(start); - return Nfa { - input: input, - ncaps: caps.len(), - }.exec(&mut NfaThreads::new(), &mut caps, at); - - struct Nfa<'t> { - input: CharInput<'t>, - ncaps: usize, - } - - impl<'t> Nfa<'t> { - #[allow(unused_variables)] - fn exec( - &mut self, - mut q: &mut NfaThreads, - mut caps: &mut [Option], - mut at: InputAt, - ) -> bool { - let mut matched = false; - let (mut clist, mut nlist) = (&mut q.clist, &mut q.nlist); - clist.empty(); nlist.empty(); -'LOOP: loop { - if clist.size == 0 { - if matched || (!at.is_start() && $is_anchored_start) { - break; - } - // TODO: Prefix matching... Hmm. - // Prefix matching now uses a DFA, so I think this is - // going to require encoding that DFA statically. - } - if clist.size == 0 || (!$is_anchored_start && !matched) { - self.add(clist, &mut caps, 0, at); - } - let at_next = self.input.at(at.next_pos()); - for i in 0..clist.size { - let pc = clist.pc(i); - let tcaps = clist.caps(i); - if self.step(nlist, caps, tcaps, pc, at, at_next) { - matched = true; - if caps.len() == 0 { - break 'LOOP; - } - break; - } - } - if at.char().is_none() { - break; - } - at = at_next; - ::std::mem::swap(&mut clist, &mut nlist); - nlist.empty(); - } - matched - } - - // Sometimes `nlist` is never used (for empty regexes). 
- #[allow(unused_variables)] - #[inline] - fn step( - &self, - nlist: &mut Threads, - caps: &mut [Option], - thread_caps: &mut [Option], - pc: usize, - at: InputAt, - at_next: InputAt, - ) -> bool { - $step_insts; - false - } - - fn add( - &self, - nlist: &mut Threads, - thread_caps: &mut [Option], - pc: usize, - at: InputAt, - ) { - if nlist.contains(pc) { - return; - } - let ti = nlist.add(pc); - $add_insts - } - } - - struct NfaThreads { - clist: Threads, - nlist: Threads, - } - - struct Threads { - dense: [Thread; $num_insts], - sparse: [usize; $num_insts], - size: usize, - } - - struct Thread { - pc: usize, - caps: [Option; $num_cap_locs], - } - - impl NfaThreads { - fn new() -> NfaThreads { - NfaThreads { - clist: Threads::new(), - nlist: Threads::new(), - } - } - - fn swap(&mut self) { - ::std::mem::swap(&mut self.clist, &mut self.nlist); - } - } - - impl Threads { - fn new() -> Threads { - Threads { - // These unsafe blocks are used for performance reasons, as it - // gives us a zero-cost initialization of a sparse set. The - // trick is described in more detail here: - // http://research.swtch.com/sparse - // The idea here is to avoid initializing threads that never - // need to be initialized, particularly for larger regexs with - // a lot of instructions. 
- dense: unsafe { ::std::mem::uninitialized() }, - sparse: unsafe { ::std::mem::uninitialized() }, - size: 0, - } - } - - #[inline] - fn add(&mut self, pc: usize) -> usize { - let i = self.size; - self.dense[i].pc = pc; - self.sparse[pc] = i; - self.size += 1; - i - } - - #[inline] - fn thread(&mut self, i: usize) -> &mut Thread { - &mut self.dense[i] - } - - #[inline] - fn contains(&self, pc: usize) -> bool { - let s = unsafe { ::std::ptr::read_volatile(&self.sparse[pc]) }; - s < self.size && self.dense[s].pc == pc - } - - #[inline] - fn empty(&mut self) { - self.size = 0; - } - - #[inline] - fn pc(&self, i: usize) -> usize { - self.dense[i].pc - } - - #[inline] - fn caps<'r>(&'r mut self, i: usize) -> &'r mut [Option] { - &mut self.dense[i].caps - } - } -} - -::regex::Regex(::regex::internal::_Regex::Plugin(::regex::internal::Plugin { - original: $regex, - names: &CAPTURES, - groups: &CAPTURE_NAME_IDX, - prog: exec, -})) - }) - } - - // Generates code for the `add` method, which is responsible for adding - // zero-width states to the next queue of states to visit. 
- fn add_insts(&self) -> P { - let arms = self.prog.iter().enumerate().map(|(pc, inst)| { - let body = match *inst { - Inst::EmptyLook(ref inst) => { - let nextpc = inst.goto; - match inst.look { - EmptyLook::StartLine => { - quote_expr!(self.cx, { - let prev = self.input.previous_char(at); - if prev.is_none() || prev == '\n' { - self.add(nlist, thread_caps, $nextpc, at); - } - }) - } - EmptyLook::EndLine => { - quote_expr!(self.cx, { - if at.char().is_none() || at.char() == '\n' { - self.add(nlist, thread_caps, $nextpc, at); - } - }) - } - EmptyLook::StartText => { - quote_expr!(self.cx, { - let prev = self.input.previous_char(at); - if prev.is_none() { - self.add(nlist, thread_caps, $nextpc, at); - } - }) - } - EmptyLook::EndText => { - quote_expr!(self.cx, { - if at.char().is_none() { - self.add(nlist, thread_caps, $nextpc, at); - } - }) - } - EmptyLook::WordBoundary - | EmptyLook::NotWordBoundary => { - let m = if inst.look == EmptyLook::WordBoundary { - quote_expr!(self.cx, { w1 ^ w2 }) - } else { - quote_expr!(self.cx, { !(w1 ^ w2) }) - }; - quote_expr!(self.cx, { - let prev = self.input.previous_char(at); - let w1 = prev.is_word_char(); - let w2 = at.char().is_word_char(); - if $m { - self.add(nlist, thread_caps, $nextpc, at); - } - }) - } - EmptyLook::WordBoundaryAscii - | EmptyLook::NotWordBoundaryAscii => { - unreachable!() - } - } - } - Inst::Save(ref inst) => { - let nextpc = inst.goto; - let slot = inst.slot; - quote_expr!(self.cx, { - if $slot >= self.ncaps { - self.add(nlist, thread_caps, $nextpc, at); - } else { - let old = thread_caps[$slot]; - thread_caps[$slot] = Some(at.pos()); - self.add(nlist, thread_caps, $nextpc, at); - thread_caps[$slot] = old; - } - }) - } - Inst::Split(ref inst) => { - let (x, y) = (inst.goto1, inst.goto2); - quote_expr!(self.cx, { - self.add(nlist, thread_caps, $x, at); - self.add(nlist, thread_caps, $y, at); - }) - } - // For Match, Char, Ranges - _ => quote_expr!(self.cx, { - let mut t = &mut nlist.thread(ti); - for 
(slot, val) in t.caps.iter_mut().zip(thread_caps.iter()) { - *slot = *val; - } - }), - }; - self.arm_inst(pc, body) - }).collect::>(); - self.match_insts(arms) - } - - // Generates the code for the `step` method, which processes all states - // in the current queue that consume a single character. - fn step_insts(&self) -> P { - let arms = self.prog.iter().enumerate().map(|(pc, inst)| { - let body = match *inst { - Inst::Match(_) => quote_expr!(self.cx, { - for (slot, val) in caps.iter_mut().zip(thread_caps.iter()) { - *slot = *val; - } - return true; - }), - Inst::Char(ref inst) => { - let nextpc = inst.goto; - let c = inst.c; - quote_expr!(self.cx, { - if $c == at.char() { - self.add(nlist, thread_caps, $nextpc, at_next); - } - return false; - }) - } - Inst::Ranges(ref inst) => { - let match_class = self.match_class(&inst.ranges); - let nextpc = inst.goto; - quote_expr!(self.cx, { - let mut c = at.char(); - if let Some(c) = c.as_char() { - if $match_class { - self.add(nlist, thread_caps, $nextpc, at_next); - } - } - return false; - }) - } - // EmptyLook, Save, Jump, Split - _ => quote_expr!(self.cx, { return false; }), - }; - self.arm_inst(pc, body) - }).collect::>(); - - self.match_insts(arms) - } - - // Translates a character class into a match expression. - // This avoids a binary search (and is hopefully replaced by a jump - // table). - fn match_class(&self, ranges: &[(char, char)]) -> P { - let mut arms = ranges.iter().map(|&(start, end)| { - let pat = self.cx.pat( - self.sp, ast::PatKind::Range( - quote_expr!(self.cx, $start), quote_expr!(self.cx, $end))); - self.cx.arm(self.sp, vec!(pat), quote_expr!(self.cx, true)) - }).collect::>(); - - arms.push(self.wild_arm_expr(quote_expr!(self.cx, false))); - let match_on = quote_expr!(self.cx, c); - self.cx.expr_match(self.sp, match_on, arms) - } - - // Generates code for checking a literal prefix of the search string. - // The code is only generated if the regex *has* a literal prefix. 
- // Otherwise, a no-op is returned. - // fn check_prefix(&self) -> P { - // if self.prog.prefixes.len() == 0 { - // self.empty_block() - // } else { - // quote_expr!(self.cx, - // if clist.size == 0 { - // let haystack = &self.input.as_bytes()[self.ic..]; - // match find_prefix(prefix_bytes, haystack) { - // None => break, - // Some(i) => { - // self.ic += i; - // next_ic = self.chars.set(self.ic); - // } - // } - // } - // ) - // } - // } - - // Builds a `match pc { ... }` expression from a list of arms, specifically - // for matching the current program counter with an instruction. - // A wild-card arm is automatically added that executes a no-op. It will - // never be used, but is added to satisfy the compiler complaining about - // non-exhaustive patterns. - fn match_insts(&self, mut arms: Vec) -> P { - arms.push(self.wild_arm_expr(self.empty_block())); - self.cx.expr_match(self.sp, quote_expr!(self.cx, pc), arms) - } - - fn empty_block(&self) -> P { - quote_expr!(self.cx, {}) - } - - // Creates a match arm for the instruction at `pc` with the expression - // `body`. - fn arm_inst(&self, pc: usize, body: P) -> ast::Arm { - let pc_pat = self.cx.pat_lit(self.sp, quote_expr!(self.cx, $pc)); - - self.cx.arm(self.sp, vec!(pc_pat), body) - } - - // Creates a wild-card match arm with the expression `body`. - fn wild_arm_expr(&self, body: P) -> ast::Arm { - ast::Arm { - attrs: vec!(), - pats: vec!(P(ast::Pat{ - id: ast::DUMMY_NODE_ID, - span: self.sp, - node: ast::PatKind::Wild, - })), - guard: None, - body: body, - } - } - - // Converts `xs` to a `[x1, x2, .., xN]` expression by calling `to_expr` - // on each element in `xs`. - fn vec_expr>( - &self, - xs: It, - to_expr: &mut FnMut(&ExtCtxt, T) -> P, - ) -> P { - let exprs = xs.map(|x| to_expr(self.cx, x)).collect(); - self.cx.expr_vec(self.sp, exprs) - } -} - -/// Looks for a single string literal and returns it. -/// Otherwise, logs an error with cx.span_err and returns None. 
-fn parse(cx: &mut ExtCtxt, tts: &[tokenstream::TokenTree]) -> Option { - let mut parser = cx.new_parser_from_tts(tts); - if let Ok(expr) = parser.parse_expr() { - let entry = cx.expander().fold_expr(expr); - let regex = match entry.node { - ast::ExprKind::Lit(ref lit) => { - match lit.node { - ast::LitKind::Str(ref s, _) => s.to_string(), - _ => { - cx.span_err(entry.span, &format!( - "expected string literal but got `{}`", - pprust::lit_to_string(&**lit))); - return None - } - } - } - _ => { - cx.span_err(entry.span, &format!( - "expected string literal but got `{}`", - pprust::expr_to_string(&*entry))); - return None - } - }; - if !parser.eat(&token::Eof) { - cx.span_err(parser.span, "only one string literal allowed"); - return None; - } - Some(regex) - } else { - cx.parse_sess().span_diagnostic.err("failure parsing token tree"); - None - } -} diff --git a/src/exec.rs b/src/exec.rs index 458e47d3b0..d12a725cf0 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -554,12 +554,16 @@ impl<'c> RegularExpression for ExecNoSync<'c> { }) } MatchType::Dfa => { - match self.find_dfa_forward(text, start) { - dfa::Result::Match((s, e)) => { - self.captures_nfa_with_match(slots, text, s, e) + if self.ro.nfa.is_anchored_start { + self.captures_nfa(slots, text, start) + } else { + match self.find_dfa_forward(text, start) { + dfa::Result::Match((s, e)) => { + self.captures_nfa_with_match(slots, text, s, e) + } + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.captures_nfa(slots, text, start), } - dfa::Result::NoMatch(_) => None, - dfa::Result::Quit => self.captures_nfa(slots, text, start), } } MatchType::DfaAnchoredReverse => { diff --git a/src/input.rs b/src/input.rs index 87bf72d72f..3d87257c01 100644 --- a/src/input.rs +++ b/src/input.rs @@ -383,15 +383,6 @@ impl Char { None | Some(_) => false, } } - - /// Converts the character to a real primitive `char`. - /// - /// If the character is absent, then `None` is returned. 
- pub fn as_char(self) -> Option { - // This is only used in the `regex!` macro because it expands char - // classes into `match` expressions (instead of binary search). - char::from_u32(self.0) - } } impl From for Char { diff --git a/src/lib.rs b/src/lib.rs index 075ac3f097..0dd551f23f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -56,7 +56,7 @@ it to match anywhere in the text. Anchors can be used to ensure that the full text matches an expression. This example also demonstrates the utility of -[raw strings](https://doc.rust-lang.org/stable/reference.html#raw-string-literals) +[raw strings](https://doc.rust-lang.org/stable/reference/tokens.html#raw-string-literals) in Rust, which are just like regular strings except they are prefixed with an `r` and do not process any escape sequences. For example, `"\\d"` is the same @@ -641,7 +641,6 @@ mod pikevm; mod prog; mod re_builder; mod re_bytes; -mod re_plugin; mod re_set; mod re_trait; mod re_unicode; @@ -652,9 +651,9 @@ mod simd_accel; mod simd_accel; mod sparse; -/// The `internal` module exists to support the `regex!` macro and other -/// suspicious activity, such as testing different matching engines and -/// supporting the `regex-debug` CLI utility. +/// The `internal` module exists to support suspicious activity, such as +/// testing different matching engines and supporting the `regex-debug` CLI +/// utility. #[doc(hidden)] pub mod internal { pub use compile::Compiler; @@ -662,6 +661,4 @@ pub mod internal { pub use input::{Char, Input, CharInput, InputAt}; pub use literals::LiteralSearcher; pub use prog::{Program, Inst, EmptyLook, InstRanges}; - pub use re_plugin::Plugin; - pub use re_unicode::_Regex; } diff --git a/src/literals.rs b/src/literals.rs index 29cb7d3d60..aa80332231 100644 --- a/src/literals.rs +++ b/src/literals.rs @@ -8,6 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
+use std::cmp; use std::mem; use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton}; @@ -21,25 +22,13 @@ use simd_accel::teddy128::{Teddy, is_teddy_128_available}; /// A prefix extracted from a compiled regular expression. /// /// A regex prefix is a set of literal strings that *must* be matched at the -/// beginning of a regex in order for the entire regex to match. -/// -/// There are a variety of ways to efficiently scan the search text for a -/// prefix. Currently, there are three implemented: -/// -/// 1. The prefix is a single byte. Just use memchr. -/// 2. If the prefix is a set of two or more single byte prefixes, then -/// a single sparse map is created. Checking if there is a match is a lookup -/// in this map for each byte in the search text. -/// 3. In all other cases, build an Aho-Corasick automaton. -/// -/// It's possible that there's room here for other substring algorithms, -/// such as Boyer-Moore for single-set prefixes greater than 1, or Rabin-Karp -/// for small sets of same-length prefixes. +/// beginning of a regex in order for the entire regex to match. Similarly +/// for a regex suffix. #[derive(Clone, Debug)] pub struct LiteralSearcher { complete: bool, - lcp: MemchrSearch, - lcs: MemchrSearch, + lcp: FreqyPacked, + lcs: FreqyPacked, matcher: Matcher, } @@ -50,12 +39,13 @@ enum Matcher { /// A set of four or more single byte literals. Bytes(SingleByteSet), /// A single substring, find using memchr and frequency analysis. - SingleMemchr(MemchrSearch), + FreqyPacked(FreqyPacked), /// A single substring, find using Boyer-Moore. - SingleBoyerMoore(BoyerMooreSearch), + BoyerMoore(BoyerMooreSearch), /// An Aho-Corasick automaton. AC(FullAcAutomaton), - /// A simd accelerated multiple string matcher. + /// A simd accelerated multiple string matcher. Used only for a small + /// number of small literals. 
Teddy128(Teddy), } @@ -81,8 +71,8 @@ impl LiteralSearcher { let complete = lits.all_complete(); LiteralSearcher { complete: complete, - lcp: MemchrSearch::new(lits.longest_common_prefix().to_vec()), - lcs: MemchrSearch::new(lits.longest_common_suffix().to_vec()), + lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()), + lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()), matcher: matcher, } } @@ -104,8 +94,8 @@ impl LiteralSearcher { match self.matcher { Empty => Some((0, 0)), Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), - SingleMemchr(ref s) => s.find(haystack).map(|i| (i, i + s.len())), - SingleBoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())), + FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())), + BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())), AC(ref aut) => aut.find(haystack).next().map(|m| (m.start, m.end)), Teddy128(ref ted) => ted.find(haystack).map(|m| (m.start, m.end)), } @@ -142,8 +132,8 @@ impl LiteralSearcher { match self.matcher { Matcher::Empty => LiteralIter::Empty, Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), - Matcher::SingleMemchr(ref s) => LiteralIter::Single(&s.pat), - Matcher::SingleBoyerMoore(ref s) => LiteralIter::Single(&s.pattern), + Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat), + Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern), Matcher::AC(ref ac) => LiteralIter::AC(ac.patterns()), Matcher::Teddy128(ref ted) => { LiteralIter::Teddy128(ted.patterns()) @@ -152,12 +142,12 @@ impl LiteralSearcher { } /// Returns a matcher for the longest common prefix of this matcher. - pub fn lcp(&self) -> &MemchrSearch { + pub fn lcp(&self) -> &FreqyPacked { &self.lcp } /// Returns a matcher for the longest common suffix of this matcher. 
- pub fn lcs(&self) -> &MemchrSearch { + pub fn lcs(&self) -> &FreqyPacked { &self.lcs } @@ -172,8 +162,8 @@ impl LiteralSearcher { match self.matcher { Empty => 0, Bytes(ref sset) => sset.dense.len(), - SingleMemchr(_) => 1, - SingleBoyerMoore(_) => 1, + FreqyPacked(_) => 1, + BoyerMoore(_) => 1, AC(ref aut) => aut.len(), Teddy128(ref ted) => ted.len(), } @@ -185,8 +175,8 @@ impl LiteralSearcher { match self.matcher { Empty => 0, Bytes(ref sset) => sset.approximate_size(), - SingleMemchr(ref single) => single.approximate_size(), - SingleBoyerMoore(ref single) => single.approximate_size(), + FreqyPacked(ref single) => single.approximate_size(), + BoyerMoore(ref single) => single.approximate_size(), AC(ref aut) => aut.heap_bytes(), Teddy128(ref ted) => ted.approximate_size(), } @@ -223,9 +213,9 @@ impl Matcher { if lits.literals().len() == 1 { let lit = lits.literals()[0].to_vec(); if BoyerMooreSearch::should_use(lit.as_slice()) { - return Matcher::SingleBoyerMoore(BoyerMooreSearch::new(lit)); + return Matcher::BoyerMoore(BoyerMooreSearch::new(lit)); } else { - return Matcher::SingleMemchr(MemchrSearch::new(lit)); + return Matcher::FreqyPacked(FreqyPacked::new(lit)); } } let is_aho_corasick_fast = sset.dense.len() == 1 && sset.all_ascii; @@ -386,11 +376,8 @@ impl SingleByteSet { } } -/// Provides an implementation of fast subtring search. -/// -/// This particular implementation is a Boyer-Moore variant, based on the -/// "tuned boyer moore" search from (Hume & Sunday, 1991). It has been tweaked -/// slightly to better use memchr. +/// Provides an implementation of fast subtring search using frequency +/// analysis. /// /// memchr is so fast that we do everything we can to keep the loop in memchr /// for as long as possible. The easiest way to do this is to intelligently @@ -399,10 +386,8 @@ impl SingleByteSet { /// haystack is far too expensive, we compute a set of fixed frequencies up /// front and hard code them in src/freqs.rs. 
Frequency analysis is done via /// scripts/frequencies.py. -/// -/// TODO(burntsushi): Add some amount of shifting to this. #[derive(Clone, Debug)] -pub struct MemchrSearch { +pub struct FreqyPacked { /// The pattern. pat: Vec, /// The number of Unicode characters in the pattern. This is useful for @@ -428,10 +413,10 @@ pub struct MemchrSearch { rare2i: usize, } -impl MemchrSearch { - fn new(pat: Vec) -> MemchrSearch { +impl FreqyPacked { + fn new(pat: Vec) -> FreqyPacked { if pat.is_empty() { - return MemchrSearch::empty(); + return FreqyPacked::empty(); } // Find the rarest two bytes. Try to make them distinct (but it's not @@ -456,7 +441,7 @@ impl MemchrSearch { let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap(); let char_len = char_len_lossy(&pat); - MemchrSearch { + FreqyPacked { pat: pat, char_len: char_len, rare1: rare1, @@ -466,8 +451,8 @@ impl MemchrSearch { } } - fn empty() -> MemchrSearch { - MemchrSearch { + fn empty() -> FreqyPacked { + FreqyPacked { pat: vec![], char_len: 0, rare1: 0, @@ -695,14 +680,33 @@ impl BoyerMooreSearch { /// I had trouble proving a useful turnover point. Hopefully, /// we can find one in the future. fn should_use(pattern: &[u8]) -> bool { - const CUTOFF_FREQ: usize = 242; - - // all the bytes must be more common than the cutoff. - pattern.iter().all(|c| freq_rank(*c) >= CUTOFF_FREQ) - // and the pattern must be long enough to be worthwhile. - // memchr will be faster on `e` because it is short - // even though e is quite common. - && pattern.len() > 7 + // The minimum pattern length required to use TBM. + const MIN_LEN: usize = 9; + // The minimum frequency rank (lower is rarer) that every byte in the + // pattern must have in order to use TBM. That is, if the pattern + // contains _any_ byte with a lower rank, then TBM won't be used. + const MIN_CUTOFF: usize = 150; + // The maximum frequency rank for any byte. 
+ const MAX_CUTOFF: usize = 255; + // The scaling factor used to determine the actual cutoff frequency + // to use (keeping in mind that the minimum frequency rank is bounded + // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more + // likely to be used as the pattern grows longer. That is, longer + // patterns permit somewhat less frequent bytes than shorter patterns, + // under the assumption that TBM gets better as the pattern gets + // longer. + const LEN_CUTOFF_PROPORTION: usize = 4; + + let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION); + let cutoff = cmp::max( + MIN_CUTOFF, + MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank), + ); + // The pattern must be long enough to be worthwhile. e.g., memchr will + // be faster on `e` because it is short even though e is quite common. + pattern.len() > MIN_LEN + // all the bytes must be more common than the cutoff. + && pattern.iter().all(|c| freq_rank(*c) >= cutoff) } /// Check to see if there is a match at the given position @@ -863,7 +867,7 @@ fn freq_rank(b: u8) -> usize { #[cfg(test)] mod tests { - use super::{BoyerMooreSearch, MemchrSearch}; + use super::{BoyerMooreSearch, FreqyPacked}; // // Unit Tests @@ -994,9 +998,9 @@ mod tests { }; let bm_searcher = BoyerMooreSearch::new(needle.clone()); - let memchr_searcher = MemchrSearch::new(needle); + let freqy_memchr = FreqyPacked::new(needle); TestResult::from_bool( - bm_searcher.find(haystack) == memchr_searcher.find(haystack)) + bm_searcher.find(haystack) == freqy_memchr.find(haystack)) } fn qc_bm_finds_trailing_needle( diff --git a/src/re_plugin.rs b/src/re_plugin.rs deleted file mode 100644 index afd828921b..0000000000 --- a/src/re_plugin.rs +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. 
-// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use re_trait::{RegularExpression, Slot, Locations, as_slots}; - -/// Plugin is the compiler plugin's data structure. It declare some static -/// data (like capture groups and the original regex string), but defines its -/// matching engine as a simple function. -#[doc(hidden)] -pub struct Plugin { - #[doc(hidden)] - pub original: &'static str, - #[doc(hidden)] - pub names: &'static &'static [Option<&'static str>], - #[doc(hidden)] - pub groups: &'static &'static [(&'static str, usize)], - #[doc(hidden)] - pub prog: fn(&mut [Slot], &str, usize) -> bool, -} - -impl Copy for Plugin {} - -impl Clone for Plugin { - fn clone(&self) -> Plugin { - *self - } -} - -impl RegularExpression for Plugin { - type Text = str; - - fn slots_len(&self) -> usize { - self.names.len() * 2 - } - - fn next_after_empty(&self, text: &str, i: usize) -> usize { - let b = match text.as_bytes().get(i) { - None => return text.len() + 1, - Some(&b) => b, - }; - let inc = if b <= 0x7F { - 1 - } else if b <= 0b110_11111 { - 2 - } else if b <= 0b1110_1111 { - 3 - } else { - 4 - }; - i + inc - } - - fn shortest_match_at(&self, text: &str, start: usize) -> Option { - self.find_at(text, start).map(|(_, e)| e) - } - - fn is_match_at(&self, text: &str, start: usize) -> bool { - (self.prog)(&mut [], text, start) - } - - fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { - let mut slots = [None, None]; - (self.prog)(&mut slots, text, start); - match (slots[0], slots[1]) { - (Some(s), Some(e)) => Some((s, e)), - _ => None, - } - } - - fn read_captures_at<'t>( - &self, - locs: &mut Locations, - text: &'t str, - start: usize, - ) -> Option<(usize, usize)> { - let slots = as_slots(locs); - for slot in slots.iter_mut() { - *slot = None; - } - (self.prog)(slots, text, start); - match (slots[0], 
slots[1]) { - (Some(s), Some(e)) => Some((s, e)), - _ => None, - } - } -} diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 7129dfa4c7..7fb68257bb 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -22,7 +22,6 @@ use error::Error; use exec::{Exec, ExecNoSyncStr}; use expand::expand_str; use re_builder::unicode::RegexBuilder; -use re_plugin::Plugin; use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter}; /// Escapes all regular expression meta characters in `text`. @@ -135,21 +134,7 @@ impl<'t> From> for &'t str { /// assert_eq!(haystack.split(&re).collect::>(), vec!["a", "b", "c"]); /// ``` #[derive(Clone)] -pub struct Regex(#[doc(hidden)] pub _Regex); - -#[derive(Clone)] -#[doc(hidden)] -pub enum _Regex { - // The representation of `Regex` is exported to support the `regex!` - // syntax extension. Do not rely on it. - // - // See the comments for the `internal` module in `lib.rs` for a more - // detailed explanation for what `regex!` requires. - #[doc(hidden)] - Dynamic(Exec), - #[doc(hidden)] - Plugin(Plugin), -} +pub struct Regex(Exec); impl fmt::Display for Regex { /// Shows the original regular expression. 
@@ -168,7 +153,7 @@ impl fmt::Debug for Regex { #[doc(hidden)] impl From for Regex { fn from(exec: Exec) -> Regex { - Regex(_Regex::Dynamic(exec)) + Regex(exec) } } @@ -257,16 +242,7 @@ impl Regex { /// # } /// ``` pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> { - match self.0 { - _Regex::Dynamic(ref exec) => { - let it = exec.searcher_str().find_iter(text); - Matches(MatchesInner::Dynamic(it)) - } - _Regex::Plugin(ref plug) => { - let it = plug.find_iter(text); - Matches(MatchesInner::Plugin(it)) - } - } + Matches(self.0.searcher_str().find_iter(text)) } /// Returns the capture groups corresponding to the leftmost-first @@ -337,7 +313,7 @@ impl Regex { self.read_captures_at(&mut locs, text, 0).map(|_| Captures { text: text, locs: locs, - named_groups: NamedGroups::from_regex(self) + named_groups: self.0.capture_name_idx().clone(), }) } @@ -370,16 +346,7 @@ impl Regex { &'r self, text: &'t str, ) -> CaptureMatches<'r, 't> { - match self.0 { - _Regex::Dynamic(ref exec) => { - let it = exec.searcher_str().captures_iter(text); - CaptureMatches(CaptureMatchesInner::Dynamic(it)) - } - _Regex::Plugin(ref plug) => { - let it = plug.captures_iter(text); - CaptureMatches(CaptureMatchesInner::Plugin(it)) - } - } + CaptureMatches(self.0.searcher_str().captures_iter(text)) } /// Returns an iterator of substrings of `text` delimited by a match of the @@ -663,12 +630,7 @@ impl Regex { text: &str, start: usize, ) -> Option { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().shortest_match_at(text, start) - } - _Regex::Plugin(ref plug) => plug.shortest_match_at(text, start), - } + self.0.searcher_str().shortest_match_at(text, start) } /// Returns the same as is_match, but starts the search at the given @@ -694,16 +656,9 @@ impl Regex { text: &'t str, start: usize, ) -> Option> { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().find_at(text, start).map(|(s, e)| { - Match::new(text, s, e) - }) - } - 
_Regex::Plugin(ref plug) => { - plug.find_at(text, start).map(|(s, e)| Match::new(text, s, e)) - } - } + self.0.searcher_str().find_at(text, start).map(|(s, e)| { + Match::new(text, s, e) + }) } /// Returns the same as captures, but starts the search at the given @@ -719,16 +674,10 @@ impl Regex { text: &'t str, start: usize, ) -> Option> { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().read_captures_at(locs, text, start) - .map(|(s, e)| Match::new(text, s, e)) - } - _Regex::Plugin(ref plug) => { - plug.read_captures_at(locs, text, start) - .map(|(s, e)| Match::new(text, s, e)) - } - } + self.0 + .searcher_str() + .read_captures_at(locs, text, start) + .map(|(s, e)| Match::new(text, s, e)) } } @@ -736,40 +685,24 @@ impl Regex { impl Regex { /// Returns the original string of this regex. pub fn as_str(&self) -> &str { - match self.0 { - _Regex::Dynamic(ref exec) => &exec.regex_strings()[0], - _Regex::Plugin(ref plug) => plug.original, - } + &self.0.regex_strings()[0] } /// Returns an iterator over the capture names. pub fn capture_names(&self) -> CaptureNames { - CaptureNames(match self.0 { - _Regex::Plugin(ref n) => _CaptureNames::Plugin(n.names.iter()), - _Regex::Dynamic(ref d) => { - _CaptureNames::Dynamic(d.capture_names().iter()) - } - }) + CaptureNames(self.0.capture_names().iter()) } /// Returns the number of captures. pub fn captures_len(&self) -> usize { - match self.0 { - _Regex::Plugin(ref n) => n.names.len(), - _Regex::Dynamic(ref d) => d.capture_names().len() - } + self.0.capture_names().len() } /// Returns an empty set of locations that can be reused in multiple calls /// to `read_captures`. #[doc(hidden)] pub fn locations(&self) -> Locations { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().locations() - } - _Regex::Plugin(ref plug) => plug.locations(), - } + self.0.searcher_str().locations() } } @@ -779,30 +712,20 @@ impl Regex { /// whole matched region) is always unnamed. 
/// /// `'r` is the lifetime of the compiled regular expression. -pub struct CaptureNames<'r>(_CaptureNames<'r>); - -enum _CaptureNames<'r> { - Plugin(::std::slice::Iter<'r, Option<&'static str>>), - Dynamic(::std::slice::Iter<'r, Option>) -} +pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option>); impl<'r> Iterator for CaptureNames<'r> { type Item = Option<&'r str>; fn next(&mut self) -> Option> { - match self.0 { - _CaptureNames::Plugin(ref mut i) => i.next().cloned(), - _CaptureNames::Dynamic(ref mut i) => { - i.next().as_ref().map(|o| o.as_ref().map(|s| s.as_ref())) - } - } + self.0 + .next() + .as_ref() + .map(|slot| slot.as_ref().map(|name| name.as_ref())) } fn size_hint(&self) -> (usize, Option) { - match self.0 { - _CaptureNames::Plugin(ref i) => i.size_hint(), - _CaptureNames::Dynamic(ref i) => i.size_hint(), - } + self.0.size_hint() } } @@ -819,7 +742,7 @@ impl<'r, 't> Iterator for Split<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { - let text = self.finder.text(); + let text = self.finder.0.text(); match self.finder.next() { None => { if self.last >= text.len() { @@ -859,7 +782,7 @@ impl<'r, 't> Iterator for SplitN<'r, 't> { } self.n -= 1; if self.n == 0 { - let text = self.splits.finder.text(); + let text = self.splits.finder.0.text(); Some(&text[self.splits.last..]) } else { self.splits.next() @@ -867,59 +790,6 @@ impl<'r, 't> Iterator for SplitN<'r, 't> { } } -enum NamedGroups { - Plugin(&'static [(&'static str, usize)]), - Dynamic(Arc>), -} - -impl NamedGroups { - fn from_regex(regex: &Regex) -> NamedGroups { - match regex.0 { - _Regex::Plugin(ref plug) => NamedGroups::Plugin(plug.groups), - _Regex::Dynamic(ref exec) => { - NamedGroups::Dynamic(exec.capture_name_idx().clone()) - } - } - } - - fn pos(&self, name: &str) -> Option { - match *self { - NamedGroups::Plugin(groups) => { - groups.binary_search_by(|&(n, _)| n.cmp(name)) - .ok().map(|i| groups[i].1) - }, - NamedGroups::Dynamic(ref groups) => { - 
groups.get(name).cloned() - }, - } - } - - fn iter(& self) -> NamedGroupsIter { - match *self { - NamedGroups::Plugin(g) => NamedGroupsIter::Plugin(g.iter()), - NamedGroups::Dynamic(ref g) => NamedGroupsIter::Dynamic(g.iter()), - } - } -} - -enum NamedGroupsIter<'n> { - Plugin(::std::slice::Iter<'static, (&'static str, usize)>), - Dynamic(::std::collections::hash_map::Iter<'n, String, usize>), -} - -impl<'n> Iterator for NamedGroupsIter<'n> { - type Item = (&'n str, usize); - - fn next(&mut self) -> Option { - match *self { - NamedGroupsIter::Plugin(ref mut it) => it.next().cloned(), - NamedGroupsIter::Dynamic(ref mut it) => { - it.next().map(|(s, i)| (s.as_ref(), *i)) - } - } - } -} - /// Captures represents a group of captured strings for a single match. /// /// The 0th capture always corresponds to the entire match. Each subsequent @@ -934,7 +804,7 @@ impl<'n> Iterator for NamedGroupsIter<'n> { pub struct Captures<'t> { text: &'t str, locs: Locations, - named_groups: NamedGroups, + named_groups: Arc>, } impl<'t> Captures<'t> { @@ -964,7 +834,7 @@ impl<'t> Captures<'t> { /// Returns the match for the capture group named `name`. If `name` isn't a /// valid capture group or didn't match anything, then `None` is returned. pub fn name(&self, name: &str) -> Option> { - self.named_groups.pos(name).and_then(|i| self.get(i)) + self.named_groups.get(name).and_then(|&i| self.get(i)) } /// An iterator that yields all capturing matches in the order in which @@ -1021,7 +891,7 @@ impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // We'd like to show something nice here, even if it means an // allocation to build a reverse index. 
- let slot_to_name: HashMap = + let slot_to_name: HashMap<&usize, &String> = self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); let mut map = f.debug_map(); for (slot, m) in self.0.locs.iter().enumerate() { @@ -1107,34 +977,17 @@ impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string. -pub struct CaptureMatches<'r, 't>(CaptureMatchesInner<'r, 't>); - -enum CaptureMatchesInner<'r, 't> { - Dynamic(re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>), - Plugin(re_trait::CaptureMatches<'t, Plugin>), -} +pub struct CaptureMatches<'r, 't>(re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>); impl<'r, 't> Iterator for CaptureMatches<'r, 't> { type Item = Captures<'t>; fn next(&mut self) -> Option> { - match self.0 { - CaptureMatchesInner::Dynamic(ref mut it) => { - let named = it.regex().capture_name_idx().clone(); - it.next().map(|locs| Captures { - text: it.text(), - locs: locs, - named_groups: NamedGroups::Dynamic(named), - }) - } - CaptureMatchesInner::Plugin(ref mut it) => { - it.next().map(|locs| Captures { - text: it.text(), - locs: locs, - named_groups: NamedGroups::Plugin(it.regex().groups), - }) - } - } + self.0.next().map(|locs| Captures { + text: self.0.text(), + locs: locs, + named_groups: self.0.regex().capture_name_idx().clone(), + }) } } @@ -1145,35 +998,14 @@ impl<'r, 't> Iterator for CaptureMatches<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string. 
-pub struct Matches<'r, 't>(MatchesInner<'r, 't>); - -enum MatchesInner<'r, 't> { - Dynamic(re_trait::Matches<'t, ExecNoSyncStr<'r>>), - Plugin(re_trait::Matches<'t, Plugin>), -} - -impl<'r, 't> Matches<'r, 't> { - fn text(&self) -> &'t str { - match self.0 { - MatchesInner::Dynamic(ref it) => it.text(), - MatchesInner::Plugin(ref it) => it.text(), - } - } -} +pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSyncStr<'r>>); impl<'r, 't> Iterator for Matches<'r, 't> { type Item = Match<'t>; fn next(&mut self) -> Option> { - let text = self.text(); - match self.0 { - MatchesInner::Dynamic(ref mut it) => { - it.next().map(|(s, e)| Match::new(text, s, e)) - } - MatchesInner::Plugin(ref mut it) => { - it.next().map(|(s, e)| Match::new(text, s, e)) - } - } + let text = self.0.text(); + self.0.next().map(|(s, e)| Match::new(text, s, e)) } } diff --git a/tests/replace.rs b/tests/replace.rs index 6e555b93ea..28b9df927e 100644 --- a/tests/replace.rs +++ b/tests/replace.rs @@ -39,3 +39,6 @@ replace!(match_at_start_replace_with_empty, replace_all, r"foo", "foobar", t!("" // See https://github.com/rust-lang/regex/issues/393 replace!(single_empty_match, replace, r"^", "bar", t!("foo"), "foobar"); + +// See https://github.com/rust-lang/regex/issues/399 +replace!(capture_longest_possible_name, replace_all, r"(.)", "b", t!("${1}a $1a"), "ba "); diff --git a/tests/test_plugin.rs b/tests/test_plugin.rs deleted file mode 100644 index b4bc973433..0000000000 --- a/tests/test_plugin.rs +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. 
- -#![feature(plugin, test)] -#![plugin(regex_macros)] - -extern crate rand; -extern crate regex; -extern crate test; - -// Must come before other module definitions. -include!("macros_str.rs"); -include!("macros.rs"); - -mod api; -mod api_str; -mod crazy; -mod flags; -mod fowler; -mod multiline; -mod plugin; -mod replace; -mod suffix_reverse; -mod unicode;