diff --git a/Cargo.lock b/Cargo.lock index cce72a7b4..6083716fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5,7 +5,6 @@ dependencies = [ "deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "docopt 0.6.86 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", - "glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", "globset 0.1.0", "grep 0.1.3", "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -73,11 +72,6 @@ dependencies = [ "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "glob" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "globset" version = "0.1.0" @@ -250,7 +244,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" "checksum fnv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6cc484842f1e2884faf56f529f960cc12ad8c71ce96cc7abba0a067c98fee344" "checksum fs2 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "bcd414e5a1a979b931bb92f41b7a54106d3f6d2e6c253e9ce943b7cd468251ef" -"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" "checksum lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "49247ec2a285bb3dcb23cbd9c35193c025e7251bfce77c1d5da97e6362dffe7f" "checksum libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)" = "408014cace30ee0f767b1c4517980646a573ec61a57957aeeabcac8ac0a02e8d" diff --git a/Cargo.toml b/Cargo.toml index b44469740..521cdad0a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,8 +46,5 @@ winapi = "0.2" [features] simd-accel = ["regex/simd-accel"] -[dev-dependencies] -glob = "0.2" - [profile.release] debug = true diff --git a/benches/README.md b/benches/README.md deleted file mode 100644 index 18cf91317..000000000 --- a/benches/README.md +++ /dev/null @@ -1,5 +0,0 @@ -These are internal microbenchmarks for tracking the peformance of individual -components inside of ripgrep. At the moment, they aren't heavily used. - -For performance benchmarks of ripgrep proper, see the sibling `benchsuite` -directory. 
diff --git a/ci/script.sh b/ci/script.sh index ee43f88e0..eca6c0f60 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -19,6 +19,10 @@ run_test_suite() { cargo clean --target $TARGET --verbose cargo build --target $TARGET --verbose cargo test --target $TARGET --verbose + cargo build --target $TARGET --verbose --manifest-path grep/Cargo.toml + cargo test --target $TARGET --verbose --manifest-path grep/Cargo.toml + cargo build --target $TARGET --verbose --manifest-path globset/Cargo.toml + cargo test --target $TARGET --verbose --manifest-path globset/Cargo.toml # sanity check the file type file target/$TARGET/debug/rg diff --git a/globset/Cargo.toml b/globset/Cargo.toml index cf63f397d..67a954dd9 100644 --- a/globset/Cargo.toml +++ b/globset/Cargo.toml @@ -3,6 +3,10 @@ name = "globset" version = "0.1.0" authors = ["Andrew Gallant "] +[lib] +name = "globset" +bench = false + [dependencies] aho-corasick = "0.5.3" fnv = "1.0" @@ -10,3 +14,6 @@ lazy_static = "0.2" log = "0.3" memchr = "0.1" regex = "0.1.77" + +[dev-dependencies] +glob = "0.2" diff --git a/globset/README.md b/globset/README.md new file mode 100644 index 000000000..f40b8aac5 --- /dev/null +++ b/globset/README.md @@ -0,0 +1,122 @@ +globset +======= +Cross platform single glob and glob set matching. Glob set matching is the +process of matching one or more glob patterns against a single candidate path +simultaneously, and returning all of the globs that matched. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.png)](https://travis-ci.org/BurntSushi/ripgrep) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep) +[![](https://img.shields.io/crates/v/globset.svg)](https://crates.io/crates/globset) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + +### Documentation + +[https://docs.rs/globset](https://docs.rs/globset) + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +globset = "0.1" +``` + +and this to your crate root: + +```rust +extern crate globset; +``` + +### Example: one glob + +This example shows how to match a single glob against a single file path. + +```rust +use globset::Glob; + +let glob = try!(Glob::new("*.rs")).compile_matcher(); + +assert!(glob.is_match("foo.rs")); +assert!(glob.is_match("foo/bar.rs")); +assert!(!glob.is_match("Cargo.toml")); +``` + +### Example: configuring a glob matcher + +This example shows how to use a `GlobBuilder` to configure aspects of match +semantics. In this example, we prevent wildcards from matching path separators. + +```rust +use globset::GlobBuilder; + +let glob = try!(GlobBuilder::new("*.rs") + .literal_separator(true).build()).compile_matcher(); + +assert!(glob.is_match("foo.rs")); +assert!(!glob.is_match("foo/bar.rs")); // no longer matches +assert!(!glob.is_match("Cargo.toml")); +``` + +### Example: match multiple globs at once + +This example shows how to match multiple glob patterns at once. + +```rust +use globset::{Glob, GlobSetBuilder}; + +let mut builder = GlobSetBuilder::new(); +// A GlobBuilder can be used to configure each glob's match semantics +// independently. 
+builder.add(try!(Glob::new("*.rs"))); +builder.add(try!(Glob::new("src/lib.rs"))); +builder.add(try!(Glob::new("src/**/foo.rs"))); +let set = try!(builder.build()); + +assert_eq!(set.matches("src/bar/baz/foo.rs"), vec![0, 2]); +``` + +### Performance + +This crate implements globs by converting them to regular expressions, and +executing them with the +[`regex`](https://github.com/rust-lang-nursery/regex) +crate. + +For single glob matching, performance of this crate should be roughly on par +with the performance of the +[`glob`](https://github.com/rust-lang-nursery/glob) +crate. (`*_regex` correspond to benchmarks for this library while `*_glob` +correspond to benchmarks for the `glob` library.) +Optimizations in the `regex` crate may propel this library past `glob`, +particularly when matching longer paths. + +``` +test ext_glob ... bench: 425 ns/iter (+/- 21) +test ext_regex ... bench: 175 ns/iter (+/- 10) +test long_glob ... bench: 182 ns/iter (+/- 11) +test long_regex ... bench: 173 ns/iter (+/- 10) +test short_glob ... bench: 69 ns/iter (+/- 4) +test short_regex ... bench: 83 ns/iter (+/- 2) +``` + +The primary performance advantage of this crate is when matching multiple +globs against a single path. With the `glob` crate, one must match each glob +synchronously, one after the other. In this crate, many can be matched +simultaneously. For example: + +``` +test many_short_glob ... bench: 1,063 ns/iter (+/- 47) +test many_short_regex_set ... bench: 186 ns/iter (+/- 11) +``` + +### Comparison with the [`glob`](https://github.com/rust-lang-nursery/glob) crate + +* Supports alternate "or" globs, e.g., `*.{foo,bar}`. +* Can match non-UTF-8 file paths correctly. +* Supports matching multiple globs at once. +* Doesn't provide a recursive directory iterator of matching file paths, + although I believe this crate should grow one eventually. +* Supports case insensitive and require-literal-separator match options, but + **doesn't** support the require-literal-leading-dot option. diff --git a/benches/bench.rs b/globset/benches/bench.rs similarity index 62% rename from benches/bench.rs rename to globset/benches/bench.rs index a71e149af..a151645d1 100644 --- a/benches/bench.rs +++ b/globset/benches/bench.rs @@ -5,39 +5,52 @@ tool itself, see the benchsuite directory. 
#![feature(test)] extern crate glob; +extern crate globset; #[macro_use] extern crate lazy_static; extern crate regex; extern crate test; +use globset::{Candidate, Glob, GlobMatcher, GlobSet, GlobSetBuilder}; + +const EXT: &'static str = "some/a/bigger/path/to/the/crazy/needle.txt"; +const EXT_PAT: &'static str = "*.txt"; + const SHORT: &'static str = "some/needle.txt"; const SHORT_PAT: &'static str = "some/**/needle.txt"; const LONG: &'static str = "some/a/bigger/path/to/the/crazy/needle.txt"; const LONG_PAT: &'static str = "some/**/needle.txt"; -#[allow(dead_code, unused_variables)] -#[path = "../src/glob.rs"] -mod reglob; - fn new_glob(pat: &str) -> glob::Pattern { glob::Pattern::new(pat).unwrap() } -fn new_reglob(pat: &str) -> reglob::Set { - let mut builder = reglob::SetBuilder::new(); - builder.add(pat).unwrap(); - builder.build().unwrap() +fn new_reglob(pat: &str) -> GlobMatcher { + Glob::new(pat).unwrap().compile_matcher() } -fn new_reglob_many(pats: &[&str]) -> reglob::Set { - let mut builder = reglob::SetBuilder::new(); +fn new_reglob_many(pats: &[&str]) -> GlobSet { + let mut builder = GlobSetBuilder::new(); for pat in pats { - builder.add(pat).unwrap(); + builder.add(Glob::new(pat).unwrap()); } builder.build().unwrap() } +#[bench] +fn ext_glob(b: &mut test::Bencher) { + let pat = new_glob(EXT_PAT); + b.iter(|| assert!(pat.matches(EXT))); +} + +#[bench] +fn ext_regex(b: &mut test::Bencher) { + let set = new_reglob(EXT_PAT); + let cand = Candidate::new(EXT); + b.iter(|| assert!(set.is_match_candidate(&cand))); +} + #[bench] fn short_glob(b: &mut test::Bencher) { let pat = new_glob(SHORT_PAT); @@ -47,7 +60,8 @@ fn short_glob(b: &mut test::Bencher) { #[bench] fn short_regex(b: &mut test::Bencher) { let set = new_reglob(SHORT_PAT); - b.iter(|| assert!(set.is_match(SHORT))); + let cand = Candidate::new(SHORT); + b.iter(|| assert!(set.is_match_candidate(&cand))); } #[bench] @@ -59,7 +73,8 @@ fn long_glob(b: &mut test::Bencher) { #[bench] fn long_regex(b: &mut test::Bencher) { let set = new_reglob(LONG_PAT); - b.iter(|| assert!(set.is_match(LONG))); + let cand = Candidate::new(LONG); + b.iter(|| assert!(set.is_match_candidate(&cand))); } const MANY_SHORT_GLOBS: &'static [&'static str] = &[ @@ -101,26 +116,3 @@ fn many_short_regex_set(b: &mut test::Bencher) { let set = new_reglob_many(MANY_SHORT_GLOBS); b.iter(|| assert_eq!(2, set.matches(MANY_SHORT_SEARCH).iter().count())); } - -// This is the fastest on my system (beating many_glob by about 2x). This -// suggests that a RegexSet needs quite a few regexes (or a larger haystack) -// in order for it to scale. -// -// TODO(burntsushi): come up with a benchmark that uses more complex patterns -// or a longer haystack. 
-#[bench] -fn many_short_regex_pattern(b: &mut test::Bencher) { - let pats: Vec<_> = MANY_SHORT_GLOBS.iter().map(|&s| { - let pat = reglob::Pattern::new(s).unwrap(); - regex::Regex::new(&pat.to_regex()).unwrap() - }).collect(); - b.iter(|| { - let mut count = 0; - for pat in &pats { - if pat.is_match(MANY_SHORT_SEARCH) { - count += 1; - } - } - assert_eq!(2, count); - }) -} diff --git a/globset/src/pattern.rs b/globset/src/glob.rs similarity index 84% rename from globset/src/pattern.rs rename to globset/src/glob.rs index 1eff726aa..279d52016 100644 --- a/globset/src/pattern.rs +++ b/globset/src/glob.rs @@ -2,14 +2,13 @@ use std::ffi::{OsStr, OsString}; use std::fmt; use std::iter; use std::ops::{Deref, DerefMut}; -use std::path::Path; +use std::path::{Path, is_separator}; use std::str; use regex; use regex::bytes::Regex; -use {Error, FILE_SEPARATORS, new_regex}; -use pathutil::path_bytes; +use {Candidate, Error, new_regex}; /// Describes a matching strategy for a particular pattern. /// @@ -54,7 +53,7 @@ pub enum MatchStrategy { impl MatchStrategy { /// Returns a matching strategy for the given pattern. - pub fn new(pat: &Pattern) -> MatchStrategy { + pub fn new(pat: &Glob) -> MatchStrategy { if let Some(lit) = pat.basename_literal() { MatchStrategy::BasenameLiteral(lit) } else if let Some(lit) = pat.literal() { @@ -73,19 +72,19 @@ impl MatchStrategy { } } -/// Pattern represents a successfully parsed shell glob pattern. +/// Glob represents a successfully parsed shell glob pattern. /// /// It cannot be used directly to match file paths, but it can be converted -/// to a regular expression string. +/// to a regular expression string or a matcher. #[derive(Clone, Debug, Eq, PartialEq)] -pub struct Pattern { +pub struct Glob { glob: String, re: String, - opts: PatternOptions, + opts: GlobOptions, tokens: Tokens, } -impl fmt::Display for Pattern { +impl fmt::Display for Glob { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { self.glob.fmt(f) } @@ -93,52 +92,55 @@ impl fmt::Display for Pattern { /// A matcher for a single pattern. #[derive(Clone, Debug)] -pub struct PatternMatcher { +pub struct GlobMatcher { /// The underlying pattern. - pat: Pattern, + pat: Glob, /// The pattern, as a compiled regex. re: Regex, } -impl PatternMatcher { +impl GlobMatcher { /// Tests whether the given path matches this pattern or not. pub fn is_match>(&self, path: P) -> bool { - self.re.is_match(&*path_bytes(path.as_ref())) + self.is_match_candidate(&Candidate::new(path.as_ref())) + } + + /// Tests whether the given path matches this pattern or not. + pub fn is_match_candidate(&self, path: &Candidate) -> bool { + self.re.is_match(&path.path) } } /// A strategic matcher for a single pattern. #[cfg(test)] #[derive(Clone, Debug)] -struct PatternStrategic { +struct GlobStrategic { /// The match strategy to use. strategy: MatchStrategy, /// The underlying pattern. - pat: Pattern, + pat: Glob, /// The pattern, as a compiled regex. re: Regex, } #[cfg(test)] -impl PatternStrategic { +impl GlobStrategic { /// Tests whether the given path matches this pattern or not. - pub fn is_match>(&self, path: P) -> bool { - use pathutil::file_name_ext; + fn is_match>(&self, path: P) -> bool { + self.is_match_candidate(&Candidate::new(path.as_ref())) + } - let cow_path = path_bytes(path.as_ref()); - let byte_path = &*cow_path; + /// Tests whether the given path matches this pattern or not. 
+ fn is_match_candidate(&self, candidate: &Candidate) -> bool { + let byte_path = &*candidate.path; match self.strategy { MatchStrategy::Literal(ref lit) => lit.as_bytes() == byte_path, MatchStrategy::BasenameLiteral(ref lit) => { - let lit = OsStr::new(lit); - path.as_ref().file_name().map(|n| n == lit).unwrap_or(false) + lit.as_bytes() == &*candidate.basename } MatchStrategy::Extension(ref ext) => { - path.as_ref().file_name() - .and_then(file_name_ext) - .map(|got| got == ext) - .unwrap_or(false) + candidate.ext == ext } MatchStrategy::Prefix(ref pre) => { starts_with(pre.as_bytes(), byte_path) @@ -150,10 +152,7 @@ impl PatternStrategic { ends_with(suffix.as_bytes(), byte_path) } MatchStrategy::RequiredExtension(ref ext) => { - path.as_ref().file_name() - .and_then(file_name_ext) - .map(|got| got == ext && self.re.is_match(byte_path)) - .unwrap_or(false) + candidate.ext == ext && self.re.is_match(byte_path) } MatchStrategy::Regex => self.re.is_match(byte_path), } @@ -167,15 +166,15 @@ impl PatternStrategic { /// /// The lifetime `'a` refers to the lifetime of the pattern string. #[derive(Clone, Debug)] -pub struct PatternBuilder<'a> { +pub struct GlobBuilder<'a> { /// The glob pattern to compile. glob: &'a str, /// Options for the pattern. - opts: PatternOptions, + opts: GlobOptions, } #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] -struct PatternOptions { +struct GlobOptions { /// Whether to match case insensitively. case_insensitive: bool, /// Whether to require a literal separator to match a separator in a file @@ -210,17 +209,17 @@ enum Token { Alternates(Vec), } -impl Pattern { +impl Glob { /// Builds a new pattern with default options. - pub fn new(glob: &str) -> Result { - PatternBuilder::new(glob).build() + pub fn new(glob: &str) -> Result { + GlobBuilder::new(glob).build() } /// Returns a matcher for this pattern. - pub fn compile_matcher(&self) -> PatternMatcher { + pub fn compile_matcher(&self) -> GlobMatcher { let re = new_regex(&self.re) .expect("regex compilation shouldn't fail"); - PatternMatcher { + GlobMatcher { pat: self.clone(), re: re, } @@ -230,13 +229,13 @@ impl Pattern { /// /// This isn't exposed because it's not clear whether it's actually /// faster than just running a regex for a *single* pattern. If it - /// is faster, then PatternMatcher should do it automatically. + /// is faster, then GlobMatcher should do it automatically. #[cfg(test)] - fn compile_strategic_matcher(&self) -> PatternStrategic { + fn compile_strategic_matcher(&self) -> GlobStrategic { let strategy = MatchStrategy::new(self); let re = new_regex(&self.re) .expect("regex compilation shouldn't fail"); - PatternStrategic { + GlobStrategic { strategy: strategy, pat: self.clone(), re: re, @@ -253,30 +252,11 @@ impl Pattern { &self.re } - /// Returns true if and only if this pattern only inspects the basename - /// of a path. - pub fn is_only_basename(&self) -> bool { - match self.tokens.get(0) { - Some(&Token::RecursivePrefix) => {} - _ => return false, - } - for t in &self.tokens[1..] { - match *t { - Token::Literal(c) if c == '/' || c == '\\' => return false, - Token::RecursivePrefix - | Token::RecursiveSuffix - | Token::RecursiveZeroOrMore => return false, - _ => {} - } - } - true - } - /// Returns the pattern as a literal if and only if the pattern must match /// an entire path exactly. /// /// The basic format of these patterns is `{literal}`. 
- pub fn literal(&self) -> Option { + fn literal(&self) -> Option { if self.opts.case_insensitive { return None; } @@ -301,7 +281,7 @@ impl Pattern { /// std::path::Path::extension returns. Namely, this extension includes /// the '.'. Also, paths like `.rs` are considered to have an extension /// of `.rs`. - pub fn ext(&self) -> Option { + fn ext(&self) -> Option { if self.opts.case_insensitive { return None; } @@ -343,7 +323,7 @@ impl Pattern { /// This is like `ext`, but returns an extension even if it isn't sufficent /// to imply a match. Namely, if an extension is returned, then it is /// necessary but not sufficient for a match. - pub fn required_ext(&self) -> Option { + fn required_ext(&self) -> Option { if self.opts.case_insensitive { return None; } @@ -372,7 +352,7 @@ impl Pattern { /// Returns a literal prefix of this pattern if the entire pattern matches /// if the literal prefix matches. - pub fn prefix(&self) -> Option { + fn prefix(&self) -> Option { if self.opts.case_insensitive { return None; } @@ -417,7 +397,7 @@ impl Pattern { /// /// When this returns true, the suffix literal is guaranteed to start with /// a `/`. - pub fn suffix(&self) -> Option<(String, bool)> { + fn suffix(&self) -> Option<(String, bool)> { if self.opts.case_insensitive { return None; } @@ -520,16 +500,7 @@ impl Pattern { /// /// The basic format of these patterns is `**/{literal}`, where `{literal}` /// does not contain a path separator. - pub fn basename_literal(&self) -> Option { - self.base_literal() - } - - /// Returns the pattern as a literal if and only if the pattern exclusiely - /// matches the basename of a file path *and* is a literal. - /// - /// The basic format of these patterns is `**/{literal}`, where `{literal}` - /// does not contain a path separator. - pub fn base_literal(&self) -> Option { + fn basename_literal(&self) -> Option { let tokens = match self.basename_tokens() { None => return None, Some(tokens) => tokens, @@ -543,102 +514,21 @@ impl Pattern { } Some(lit) } - - /// Returns a literal prefix of this pattern if and only if the entire - /// pattern matches if the literal prefix matches. - pub fn literal_prefix(&self) -> Option { - match self.tokens.last() { - Some(&Token::ZeroOrMore) => {} - _ => return None, - } - let mut lit = String::new(); - for t in &self.tokens[0..self.tokens.len()-1] { - match *t { - Token::Literal(c) => lit.push(c), - _ => return None, - } - } - Some(lit) - } - - /// Returns a literal suffix of this pattern if and only if the entire - /// pattern matches if the literal suffix matches. - pub fn literal_suffix(&self) -> Option { - match self.tokens.get(0) { - Some(&Token::RecursivePrefix) => {} - _ => return None, - } - let start = - match self.tokens.get(1) { - Some(&Token::ZeroOrMore) => 2, - _ => 1, - }; - let mut lit = String::new(); - for t in &self.tokens[start..] { - match *t { - Token::Literal(c) => lit.push(c), - _ => return None, - } - } - Some(lit) - } - - /// Returns a basename literal prefix of this pattern. - pub fn base_literal_prefix(&self) -> Option { - match self.tokens.get(0) { - Some(&Token::RecursivePrefix) => {} - _ => return None, - } - match self.tokens.last() { - Some(&Token::ZeroOrMore) => {} - _ => return None, - } - let mut lit = String::new(); - for t in &self.tokens[1..self.tokens.len()-1] { - match *t { - Token::Literal(c) if c == '/' || c == '\\' => return None, - Token::Literal(c) => lit.push(c), - _ => return None, - } - } - Some(lit) - } - - /// Returns a basename literal suffix of this pattern. 
- pub fn base_literal_suffix(&self) -> Option { - match self.tokens.get(0) { - Some(&Token::RecursivePrefix) => {} - _ => return None, - } - match self.tokens.get(1) { - Some(&Token::ZeroOrMore) => {} - _ => return None, - } - let mut lit = String::new(); - for t in &self.tokens[2..] { - match *t { - Token::Literal(c) if c == '/' || c == '\\' => return None, - Token::Literal(c) => lit.push(c), - _ => return None, - } - } - Some(lit) - } } -impl<'a> PatternBuilder<'a> { +impl<'a> GlobBuilder<'a> { /// Create a new builder for the pattern given. /// /// The pattern is not compiled until `build` is called. - pub fn new(glob: &'a str) -> PatternBuilder<'a> { - PatternBuilder { + pub fn new(glob: &'a str) -> GlobBuilder<'a> { + GlobBuilder { glob: glob, - opts: PatternOptions::default(), + opts: GlobOptions::default(), } } /// Parses and builds the pattern. - pub fn build(&self) -> Result { + pub fn build(&self) -> Result { let mut p = Parser { stack: vec![Tokens::default()], chars: self.glob.chars().peekable(), @@ -652,7 +542,7 @@ impl<'a> PatternBuilder<'a> { Err(Error::UnclosedAlternates) } else { let tokens = p.stack.pop().unwrap(); - Ok(Pattern { + Ok(Glob { glob: self.glob.to_string(), re: tokens.to_regex_with(&self.opts), opts: self.opts, @@ -664,13 +554,13 @@ impl<'a> PatternBuilder<'a> { /// Toggle whether the pattern matches case insensitively or not. /// /// This is disabled by default. - pub fn case_insensitive(&mut self, yes: bool) -> &mut PatternBuilder<'a> { + pub fn case_insensitive(&mut self, yes: bool) -> &mut GlobBuilder<'a> { self.opts.case_insensitive = yes; self } /// Toggle whether a literal `/` is required to match a path separator. - pub fn literal_separator(&mut self, yes: bool) -> &mut PatternBuilder<'a> { + pub fn literal_separator(&mut self, yes: bool) -> &mut GlobBuilder<'a> { self.opts.literal_separator = yes; self } @@ -680,7 +570,7 @@ impl Tokens { /// Convert this pattern to a string that is guaranteed to be a valid /// regular expression and will represent the matching semantics of this /// glob pattern and the options given. - fn to_regex_with(&self, options: &PatternOptions) -> String { + fn to_regex_with(&self, options: &GlobOptions) -> String { let mut re = String::new(); re.push_str("(?-u)"); if options.case_insensitive { @@ -699,43 +589,39 @@ impl Tokens { re } - fn tokens_to_regex( &self, - options: &PatternOptions, + options: &GlobOptions, tokens: &[Token], re: &mut String, ) { - let seps = &*FILE_SEPARATORS; - for tok in tokens { match *tok { Token::Literal(c) => { - re.push_str(®ex::quote(&c.to_string())); + re.push_str(&char_to_escaped_literal(c)); } Token::Any => { if options.literal_separator { - re.push_str(&format!("[^{}]", seps)); + re.push_str("[^/]"); } else { re.push_str("."); } } Token::ZeroOrMore => { if options.literal_separator { - re.push_str(&format!("[^{}]*", seps)); + re.push_str("[^/]*"); } else { re.push_str(".*"); } } Token::RecursivePrefix => { - re.push_str(&format!("(?:[{sep}]?|.*[{sep}])", sep=seps)); + re.push_str("(?:/?|.*/)"); } Token::RecursiveSuffix => { - re.push_str(&format!("(?:[{sep}]?|[{sep}].*)", sep=seps)); + re.push_str("(?:/?|/.*)"); } Token::RecursiveZeroOrMore => { - re.push_str(&format!("(?:[{sep}]|[{sep}].*[{sep}])", - sep=seps)); + re.push_str("(?:/|/.*/)"); } Token::Class { negated, ref ranges } => { re.push('['); @@ -745,11 +631,11 @@ impl Tokens { for r in ranges { if r.0 == r.1 { // Not strictly necessary, but nicer to look at. 
- re.push_str(®ex::quote(&r.0.to_string())); + re.push_str(&char_to_escaped_literal(r.0)); } else { - re.push_str(®ex::quote(&r.0.to_string())); + re.push_str(&char_to_escaped_literal(r.0)); re.push('-'); - re.push_str(®ex::quote(&r.1.to_string())); + re.push_str(&char_to_escaped_literal(r.1)); } } re.push(']'); @@ -768,6 +654,26 @@ impl Tokens { } } +/// Convert a Unicode scalar value to an escaped string suitable for use as +/// a literal in a non-Unicode regex. +fn char_to_escaped_literal(c: char) -> String { + bytes_to_escaped_literal(&c.to_string().into_bytes()) +} + +/// Converts an arbitrary sequence of bytes to a UTF-8 string. All non-ASCII +/// code units are converted to their escaped form. +fn bytes_to_escaped_literal(bs: &[u8]) -> String { + let mut s = String::with_capacity(bs.len()); + for &b in bs { + if b <= 0x7F { + s.push_str(®ex::quote(&(b as char).to_string())); + } else { + s.push_str(&format!("\\x{:02x}", b)); + } + } + s +} + struct Parser<'a> { stack: Vec, chars: iter::Peekable>, @@ -785,7 +691,14 @@ impl<'a> Parser<'a> { '{' => try!(self.push_alternate()), '}' => try!(self.pop_alternate()), ',' => try!(self.parse_comma()), - c => try!(self.push_token(Token::Literal(c))), + c => { + if is_separator(c) { + // Normalize all patterns to use / as a separator. + try!(self.push_token(Token::Literal('/'))) + } else { + try!(self.push_token(Token::Literal(c))) + } + } } } Ok(()) @@ -848,13 +761,13 @@ impl<'a> Parser<'a> { if !try!(self.have_tokens()) { try!(self.push_token(Token::RecursivePrefix)); let next = self.bump(); - if !next.is_none() && next != Some('/') { + if !next.map(is_separator).unwrap_or(true) { return Err(Error::InvalidRecursive); } return Ok(()); } try!(self.pop_token()); - if prev != Some('/') { + if !prev.map(is_separator).unwrap_or(false) { if self.stack.len() <= 1 || (prev != Some(',') && prev != Some('{')) { return Err(Error::InvalidRecursive); @@ -868,8 +781,8 @@ impl<'a> Parser<'a> { Some(&',') | Some(&'}') if self.stack.len() >= 2 => { self.push_token(Token::RecursiveSuffix) } - Some(&'/') => { - assert!(self.bump() == Some('/')); + Some(&c) if is_separator(c) => { + assert!(self.bump().map(is_separator).unwrap_or(false)); self.push_token(Token::RecursiveZeroOrMore) } _ => Err(Error::InvalidRecursive), @@ -973,8 +886,8 @@ fn ends_with(needle: &[u8], haystack: &[u8]) -> bool { mod tests { use std::ffi::{OsStr, OsString}; - use {SetBuilder, Error}; - use super::{Pattern, PatternBuilder, Token}; + use {GlobSetBuilder, Error}; + use super::{Glob, GlobBuilder, Token}; use super::Token::*; #[derive(Clone, Copy, Debug, Default)] @@ -987,7 +900,7 @@ mod tests { ($name:ident, $pat:expr, $tokens:expr) => { #[test] fn $name() { - let pat = Pattern::new($pat).unwrap(); + let pat = Glob::new($pat).unwrap(); assert_eq!($tokens, pat.tokens.0); } } @@ -997,7 +910,7 @@ mod tests { ($name:ident, $pat:expr, $err:expr) => { #[test] fn $name() { - let err = Pattern::new($pat).unwrap_err(); + let err = Glob::new($pat).unwrap_err(); assert_eq!($err, err); } } @@ -1010,7 +923,7 @@ mod tests { ($name:ident, $pat:expr, $re:expr, $options:expr) => { #[test] fn $name() { - let pat = PatternBuilder::new($pat) + let pat = GlobBuilder::new($pat) .case_insensitive($options.casei) .literal_separator($options.litsep) .build() @@ -1027,14 +940,14 @@ mod tests { ($name:ident, $pat:expr, $path:expr, $options:expr) => { #[test] fn $name() { - let pat = PatternBuilder::new($pat) + let pat = GlobBuilder::new($pat) .case_insensitive($options.casei) .literal_separator($options.litsep) 
.build() .unwrap(); let matcher = pat.compile_matcher(); let strategic = pat.compile_strategic_matcher(); - let set = SetBuilder::new().add(pat).build().unwrap(); + let set = GlobSetBuilder::new().add(pat).build().unwrap(); assert!(matcher.is_match($path)); assert!(strategic.is_match($path)); assert!(set.is_match($path)); @@ -1049,14 +962,14 @@ mod tests { ($name:ident, $pat:expr, $path:expr, $options:expr) => { #[test] fn $name() { - let pat = PatternBuilder::new($pat) + let pat = GlobBuilder::new($pat) .case_insensitive($options.casei) .literal_separator($options.litsep) .build() .unwrap(); let matcher = pat.compile_matcher(); let strategic = pat.compile_strategic_matcher(); - let set = SetBuilder::new().add(pat).build().unwrap(); + let set = GlobSetBuilder::new().add(pat).build().unwrap(); assert!(!matcher.is_match($path)); assert!(!strategic.is_match($path)); assert!(!set.is_match($path)); @@ -1146,8 +1059,8 @@ mod tests { toregex!(re_casei, "a", "(?i)^a$", &CASEI); - toregex!(re_slash1, "?", r"^[^/\\]$", SLASHLIT); - toregex!(re_slash2, "*", r"^[^/\\]*$", SLASHLIT); + toregex!(re_slash1, "?", r"^[^/]$", SLASHLIT); + toregex!(re_slash2, "*", r"^[^/]*$", SLASHLIT); toregex!(re1, "a", "^a$"); toregex!(re2, "?", "^.$"); @@ -1160,6 +1073,7 @@ mod tests { toregex!(re9, "[+]", r"^[\+]$"); toregex!(re10, "+", r"^\+$"); toregex!(re11, "**", r"^.*$"); + toregex!(re12, "☃", r"^\xe2\x98\x83$"); matches!(match1, "a", "a"); matches!(match2, "a*b", "a_b"); @@ -1170,6 +1084,7 @@ mod tests { matches!(match7, "a*a*a*a*a*a*a*a*a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); matches!(match8, "a*b[xyz]c*d", "abxcdbxcddd"); matches!(match9, "*.rs", ".rs"); + matches!(match10, "☃", "☃"); matches!(matchrec1, "some/**/needle.txt", "some/needle.txt"); matches!(matchrec2, "some/**/needle.txt", "some/one/needle.txt"); @@ -1239,10 +1154,16 @@ mod tests { matches!(matchalt13, "{*.foo,*.bar,*.wat}", "test.wat"); matches!(matchslash1, "abc/def", "abc/def", SLASHLIT); + #[cfg(unix)] nmatches!(matchslash2, "abc?def", "abc/def", SLASHLIT); - nmatches!(matchslash2_win, "abc?def", "abc\\def", SLASHLIT); + #[cfg(not(unix))] + nmatches!(matchslash2, "abc?def", "abc\\def", SLASHLIT); nmatches!(matchslash3, "abc*def", "abc/def", SLASHLIT); matches!(matchslash4, "abc[/]def", "abc/def", SLASHLIT); // differs + #[cfg(unix)] + nmatches!(matchslash5, "abc\\def", "abc/def", SLASHLIT); + #[cfg(not(unix))] + matches!(matchslash5, "abc\\def", "abc/def", SLASHLIT); nmatches!(matchnot1, "a*b*c", "abcd"); nmatches!(matchnot2, "abc*abc*abc", "abcabcabcabcabcabcabca"); @@ -1281,7 +1202,7 @@ mod tests { ($which:ident, $name:ident, $pat:expr, $expect:expr, $opts:expr) => { #[test] fn $name() { - let pat = PatternBuilder::new($pat) + let pat = GlobBuilder::new($pat) .case_insensitive($opts.casei) .literal_separator($opts.litsep) .build().unwrap(); diff --git a/globset/src/lib.rs b/globset/src/lib.rs index f608a74a8..056118a39 100644 --- a/globset/src/lib.rs +++ b/globset/src/lib.rs @@ -1,16 +1,101 @@ /*! -The glob module provides standard shell globbing, but is specifically -implemented by converting glob syntax to regular expressions. The reasoning is -two fold: - -1. The regex library is *really* fast. Regaining performance in a distinct - implementation of globbing is non-trivial. -2. Most crucially, a `RegexSet` can be used to match many globs simultaneously. 
-
-This module is written with some amount of intention of eventually splitting it
-out into its own separate crate, but I didn't quite have the energy for all
-that rigamorole when I wrote this. In particular, it could be fast/good enough
-to make its way into `glob` proper.
+The globset crate provides cross platform single glob and glob set matching.
+
+Glob set matching is the process of matching one or more glob patterns against
+a single candidate path simultaneously, and returning all of the globs that
+matched. For example, given this set of globs:
+
+```ignore
+*.rs
+src/lib.rs
+src/**/foo.rs
+```
+
+and a path `src/bar/baz/foo.rs`, then the set would report the first and third
+globs as matching.
+
+Single glob matching is also provided and is done by converting globs to regular expressions.
+
+# Example: one glob
+
+This example shows how to match a single glob against a single file path.
+
+```
+# fn example() -> Result<(), globset::Error> {
+use globset::Glob;
+
+let glob = try!(Glob::new("*.rs")).compile_matcher();
+
+assert!(glob.is_match("foo.rs"));
+assert!(glob.is_match("foo/bar.rs"));
+assert!(!glob.is_match("Cargo.toml"));
+# Ok(()) } example().unwrap();
+```
+
+# Example: configuring a glob matcher
+
+This example shows how to use a `GlobBuilder` to configure aspects of match
+semantics. In this example, we prevent wildcards from matching path separators.
+
+```
+# fn example() -> Result<(), globset::Error> {
+use globset::GlobBuilder;
+
+let glob = try!(GlobBuilder::new("*.rs")
+    .literal_separator(true).build()).compile_matcher();
+
+assert!(glob.is_match("foo.rs"));
+assert!(!glob.is_match("foo/bar.rs")); // no longer matches
+assert!(!glob.is_match("Cargo.toml"));
+# Ok(()) } example().unwrap();
+```
+
+# Example: match multiple globs at once
+
+This example shows how to match multiple glob patterns at once.
+
+```
+# fn example() -> Result<(), globset::Error> {
+use globset::{Glob, GlobSetBuilder};
+
+let mut builder = GlobSetBuilder::new();
+// A GlobBuilder can be used to configure each glob's match semantics
+// independently.
+builder.add(try!(Glob::new("*.rs")));
+builder.add(try!(Glob::new("src/lib.rs")));
+builder.add(try!(Glob::new("src/**/foo.rs")));
+let set = try!(builder.build());
+
+assert_eq!(set.matches("src/bar/baz/foo.rs"), vec![0, 2]);
+# Ok(()) } example().unwrap();
+```
+
+# Syntax
+
+Standard Unix-style glob syntax is supported:
+
+* `?` matches any single character. (If the `literal_separator` option is
+  enabled, then `?` can never match a path separator.)
+* `*` matches zero or more characters. (If the `literal_separator` option is
+  enabled, then `*` can never match a path separator.)
+* `**` recursively matches directories but is only legal in three situations.
+  First, if the glob starts with `**/`, then it matches
+  all directories. For example, `**/foo` matches `foo`
+  and `bar/foo` but not `foo/bar`. Secondly, if the glob ends with
+  `/**`, then it matches all sub-entries. For example,
+  `foo/**` matches `foo/a` and `foo/a/b`, but not `foo`.
+  Thirdly, if the glob contains `/**/` anywhere within
+  the pattern, then it matches zero or more directories. Using `**` anywhere
+  else is illegal (N.B. the glob `**` is allowed and means "match everything").
+* `{a,b}` matches `a` or `b` where `a` and `b` are arbitrary glob patterns.
+  (N.B. Nesting `{...}` is not currently allowed.)
+* `[ab]` matches `a` or `b` where `a` and `b` are characters. Use
+  `[!ab]` to match any character except for `a` and `b`.
+* Metacharacters such as `*` and `?` can be escaped with character class
+  notation, e.g., `[*]` matches `*`.
+
+A `GlobBuilder` can be used to prevent wildcards from matching path separators,
+or to enable case insensitive matching.
 */
 
 #![deny(missing_docs)]
@@ -36,12 +121,14 @@ use std::str;
 use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton};
 use regex::bytes::{Regex, RegexBuilder, RegexSet};
 
-use pathutil::{file_name, file_name_ext, os_str_bytes, path_bytes};
-use pattern::MatchStrategy;
-pub use pattern::{Pattern, PatternBuilder, PatternMatcher};
+use pathutil::{
+    file_name, file_name_ext, normalize_path, os_str_bytes, path_bytes,
+};
+use glob::MatchStrategy;
+pub use glob::{Glob, GlobBuilder, GlobMatcher};
 
+mod glob;
 mod pathutil;
-mod pattern;
 
 macro_rules! eprintln {
     ($($tt:tt)*) => {{
@@ -50,10 +137,6 @@ macro_rules! eprintln {
     }}
 }
 
-lazy_static! {
-    static ref FILE_SEPARATORS: String = regex::quote(r"/\");
-}
-
 /// Represents an error that can occur when parsing a glob pattern.
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub enum Error {
@@ -139,19 +222,26 @@ fn new_regex_set<I, S>(pats: I) -> Result<RegexSet, Error>
 
 type Fnv = hash::BuildHasherDefault<fnv::FnvHasher>;
 
-/// Set represents a group of globs that can be matched together in a single
-/// pass.
+/// GlobSet represents a group of globs that can be matched together in a
+/// single pass.
 #[derive(Clone, Debug)]
-pub struct Set {
-    strats: Vec<SetMatchStrategy>,
+pub struct GlobSet {
+    strats: Vec<GlobSetMatchStrategy>,
 }
 
-impl Set {
+impl GlobSet {
+    /// Returns true if any glob in this set matches the path given.
+    pub fn is_match<P: AsRef<Path>>(&self, path: P) -> bool {
+        self.is_match_candidate(&Candidate::new(path.as_ref()))
+    }
+
     /// Returns true if any glob in this set matches the path given.
-    pub fn is_match<T: AsRef<Path>>(&self, path: T) -> bool {
-        let candidate = Candidate::new(path.as_ref());
+    ///
+    /// This takes a Candidate as input, which can be used to amortize the
+    /// cost of preparing a path for matching.
+    pub fn is_match_candidate(&self, path: &Candidate) -> bool {
         for strat in &self.strats {
-            if strat.is_match(&candidate) {
+            if strat.is_match(path) {
                 return true;
             }
         }
@@ -160,30 +250,44 @@ impl Set {
 
     /// Returns the sequence number of every glob pattern that matches the
    /// given path.
-    #[allow(dead_code)]
-    pub fn matches<T: AsRef<Path>>(&self, path: T) -> Vec<usize> {
+    ///
+    /// This is a convenience wrapper that builds a `Candidate` from the path
+    /// and forwards to `matches_candidate`.
+    pub fn matches<P: AsRef<Path>>(&self, path: P) -> Vec<usize> {
+        self.matches_candidate(&Candidate::new(path.as_ref()))
+    }
+
+    /// Returns the sequence number of every glob pattern that matches the
+    /// given path.
+    ///
+    /// This takes a Candidate as input, which can be used to amortize the
+    /// cost of preparing a path for matching.
+    pub fn matches_candidate(&self, path: &Candidate) -> Vec<usize> {
         let mut into = vec![];
-        self.matches_into(path, &mut into);
+        self.matches_candidate_into(path, &mut into);
         into
     }
 
     /// Adds the sequence number of every glob pattern that matches the given
     /// path to the vec given.
+    ///
+    /// `into` is cleared before matching begins, and contains the set of
+    /// sequence numbers (in ascending order) after matching ends. If no globs
+    /// were matched, then `into` will be empty.
+ pub fn matches_candidate_into( &self, - path: T, + path: &Candidate, into: &mut Vec, ) { into.clear(); - let candidate = Candidate::new(path.as_ref()); for strat in &self.strats { - strat.matches_into(&candidate, into); + strat.matches_into(path, into); } into.sort(); into.dedup(); } - fn new(pats: &[Pattern]) -> Result { + fn new(pats: &[Glob]) -> Result { let mut lits = LiteralStrategy::new(); let mut base_lits = BasenameLiteralStrategy::new(); let mut exts = ExtensionStrategy::new(); @@ -225,63 +329,70 @@ impl Set { lits.0.len(), base_lits.0.len(), exts.0.len(), prefixes.literals.len(), suffixes.literals.len(), required_exts.0.len(), regexes.literals.len()); - Ok(Set { + Ok(GlobSet { strats: vec![ - SetMatchStrategy::Extension(exts), - SetMatchStrategy::BasenameLiteral(base_lits), - SetMatchStrategy::Literal(lits), - SetMatchStrategy::Suffix(suffixes.suffix()), - SetMatchStrategy::Prefix(prefixes.prefix()), - SetMatchStrategy::RequiredExtension( + GlobSetMatchStrategy::Extension(exts), + GlobSetMatchStrategy::BasenameLiteral(base_lits), + GlobSetMatchStrategy::Literal(lits), + GlobSetMatchStrategy::Suffix(suffixes.suffix()), + GlobSetMatchStrategy::Prefix(prefixes.prefix()), + GlobSetMatchStrategy::RequiredExtension( try!(required_exts.build())), - SetMatchStrategy::Regex(try!(regexes.regex_set())), + GlobSetMatchStrategy::Regex(try!(regexes.regex_set())), ], }) } } -/// SetBuilder builds a group of patterns that can be used to simultaneously -/// match a file path. -pub struct SetBuilder { - pats: Vec, +/// GlobSetBuilder builds a group of patterns that can be used to +/// simultaneously match a file path. +pub struct GlobSetBuilder { + pats: Vec, } -impl SetBuilder { - /// Create a new SetBuilder. A SetBuilder can be used to add new patterns. - /// Once all patterns have been added, `build` should be called to produce - /// a `Set`, which can then be used for matching. - pub fn new() -> SetBuilder { - SetBuilder { pats: vec![] } +impl GlobSetBuilder { + /// Create a new GlobSetBuilder. A GlobSetBuilder can be used to add new + /// patterns. Once all patterns have been added, `build` should be called + /// to produce a `GlobSet`, which can then be used for matching. + pub fn new() -> GlobSetBuilder { + GlobSetBuilder { pats: vec![] } } /// Builds a new matcher from all of the glob patterns added so far. /// /// Once a matcher is built, no new patterns can be added to it. - pub fn build(&self) -> Result { - Set::new(&self.pats) + pub fn build(&self) -> Result { + GlobSet::new(&self.pats) } /// Add a new pattern to this set. #[allow(dead_code)] - pub fn add(&mut self, pat: Pattern) -> &mut SetBuilder { + pub fn add(&mut self, pat: Glob) -> &mut GlobSetBuilder { self.pats.push(pat); self } } +/// A candidate path for matching. +/// +/// All glob matching in this crate operates on `Candidate` values. +/// Constructing candidates has a very small cost associated with it, so +/// callers may find it beneficial to amortize that cost when matching a single +/// path against multiple globs or sets of globs. #[derive(Clone, Debug)] -struct Candidate<'a> { +pub struct Candidate<'a> { path: Cow<'a, [u8]>, basename: Cow<'a, [u8]>, ext: &'a OsStr, } impl<'a> Candidate<'a> { - fn new + ?Sized>(path: &'a P) -> Candidate<'a> { + /// Create a new candidate for matching from the given path. 
+ pub fn new + ?Sized>(path: &'a P) -> Candidate<'a> { let path = path.as_ref(); let basename = file_name(path).unwrap_or(OsStr::new("")); Candidate { - path: path_bytes(path), + path: normalize_path(path_bytes(path)), basename: os_str_bytes(basename), ext: file_name_ext(basename).unwrap_or(OsStr::new("")), } @@ -305,7 +416,7 @@ impl<'a> Candidate<'a> { } #[derive(Clone, Debug)] -enum SetMatchStrategy { +enum GlobSetMatchStrategy { Literal(LiteralStrategy), BasenameLiteral(BasenameLiteralStrategy), Extension(ExtensionStrategy), @@ -315,9 +426,9 @@ enum SetMatchStrategy { Regex(RegexSetStrategy), } -impl SetMatchStrategy { +impl GlobSetMatchStrategy { fn is_match(&self, candidate: &Candidate) -> bool { - use self::SetMatchStrategy::*; + use self::GlobSetMatchStrategy::*; match *self { Literal(ref s) => s.is_match(candidate), BasenameLiteral(ref s) => s.is_match(candidate), @@ -330,7 +441,7 @@ impl SetMatchStrategy { } fn matches_into(&self, candidate: &Candidate, matches: &mut Vec) { - use self::SetMatchStrategy::*; + use self::GlobSetMatchStrategy::*; match *self { Literal(ref s) => s.matches_into(candidate, matches), BasenameLiteral(ref s) => s.matches_into(candidate, matches), @@ -616,29 +727,23 @@ impl RequiredExtensionStrategyBuilder { #[cfg(test)] mod tests { - use super::{Set, SetBuilder}; - use pattern::Pattern; + use super::GlobSetBuilder; + use glob::Glob; #[test] fn set_works() { - let mut builder = SetBuilder::new(); - builder.add(Pattern::new("src/**/*.rs").unwrap()); - builder.add(Pattern::new("*.c").unwrap()); - builder.add(Pattern::new("src/lib.rs").unwrap()); + let mut builder = GlobSetBuilder::new(); + builder.add(Glob::new("src/**/*.rs").unwrap()); + builder.add(Glob::new("*.c").unwrap()); + builder.add(Glob::new("src/lib.rs").unwrap()); let set = builder.build().unwrap(); - fn is_match(set: &Set, s: &str) -> bool { - let mut matches = vec![]; - set.matches_into(s, &mut matches); - !matches.is_empty() - } - - assert!(is_match(&set, "foo.c")); - assert!(is_match(&set, "src/foo.c")); - assert!(!is_match(&set, "foo.rs")); - assert!(!is_match(&set, "tests/foo.rs")); - assert!(is_match(&set, "src/foo.rs")); - assert!(is_match(&set, "src/grep/src/main.rs")); + assert!(set.is_match("foo.c")); + assert!(set.is_match("src/foo.c")); + assert!(!set.is_match("foo.rs")); + assert!(!set.is_match("tests/foo.rs")); + assert!(set.is_match("src/foo.rs")); + assert!(set.is_match("src/grep/src/main.rs")); let matches = set.matches("src/lib.rs"); assert_eq!(2, matches.len()); diff --git a/globset/src/pathutil.rs b/globset/src/pathutil.rs index 3e89f7bbc..15a3283bd 100644 --- a/globset/src/pathutil.rs +++ b/globset/src/pathutil.rs @@ -101,20 +101,45 @@ pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> { /// necessary. #[cfg(not(unix))] pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> { - // TODO(burntsushi): On Windows, OS strings are probably UTF-16, so even - // if we could get at the raw bytes, they wouldn't be useful. We *must* - // convert to UTF-8 before doing path matching. Unfortunate, but necessary. + // TODO(burntsushi): On Windows, OS strings are WTF-8, which is a superset + // of UTF-8, so even if we could get at the raw bytes, they wouldn't + // be useful. We *must* convert to UTF-8 before doing path matching. + // Unfortunate, but necessary. 
match s.to_string_lossy() { Cow::Owned(s) => Cow::Owned(s.into_bytes()), Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()), } } +/// Normalizes a path to use `/` as a separator everywhere, even on platforms +/// that recognize other characters as separators. +#[cfg(unix)] +pub fn normalize_path(path: Cow<[u8]>) -> Cow<[u8]> { + // UNIX only uses /, so we're good. + path +} + +/// Normalizes a path to use `/` as a separator everywhere, even on platforms +/// that recognize other characters as separators. +#[cfg(not(unix))] +pub fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> { + use std::path::is_separator; + + for i in 0..path.len() { + if path[i] == b'/' || !is_separator(path[i] as char) { + continue; + } + path.to_mut()[i] = b'/'; + } + path +} + #[cfg(test)] mod tests { + use std::borrow::Cow; use std::ffi::OsStr; - use super::file_name_ext; + use super::{file_name_ext, normalize_path}; macro_rules! ext { ($name:ident, $file_name:expr, $ext:expr) => { @@ -131,4 +156,25 @@ mod tests { ext!(ext3, "..rs", Some(".rs")); ext!(ext4, "", None::<&str>); ext!(ext5, "foo", None::<&str>); + + macro_rules! normalize { + ($name:ident, $path:expr, $expected:expr) => { + #[test] + fn $name() { + let got = normalize_path(Cow::Owned($path.to_vec())); + assert_eq!($expected.to_vec(), got.into_owned()); + } + }; + } + + normalize!(normal1, b"foo", b"foo"); + normalize!(normal2, b"foo/bar", b"foo/bar"); + #[cfg(unix)] + normalize!(normal3, b"foo\\bar", b"foo\\bar"); + #[cfg(not(unix))] + normalize!(normal3, b"foo\\bar", b"foo/bar"); + #[cfg(unix)] + normalize!(normal4, b"foo\\bar/baz", b"foo\\bar/baz"); + #[cfg(not(unix))] + normalize!(normal4, b"foo\\bar/baz", b"foo/bar/baz"); } diff --git a/src/gitignore.rs b/src/gitignore.rs index 5e07531d9..4c7e8203e 100644 --- a/src/gitignore.rs +++ b/src/gitignore.rs @@ -28,7 +28,7 @@ use std::fs::File; use std::io::{self, BufRead}; use std::path::{Path, PathBuf}; -use globset::{self, PatternBuilder, Set, SetBuilder}; +use globset::{self, Candidate, GlobBuilder, GlobSet, GlobSetBuilder}; use regex; use pathutil::{is_file_name, strip_prefix}; @@ -82,7 +82,7 @@ impl From for Error { /// Gitignore is a matcher for the glob patterns in a single gitignore file. #[derive(Clone, Debug)] pub struct Gitignore { - set: Set, + set: GlobSet, root: PathBuf, patterns: Vec, num_ignores: u64, @@ -140,7 +140,8 @@ impl Gitignore { }; MATCHES.with(|matches| { let mut matches = matches.borrow_mut(); - self.set.matches_into(path, &mut *matches); + let candidate = Candidate::new(path); + self.set.matches_candidate_into(&candidate, &mut *matches); for &i in matches.iter().rev() { let pat = &self.patterns[i]; if !pat.only_dir || is_dir { @@ -207,7 +208,7 @@ impl<'a> Match<'a> { /// GitignoreBuilder constructs a matcher for a single set of globs from a /// .gitignore file. 
pub struct GitignoreBuilder { - builder: SetBuilder, + builder: GlobSetBuilder, root: PathBuf, patterns: Vec, } @@ -237,7 +238,7 @@ impl GitignoreBuilder { pub fn new>(root: P) -> GitignoreBuilder { let root = strip_prefix("./", root.as_ref()).unwrap_or(root.as_ref()); GitignoreBuilder { - builder: SetBuilder::new(), + builder: GlobSetBuilder::new(), root: root.to_path_buf(), patterns: vec![], } @@ -262,8 +263,18 @@ impl GitignoreBuilder { pub fn add_path>(&mut self, path: P) -> Result<(), Error> { let rdr = io::BufReader::new(try!(File::open(&path))); debug!("gitignore: {}", path.as_ref().display()); - for line in rdr.lines() { - try!(self.add(&path, &try!(line))); + for (i, line) in rdr.lines().enumerate() { + let line = match line { + Ok(line) => line, + Err(err) => { + debug!("error reading line {} in {}: {}", + i, path.as_ref().display(), err); + continue; + } + }; + if let Err(err) = self.add(&path, &line) { + debug!("error adding gitignore pattern: '{}': {}", line, err); + } } Ok(()) } @@ -349,7 +360,7 @@ impl GitignoreBuilder { pat.pat = format!("{}/*", pat.pat); } let parsed = try!( - PatternBuilder::new(&pat.pat) + GlobBuilder::new(&pat.pat) .literal_separator(literal_separator) .build()); self.builder.add(parsed); diff --git a/src/types.rs b/src/types.rs index c084500ea..695171241 100644 --- a/src/types.rs +++ b/src/types.rs @@ -11,7 +11,7 @@ use std::path::Path; use regex; use gitignore::{Match, Pattern}; -use globset::{self, PatternBuilder, Set, SetBuilder}; +use globset::{self, GlobBuilder, GlobSet, GlobSetBuilder}; const TYPE_EXTENSIONS: &'static [(&'static str, &'static [&'static str])] = &[ ("asm", &["*.asm", "*.s", "*.S"]), @@ -164,8 +164,8 @@ impl FileTypeDef { #[derive(Clone, Debug)] pub struct Types { defs: Vec, - selected: Option, - negated: Option, + selected: Option, + negated: Option, has_selected: bool, unmatched_pat: Pattern, } @@ -178,8 +178,8 @@ impl Types { /// If has_selected is true, then at least one file type was selected. /// Therefore, any non-matches should be ignored. 
fn new( - selected: Option, - negated: Option, + selected: Option, + negated: Option, has_selected: bool, defs: Vec, ) -> Types { @@ -272,7 +272,7 @@ impl TypesBuilder { if self.selected.is_empty() { None } else { - let mut bset = SetBuilder::new(); + let mut bset = GlobSetBuilder::new(); for name in &self.selected { let globs = match self.types.get(name) { Some(globs) => globs, @@ -283,7 +283,7 @@ impl TypesBuilder { }; for glob in globs { let pat = try!( - PatternBuilder::new(glob) + GlobBuilder::new(glob) .literal_separator(true).build()); bset.add(pat); } @@ -294,7 +294,7 @@ impl TypesBuilder { if self.negated.is_empty() { None } else { - let mut bset = SetBuilder::new(); + let mut bset = GlobSetBuilder::new(); for name in &self.negated { let globs = match self.types.get(name) { Some(globs) => globs, @@ -305,7 +305,7 @@ impl TypesBuilder { }; for glob in globs { let pat = try!( - PatternBuilder::new(glob) + GlobBuilder::new(glob) .literal_separator(true).build()); bset.add(pat); } diff --git a/tests/tests.rs b/tests/tests.rs index 1308d715b..bf4d35701 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -611,17 +611,6 @@ sherlock!(unrestricted2, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| { assert_eq!(lines, expected); }); -#[cfg(not(windows))] -sherlock!(unrestricted3, "foo", ".", |wd: WorkDir, mut cmd: Command| { - wd.create("file", "foo\x00bar\nfoo\x00baz\n"); - cmd.arg("-uuu"); - - let lines: String = wd.stdout(&mut cmd); - assert_eq!(lines, "file:foo\x00bar\nfile:foo\x00baz\n"); -}); - -// On Windows, this test uses memory maps, so the NUL bytes don't get replaced. -#[cfg(windows)] sherlock!(unrestricted3, "foo", ".", |wd: WorkDir, mut cmd: Command| { wd.create("file", "foo\x00bar\nfoo\x00baz\n"); cmd.arg("-uuu"); @@ -723,6 +712,13 @@ clean!(regression_67, "test", ".", |wd: WorkDir, mut cmd: Command| { assert_eq!(lines, path("dir/bar:test\n")); }); +// See: https://github.com/BurntSushi/ripgrep/issues/87 +clean!(regression_87, "test", ".", |wd: WorkDir, mut cmd: Command| { + wd.create(".gitignore", "foo\n**no-vcs**"); + wd.create("foo", "test"); + wd.assert_err(&mut cmd); +}); + // See: https://github.com/BurntSushi/ripgrep/issues/90 clean!(regression_90, "test", ".", |wd: WorkDir, mut cmd: Command| { wd.create(".gitignore", "!.foo"); @@ -771,6 +767,40 @@ clean!(regression_105_part2, "test", ".", |wd: WorkDir, mut cmd: Command| { assert_eq!(lines, "foo:3:zztest\n"); }); +// See: https://github.com/BurntSushi/ripgrep/issues/127 +clean!(regression_127, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| { + // Set up a directory hierarchy like this: + // + // .gitignore + // foo/ + // sherlock + // watson + // + // Where `.gitignore` contains `foo/sherlock`. + // + // ripgrep should ignore 'foo/sherlock' giving us results only from + // 'foo/watson' but on Windows ripgrep will include both 'foo/sherlock' and + // 'foo/watson' in the search results. + wd.create(".gitignore", "foo/sherlock\n"); + wd.create_dir("foo"); + wd.create("foo/sherlock", hay::SHERLOCK); + wd.create("foo/watson", hay::SHERLOCK); + + let lines: String = wd.stdout(&mut cmd); + let expected = format!("\ +{path}:For the Doctor Watsons of this world, as opposed to the Sherlock +{path}:be, to a very large extent, the result of luck. 
Sherlock Holmes +", path=path("foo/watson")); + assert_eq!(lines, expected); +}); + +// See: https://github.com/BurntSushi/ripgrep/issues/131 +clean!(regression_131, "test", ".", |wd: WorkDir, mut cmd: Command| { + wd.create(".gitignore", "TopÑapa"); + wd.create("TopÑapa", "test"); + wd.assert_err(&mut cmd); +}); + // See: https://github.com/BurntSushi/ripgrep/issues/20 sherlock!(feature_20_no_filename, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {