From 370b3a031ccdf5b88515bd79909b1e592297738f Mon Sep 17 00:00:00 2001 From: Zack Weinberg Date: Sun, 24 Sep 2023 01:38:50 -0400 Subject: [PATCH] Permit use of (?-u) in byte-regex strategies (#336) (#337) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Permit use of (?-u) in byte-regex strategies (#336) It is desirable to be able to generate, from a regex, byte sequences that are not necessarily valid UTF-8. For example, suppose you have a parser that accepts any string generated by the regex ``` [0-9]+(\.[0-9]*)? ``` Then, in your test suite, you might want to generate strings from the complementary regular language, which you could do with the regex ``` (?s:|[^0-9].*|[0-9]+[^0-9.].*|[0-9]+\.[0-9]*[^0-9].*) ``` However, this will still only generate valid UTF-8 strings. Maybe you are parsing directly from byte sequences read from disk, in which case you want to test the parser’s ability to reject invalid UTF-8 _as well as_ valid UTF-8 but not within the accepted language. Then you want this slight variation: ``` (?s-u:|[^0-9].*|[0-9]+[^0-9.].*|[0-9]+\.[0-9]*[^0-9].*) ``` But this regex will be rejected by `bytes_regex`, because by default `regex_syntax::Parser` errors out on any regex that potentially matches invalid UTF-8. The application — i.e. proptest — must opt into use of such regexes. This patch makes proptest do just that, for `bytes_regex` only. There should be no change to the behavior of any existing test suite, because opting to allow use of `(?-u)` does not change the semantics of any regex that _doesn’t_ contain `(?-u)`, and any existing regex that _does_ contain `(?-u)` must be incapable of generating invalid UTF-8 for other reasons, or `regex_syntax::Parser` would be rejecting it. (For example, `(?-u:[a-z])` cannot generate invalid UTF-8.) This patch also adds a bunch of tests for `bytes_regex`, which AFAICT was not being tested at all. Some of these use the new functionality and others don’t. 
There is quite a bit of code duplication in the test helper functions — `do_test` and `do_test_bytes` are almost identical, as are `generate_values_matching_regex` and `generate_byte_values_matching_regex`. I am not good enough at generic metaprogramming in Rust to factor out the duplication. * [to squash] Correct for API change in regex-syntax 0.7 Commit https://github.com/rust-lang/regex/commit/706b07de3d07602ede626e98837b09945e6550b5 renamed ParserBuilder::allow_invalid_utf8 to ParserBuilder::utf8 and inverted the sense of its argument. Separate commit for review purposes; should be squashed before landing to preserve bisectability of trunk. --- proptest/src/string.rs | 111 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 103 insertions(+), 8 deletions(-) diff --git a/proptest/src/string.rs b/proptest/src/string.rs index 55666cb6..935cb21c 100644 --- a/proptest/src/string.rs +++ b/proptest/src/string.rs @@ -17,7 +17,7 @@ use core::ops::RangeInclusive; use core::u32; use regex_syntax::hir::{self, Hir, HirKind::*, Repetition}; -use regex_syntax::{Error as ParseError, Parser}; +use regex_syntax::{Error as ParseError, ParserBuilder}; use crate::bool; use crate::char; @@ -144,7 +144,8 @@ impl StrategyFromRegex for Vec<u8> { /// If you don't need error handling and aren't limited by setup time, it is /// also possible to directly use a `&str` as a strategy with the same effect. pub fn string_regex(regex: &str) -> ParseResult<String> { - string_regex_parsed(&regex_to_hir(regex)?) + let hir = ParserBuilder::new().build().parse(regex)?; + string_regex_parsed(&hir) } /// Like `string_regex()`, but allows providing a pre-parsed expression. @@ -161,8 +162,20 @@ pub fn string_regex_parsed(expr: &Hir) -> ParseResult<String> { /// Creates a strategy which generates byte strings matching the given regular /// expression. +/// +/// By default, the byte strings generated by this strategy _will_ be valid +/// UTF-8. 
If you wish to generate byte strings that aren't (necessarily) +/// valid UTF-8, wrap your regex (or some subsection of it) in `(?-u: ... )`. +/// You may want to turn on the `s` flag as well (`(?s-u: ... )`) so that `.` +/// will generate newline characters (byte value `0x0A`). See the +/// [`regex` crate's documentation](https://docs.rs/regex/*/regex/#opt-out-of-unicode-support) +/// for more information. pub fn bytes_regex(regex: &str) -> ParseResult<Vec<u8>> { - bytes_regex_parsed(&regex_to_hir(regex)?) + let hir = ParserBuilder::new() + .utf8(false) + .build() + .parse(regex)?; + bytes_regex_parsed(&hir) } /// Like `bytes_regex()`, but allows providing a pre-parsed expression. @@ -340,10 +353,6 @@ fn to_bytes(khar: char) -> Vec<u8> { khar.encode_utf8(&mut buf).as_bytes().to_owned() } -fn regex_to_hir(pattern: &str) -> Result<Hir, Error> { - Ok(Parser::new().parse(pattern)?) -} - fn unsupported<T>(error: &'static str) -> Result<T, Error> { Err(Error::UnsupportedRegex(error)) } @@ -353,9 +362,17 @@ mod test { use std::collections::HashSet; use regex::Regex; + use regex::bytes::Regex as BytesRegex; use super::*; + fn printable_ascii(v: &[u8]) -> String { + v.iter() + .flat_map(|c| std::ascii::escape_default(*c)) + .map(|c| char::from_u32(c.into()).unwrap()) + .collect() + } + fn do_test( pattern: &str, min_distinct: usize, @@ -379,6 +396,29 @@ mod test { ); } + fn do_test_bytes( + pattern: &str, + min_distinct: usize, + max_distinct: usize, + iterations: usize, + ) { + let generated = generate_byte_values_matching_regex(pattern, iterations); + assert!( + generated.len() >= min_distinct, + "Expected to generate at least {} strings, but only \ + generated {}", + min_distinct, + generated.len() + ); + assert!( + generated.len() <= max_distinct, + "Expected to generate at most {} strings, but \ + generated {}", + max_distinct, + generated.len() + ); + } + fn generate_values_matching_regex( pattern: &str, iterations: usize, @@ -415,6 +455,42 @@ mod test { generated } + fn 
generate_byte_values_matching_regex( + pattern: &str, + iterations: usize, + ) -> HashSet<Vec<u8>> { + let rx = BytesRegex::new(pattern).unwrap(); + let mut generated = HashSet::new(); + + let strategy = bytes_regex(pattern).unwrap(); + let mut runner = TestRunner::deterministic(); + for _ in 0..iterations { + let mut value = strategy.new_tree(&mut runner).unwrap(); + + loop { + let s = value.current(); + let ok = if let Some(matsch) = rx.find(&s) { + 0 == matsch.start() && s.len() == matsch.end() + } else { + false + }; + if !ok { + panic!( + "Generated string {:?} which does not match {:?}", + printable_ascii(&s), pattern + ); + } + + generated.insert(s); + + if !value.simplify() { + break; + } + } + } + generated + } + #[test] fn test_case_insensitive_produces_all_available_values() { let mut expected: HashSet<String> = HashSet::new(); @@ -428,6 +504,7 @@ mod test { #[test] fn test_literal() { do_test("foo", 1, 1, 8); + do_test_bytes("foo", 1, 1, 8); } #[test] @@ -438,36 +515,43 @@ mod test { #[test] fn test_alternation() { do_test("foo|bar|baz", 3, 3, 16); + do_test_bytes("foo|bar|baz", 3, 3, 16); } #[test] - fn test_repitition() { + fn test_repetition() { do_test("a{0,8}", 9, 9, 64); + do_test_bytes("a{0,8}", 9, 9, 64); } #[test] fn test_question() { do_test("a?", 2, 2, 16); + do_test_bytes("a?", 2, 2, 16); } #[test] fn test_star() { do_test("a*", 33, 33, 256); + do_test_bytes("a*", 33, 33, 256); } #[test] fn test_plus() { do_test("a+", 32, 32, 256); + do_test_bytes("a+", 32, 32, 256); } #[test] fn test_n_to_range() { do_test("a{4,}", 4, 4, 64); + do_test_bytes("a{4,}", 4, 4, 64); } #[test] fn test_concatenation() { do_test("(foo|bar)(xyzzy|plugh)", 4, 4, 32); + do_test_bytes("(foo|bar)(xyzzy|plugh)", 4, 4, 32); } #[test] @@ -488,6 +572,7 @@ mod test { #[test] fn test_dot_s() { do_test("(?s).", 200, 65536, 256); + do_test_bytes("(?s-u).", 256, 256, 2048); } #[test] @@ -495,6 +580,16 @@ mod test { do_test("\\d+", 1, 65536, 256); } + #[test] + fn test_non_utf8_byte_strings() { 
+ do_test_bytes(r"(?-u)[\xC0-\xFF]\x20", 64, 64, 512); + do_test_bytes(r"(?-u)\x20[\x80-\xBF]", 64, 64, 512); + do_test_bytes(r#"(?x-u) + \xed (( ( \xa0\x80 | \xad\xbf | \xae\x80 | \xaf\xbf ) + ( \xed ( \xb0\x80 | \xbf\xbf ) )? ) + | \xb0\x80 | \xbe\x80 | \xbf\xbf )"#, 15, 15, 120); + } + fn assert_send_and_sync<T: Send + Sync>(_: T) {} #[test]