From 370b3a031ccdf5b88515bd79909b1e592297738f Mon Sep 17 00:00:00 2001
From: Zack Weinberg <zack+gh@owlfolio.org>
Date: Sun, 24 Sep 2023 01:38:50 -0400
Subject: [PATCH] Permit use of (?-u) in byte-regex strategies (#336) (#337)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Permit use of (?-u) in byte-regex strategies (#336)

It is desirable to be able to generate, from a regex, byte sequences
that are not necessarily valid UTF-8.  For example, suppose you have a
parser that accepts any string generated by the regex

```
[0-9]+(\.[0-9]*)?
```

Then, in your test suite, you might want to generate strings from the
complementary regular language, which you could do with the regex

```
(?s:|[^0-9].*|[0-9]+[^0-9.].*|[0-9]+\.[0-9]*[^0-9].*)
```

However, this will still only generate valid UTF-8 strings.  Maybe you
are parsing directly from byte sequences read from disk, in which case
you want to test the parser’s ability to reject invalid UTF-8 _as well
as_ valid UTF-8 but not within the accepted language.  Then you want
this slight variation:

```
(?s-u:|[^0-9].*|[0-9]+[^0-9.].*|[0-9]+\.[0-9]*[^0-9].*)
```

But this regex will be rejected by `bytes_regex`, because by default
`regex_syntax::Parser` errors out on any regex that potentially
matches invalid UTF-8.  The application — i.e. proptest — must opt
into use of such regexes.  This patch makes proptest do just that, for
`bytes_regex` only.

There should be no change to the behavior of any existing test suite,
because opting to allow use of `(?-u)` does not change the semantics
of any regex that _doesn’t_ contain `(?-u)`, and any existing regex
that _does_ contain `(?-u)` must be incapable of generating invalid
UTF-8 for other reasons, or `regex_syntax::Parser` would be rejecting it.
(For example, `(?-u:[a-z])` cannot generate invalid UTF-8.)

This patch also adds a bunch of tests for `bytes_regex`, which AFAICT
was not being tested at all.  Some of these use the new functionality
and others don’t.  There is quite a bit of code duplication in the
test helper functions — `do_test` and `do_test_bytes` are almost
identical, as are `generate_values_matching_regex` and
`generate_byte_values_matching_regex`.  I am not good enough at
generic metaprogramming in Rust to factor out the duplication.

* [to squash] Correct for API change in regex-syntax 0.7

Commit
https://github.com/rust-lang/regex/commit/706b07de3d07602ede626e98837b09945e6550b5
renamed ParserBuilder::allow_invalid_utf8
to ParserBuilder::utf8
and inverted the sense of its argument.

Separate commit for review purposes; should be squashed before landing
to preserve bisectability of trunk.
---
 proptest/src/string.rs | 111 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 103 insertions(+), 8 deletions(-)
diff --git a/proptest/src/string.rs b/proptest/src/string.rs
index 55666cb6..935cb21c 100644
--- a/proptest/src/string.rs
+++ b/proptest/src/string.rs
@@ -17,7 +17,7 @@ use core::ops::RangeInclusive;
 use core::u32;
 
 use regex_syntax::hir::{self, Hir, HirKind::*, Repetition};
-use regex_syntax::{Error as ParseError, Parser};
+use regex_syntax::{Error as ParseError, ParserBuilder};
 
 use crate::bool;
 use crate::char;
@@ -144,7 +144,8 @@ impl StrategyFromRegex for Vec<u8> {
 /// If you don't need error handling and aren't limited by setup time, it is
 /// also possible to directly use a `&str` as a strategy with the same effect.
 pub fn string_regex(regex: &str) -> ParseResult<String> {
-    string_regex_parsed(&regex_to_hir(regex)?)
+    let hir = ParserBuilder::new().build().parse(regex)?;
+    string_regex_parsed(&hir)
 }
 
 /// Like `string_regex()`, but allows providing a pre-parsed expression.
@@ -161,8 +162,20 @@ pub fn string_regex_parsed(expr: &Hir) -> ParseResult<String> {
 
 /// Creates a strategy which generates byte strings matching the given regular
 /// expression.
+///
+/// By default, the byte strings generated by this strategy _will_ be valid
+/// UTF-8.  If you wish to generate byte strings that aren't (necessarily)
+/// valid UTF-8, wrap your regex (or some subsection of it) in `(?-u: ... )`.
+/// You may want to turn on the `s` flag as well (`(?s-u: ... )`) so that `.`
+/// will generate newline characters (byte value `0x0A`).  See the
+/// [`regex` crate's documentation](https://docs.rs/regex/*/regex/#opt-out-of-unicode-support)
+/// for more information.
 pub fn bytes_regex(regex: &str) -> ParseResult<Vec<u8>> {
-    bytes_regex_parsed(&regex_to_hir(regex)?)
+    let hir = ParserBuilder::new()
+        .utf8(false)
+        .build()
+        .parse(regex)?;
+    bytes_regex_parsed(&hir)
 }
 
 /// Like `bytes_regex()`, but allows providing a pre-parsed expression.
@@ -340,10 +353,6 @@ fn to_bytes(khar: char) -> Vec<u8> {
     khar.encode_utf8(&mut buf).as_bytes().to_owned()
 }
 
-fn regex_to_hir(pattern: &str) -> Result<Hir, Error> {
-    Ok(Parser::new().parse(pattern)?)
-}
-
 fn unsupported<T>(error: &'static str) -> Result<T, Error> {
     Err(Error::UnsupportedRegex(error))
 }
@@ -353,9 +362,17 @@ mod test {
     use std::collections::HashSet;
 
     use regex::Regex;
+    use regex::bytes::Regex as BytesRegex;
 
     use super::*;
 
+    fn printable_ascii(v: &[u8]) -> String {
+        v.iter()
+            .flat_map(|c| std::ascii::escape_default(*c))
+            .map(|c| char::from_u32(c.into()).unwrap())
+            .collect()
+    }
+
     fn do_test(
         pattern: &str,
         min_distinct: usize,
@@ -379,6 +396,29 @@ mod test {
         );
     }
 
+    fn do_test_bytes(
+        pattern: &str,
+        min_distinct: usize,
+        max_distinct: usize,
+        iterations: usize,
+    ) {
+        let generated = generate_byte_values_matching_regex(pattern, iterations);
+        assert!(
+            generated.len() >= min_distinct,
+            "Expected to generate at least {} strings, but only \
+             generated {}",
+            min_distinct,
+            generated.len()
+        );
+        assert!(
+            generated.len() <= max_distinct,
+            "Expected to generate at most {} strings, but \
+             generated {}",
+            max_distinct,
+            generated.len()
+        );
+    }
+
     fn generate_values_matching_regex(
         pattern: &str,
         iterations: usize,
@@ -415,6 +455,42 @@ mod test {
         generated
     }
 
+    fn generate_byte_values_matching_regex(
+        pattern: &str,
+        iterations: usize,
+    ) -> HashSet<Vec<u8>> {
+        let rx = BytesRegex::new(pattern).unwrap();
+        let mut generated = HashSet::new();
+
+        let strategy = bytes_regex(pattern).unwrap();
+        let mut runner = TestRunner::deterministic();
+        for _ in 0..iterations {
+            let mut value = strategy.new_tree(&mut runner).unwrap();
+
+            loop {
+                let s = value.current();
+                let ok = if let Some(matsch) = rx.find(&s) {
+                    0 == matsch.start() && s.len() == matsch.end()
+                } else {
+                    false
+                };
+                if !ok {
+                    panic!(
+                        "Generated string {:?} which does not match {:?}",
+                        printable_ascii(&s), pattern
+                    );
+                }
+
+                generated.insert(s);
+
+                if !value.simplify() {
+                    break;
+                }
+            }
+        }
+        generated
+    }
+
     #[test]
     fn test_case_insensitive_produces_all_available_values() {
         let mut expected: HashSet<String> = HashSet::new();
@@ -428,6 +504,7 @@ mod test {
     #[test]
     fn test_literal() {
         do_test("foo", 1, 1, 8);
+        do_test_bytes("foo", 1, 1, 8);
     }
 
     #[test]
@@ -438,36 +515,43 @@ mod test {
     #[test]
     fn test_alternation() {
         do_test("foo|bar|baz", 3, 3, 16);
+        do_test_bytes("foo|bar|baz", 3, 3, 16);
     }
 
     #[test]
-    fn test_repitition() {
+    fn test_repetition() {
         do_test("a{0,8}", 9, 9, 64);
+        do_test_bytes("a{0,8}", 9, 9, 64);
     }
 
     #[test]
     fn test_question() {
         do_test("a?", 2, 2, 16);
+        do_test_bytes("a?", 2, 2, 16);
     }
 
     #[test]
     fn test_star() {
         do_test("a*", 33, 33, 256);
+        do_test_bytes("a*", 33, 33, 256);
     }
 
     #[test]
     fn test_plus() {
         do_test("a+", 32, 32, 256);
+        do_test_bytes("a+", 32, 32, 256);
     }
 
     #[test]
     fn test_n_to_range() {
         do_test("a{4,}", 4, 4, 64);
+        do_test_bytes("a{4,}", 4, 4, 64);
     }
 
     #[test]
     fn test_concatenation() {
         do_test("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
+        do_test_bytes("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
     }
 
     #[test]
@@ -488,6 +572,7 @@ mod test {
     #[test]
     fn test_dot_s() {
         do_test("(?s).", 200, 65536, 256);
+        do_test_bytes("(?s-u).", 256, 256, 2048);
     }
 
     #[test]
@@ -495,6 +580,16 @@ mod test {
         do_test("\\d+", 1, 65536, 256);
     }
 
+    #[test]
+    fn test_non_utf8_byte_strings() {
+        do_test_bytes(r"(?-u)[\xC0-\xFF]\x20", 64, 64, 512);
+        do_test_bytes(r"(?-u)\x20[\x80-\xBF]", 64, 64, 512);
+        do_test_bytes(r#"(?x-u)
+  \xed (( ( \xa0\x80 | \xad\xbf | \xae\x80 | \xaf\xbf )
+          ( \xed ( \xb0\x80 | \xbf\xbf ) )? )
+        | \xb0\x80 | \xbe\x80 | \xbf\xbf )"#, 15, 15, 120);
+    }
+
     fn assert_send_and_sync<T: Send + Sync>(_: T) {}
 
     #[test]