syntax: rename 'allow_invalid_utf8' to 'utf8'

This also inverts its meaning, i.e., utf8=!allow_invalid_utf8. This naming is consistent with the naming used in regex-automata. In general, I find names without negations in them to be clearer, since they avoid double negations.
rust-lang · Apr 17, 2023 · 706b07d · 706b07d
1 parent ba9b786
commit 706b07d
Show file tree

Hide file tree

Showing 6 changed files with 50 additions and 49 deletions.
diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs
@@ -2253,11 +2253,7 @@ mod tests {
     use super::*;
 
     fn parse(pattern: &str) -> Hir {
-        crate::ParserBuilder::new()
-            .allow_invalid_utf8(true)
-            .build()
-            .parse(pattern)
-            .unwrap()
+        crate::ParserBuilder::new().utf8(false).build().parse(pattern).unwrap()
     }
 
     fn prefixes(pattern: &str) -> Seq {

diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
@@ -71,7 +71,7 @@ pub enum ErrorKind {
     /// support is disabled. For example `(?-u:\pL)` would trigger this error.
     UnicodeNotAllowed,
     /// This error occurs when translating a pattern that could match a byte
-    /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
+    /// sequence that isn't UTF-8 and `utf8` was enabled.
     InvalidUtf8,
     /// This occurs when an unrecognized Unicode property name could not
     /// be found.

diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs
@@ -309,7 +309,7 @@ mod tests {
     }
 
     fn roundtrip_bytes(given: &str, expected: &str) {
-        roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected);
+        roundtrip_with(|b| b.utf8(false), given, expected);
     }
 
     fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)

diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
@@ -18,7 +18,7 @@ type Result<T> = core::result::Result<T, Error>;
 /// A builder for constructing an AST->HIR translator.
 #[derive(Clone, Debug)]
 pub struct TranslatorBuilder {
-    allow_invalid_utf8: bool,
+    utf8: bool,
     flags: Flags,
 }
 
@@ -31,34 +31,35 @@ impl Default for TranslatorBuilder {
 impl TranslatorBuilder {
     /// Create a new translator builder with a default configuration.
     pub fn new() -> TranslatorBuilder {
-        TranslatorBuilder {
-            allow_invalid_utf8: false,
-            flags: Flags::default(),
-        }
+        TranslatorBuilder { utf8: true, flags: Flags::default() }
     }
 
     /// Build a translator using the current configuration.
     pub fn build(&self) -> Translator {
         Translator {
             stack: RefCell::new(vec![]),
             flags: Cell::new(self.flags),
-            allow_invalid_utf8: self.allow_invalid_utf8,
+            utf8: self.utf8,
         }
     }
 
-    /// When enabled, translation will permit the construction of a regular
+    /// When disabled, translation will permit the construction of a regular
     /// expression that may match invalid UTF-8.
     ///
-    /// When disabled (the default), the translator is guaranteed to produce
-    /// an expression that will only ever match valid UTF-8 (otherwise, the
-    /// translator will return an error).
+    /// When enabled (the default), the translator is guaranteed to produce an
+    /// expression that, for non-empty matches, will only ever produce spans
+    /// that are entirely valid UTF-8 (otherwise, the translator will return an
+    /// error).
     ///
-    /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
-    /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
-    /// the parser to return an error. Namely, a negated ASCII word boundary
-    /// can result in matching positions that aren't valid UTF-8 boundaries.
-    pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
-        self.allow_invalid_utf8 = yes;
+    /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
+    /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
+    /// syntax) will be allowed even though they can produce matches that split
+    /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
+    /// matches, and it is expected that the regex engine itself must handle
+    /// these cases if necessary (perhaps by suppressing any zero-width matches
+    /// that split a codepoint).
+    pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
+        self.utf8 = yes;
         self
     }
 
@@ -112,7 +113,7 @@ pub struct Translator {
     /// The current flag settings.
     flags: Cell<Flags>,
     /// Whether HIR must only match valid UTF-8 (false permits arbitrary bytes).
-    allow_invalid_utf8: bool,
+    utf8: bool,
 }
 
 impl Translator {
@@ -162,8 +163,8 @@ enum HirFrame {
     /// recursive structure).
     ///
     /// Byte character classes are created when Unicode mode (`u`) is disabled.
-    /// If `allow_invalid_utf8` is disabled (the default), then a byte
-    /// character is only permitted to match ASCII text.
+    /// If `utf8` is enabled (the default), then a byte character is only
+    /// permitted to match ASCII text.
     ClassBytes(hir::ClassBytes),
     /// This is pushed whenever a repetition is observed. After visiting every
     /// sub-expression in the repetition, the translator's stack is expected to
@@ -805,7 +806,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         if byte <= 0x7F {
             return Ok(Either::Left(char::try_from(byte).unwrap()));
         }
-        if !self.trans().allow_invalid_utf8 {
+        if self.trans().utf8 {
             return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
         }
         Ok(Either::Right(byte))
@@ -856,7 +857,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
     }
 
     fn hir_dot(&self, span: Span) -> Result<Hir> {
-        if !self.flags().unicode() && !self.trans().allow_invalid_utf8 {
+        if !self.flags().unicode() && self.trans().utf8 {
             return Err(self.error(span, ErrorKind::InvalidUtf8));
         }
         Ok(Hir::dot(self.flags().dot()))
@@ -890,7 +891,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
                     // It is possible for negated ASCII word boundaries to
                     // match at invalid UTF-8 boundaries, even when searching
                     // valid UTF-8.
-                    if !self.trans().allow_invalid_utf8 {
+                    if self.trans().utf8 {
                         return Err(
                             self.error(asst.span, ErrorKind::InvalidUtf8)
                         );
@@ -1039,7 +1040,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         // Negating a Perl byte class is likely to cause it to match invalid
         // UTF-8. That's only OK if the translator is configured to allow such
         // things.
-        if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
+        if self.trans().utf8 && !class.is_all_ascii() {
             return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
         }
         Ok(class)
@@ -1107,7 +1108,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         if negated {
             class.negate();
         }
-        if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
+        if self.trans().utf8 && !class.is_all_ascii() {
             return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
         }
         Ok(())
@@ -1313,23 +1314,23 @@ mod tests {
 
     fn t(pattern: &str) -> Hir {
         TranslatorBuilder::new()
-            .allow_invalid_utf8(false)
+            .utf8(true)
             .build()
             .translate(pattern, &parse(pattern))
             .unwrap()
     }
 
     fn t_err(pattern: &str) -> hir::Error {
         TranslatorBuilder::new()
-            .allow_invalid_utf8(false)
+            .utf8(true)
             .build()
             .translate(pattern, &parse(pattern))
             .unwrap_err()
     }
 
     fn t_bytes(pattern: &str) -> Hir {
         TranslatorBuilder::new()
-            .allow_invalid_utf8(true)
+            .utf8(false)
             .build()
             .translate(pattern, &parse(pattern))
             .unwrap()

diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs
@@ -74,19 +74,23 @@ impl ParserBuilder {
         self
     }
 
-    /// When enabled, the parser will permit the construction of a regular
+    /// When disabled, translation will permit the construction of a regular
     /// expression that may match invalid UTF-8.
     ///
-    /// When disabled (the default), the parser is guaranteed to produce
-    /// an expression that will only ever match valid UTF-8 (otherwise, the
-    /// parser will return an error).
-    ///
-    /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
-    /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
-    /// the parser to return an error. Namely, a negated ASCII word boundary
-    /// can result in matching positions that aren't valid UTF-8 boundaries.
-    pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder {
-        self.hir.allow_invalid_utf8(yes);
+    /// When enabled (the default), the translator is guaranteed to produce an
+    /// expression that, for non-empty matches, will only ever produce spans
+    /// that are entirely valid UTF-8 (otherwise, the translator will return an
+    /// error).
+    ///
+    /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
+    /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
+    /// syntax) will be allowed even though they can produce matches that split
+    /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
+    /// matches, and it is expected that the regex engine itself must handle
+    /// these cases if necessary (perhaps by suppressing any zero-width matches
+    /// that split a codepoint).
+    pub fn utf8(&mut self, yes: bool) -> &mut ParserBuilder {
+        self.hir.utf8(yes);
         self
     }
 
@@ -144,9 +148,9 @@ impl ParserBuilder {
     /// By default this is **enabled**. It may alternatively be selectively
     /// disabled in the regular expression itself via the `u` flag.
     ///
-    /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
-    /// default), a regular expression will fail to parse if Unicode mode is
-    /// disabled and a sub-expression could possibly match invalid UTF-8.
+    /// Note that unless `utf8` is disabled (it's enabled by default), a
+    /// regular expression will fail to parse if Unicode mode is disabled and a
+    /// sub-expression could possibly match invalid UTF-8.
     pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
         self.hir.unicode(yes);
         self

diff --git a/src/exec.rs b/src/exec.rs
@@ -246,7 +246,7 @@ impl ExecBuilder {
                 .swap_greed(self.options.swap_greed)
                 .ignore_whitespace(self.options.ignore_whitespace)
                 .unicode(self.options.unicode)
-                .allow_invalid_utf8(!self.only_utf8)
+                .utf8(self.only_utf8)
                 .nest_limit(self.options.nest_limit)
                 .build();
             let expr =