Skip to content

Commit

Permalink
syntax: rename 'allow_invalid_utf8' to 'utf8'
Browse files Browse the repository at this point in the history
This also inverts its meaning, i.e., utf8=!allow_invalid_utf8. This
naming is consistent with the naming used in regex-automata. In general,
I find that using names without negations in them is clearer, since
it avoids double negations.
  • Loading branch information
BurntSushi committed Apr 17, 2023
1 parent ba9b786 commit 706b07d
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 49 deletions.
6 changes: 1 addition & 5 deletions regex-syntax/src/hir/literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2253,11 +2253,7 @@ mod tests {
use super::*;

fn parse(pattern: &str) -> Hir {
crate::ParserBuilder::new()
.allow_invalid_utf8(true)
.build()
.parse(pattern)
.unwrap()
crate::ParserBuilder::new().utf8(false).build().parse(pattern).unwrap()
}

fn prefixes(pattern: &str) -> Seq {
Expand Down
2 changes: 1 addition & 1 deletion regex-syntax/src/hir/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ pub enum ErrorKind {
/// support is disabled. For example `(?-u:\pL)` would trigger this error.
UnicodeNotAllowed,
/// This error occurs when translating a pattern that could match a byte
/// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
/// sequence that isn't UTF-8 and `utf8` was enabled.
InvalidUtf8,
/// This occurs when an unrecognized Unicode property name could not
/// be found.
Expand Down
2 changes: 1 addition & 1 deletion regex-syntax/src/hir/print.rs
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ mod tests {
}

fn roundtrip_bytes(given: &str, expected: &str) {
roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected);
roundtrip_with(|b| b.utf8(false), given, expected);
}

fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
Expand Down
55 changes: 28 additions & 27 deletions regex-syntax/src/hir/translate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ type Result<T> = core::result::Result<T, Error>;
/// A builder for constructing an AST->HIR translator.
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
allow_invalid_utf8: bool,
utf8: bool,
flags: Flags,
}

Expand All @@ -31,34 +31,35 @@ impl Default for TranslatorBuilder {
impl TranslatorBuilder {
/// Create a new translator builder with a default configuration.
pub fn new() -> TranslatorBuilder {
TranslatorBuilder {
allow_invalid_utf8: false,
flags: Flags::default(),
}
TranslatorBuilder { utf8: true, flags: Flags::default() }
}

/// Build a translator using the current configuration.
pub fn build(&self) -> Translator {
Translator {
stack: RefCell::new(vec![]),
flags: Cell::new(self.flags),
allow_invalid_utf8: self.allow_invalid_utf8,
utf8: self.utf8,
}
}

/// When enabled, translation will permit the construction of a regular
/// When disabled, translation will permit the construction of a regular
/// expression that may match invalid UTF-8.
///
/// When disabled (the default), the translator is guaranteed to produce
/// an expression that will only ever match valid UTF-8 (otherwise, the
/// translator will return an error).
/// When enabled (the default), the translator is guaranteed to produce an
/// expression that, for non-empty matches, will only ever produce spans
/// that are entirely valid UTF-8 (otherwise, the translator will return an
/// error).
///
/// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
/// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
/// the parser to return an error. Namely, a negated ASCII word boundary
/// can result in matching positions that aren't valid UTF-8 boundaries.
pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
self.allow_invalid_utf8 = yes;
/// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
/// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
/// syntax) will be allowed even though they can produce matches that split
/// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
/// matches, and it is expected that the regex engine itself must handle
/// these cases if necessary (perhaps by suppressing any zero-width matches
/// that split a codepoint).
pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
self.utf8 = yes;
self
}

Expand Down Expand Up @@ -112,7 +113,7 @@ pub struct Translator {
/// The current flag settings.
flags: Cell<Flags>,
/// Whether we're allowed to produce HIR that can match arbitrary bytes.
allow_invalid_utf8: bool,
utf8: bool,
}

impl Translator {
Expand Down Expand Up @@ -162,8 +163,8 @@ enum HirFrame {
/// recursive structure).
///
/// Byte character classes are created when Unicode mode (`u`) is disabled.
/// If `allow_invalid_utf8` is disabled (the default), then a byte
/// character is only permitted to match ASCII text.
/// If `utf8` is enabled (the default), then a byte character is only
/// permitted to match ASCII text.
ClassBytes(hir::ClassBytes),
/// This is pushed whenever a repetition is observed. After visiting every
/// sub-expression in the repetition, the translator's stack is expected to
Expand Down Expand Up @@ -805,7 +806,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
if byte <= 0x7F {
return Ok(Either::Left(char::try_from(byte).unwrap()));
}
if !self.trans().allow_invalid_utf8 {
if self.trans().utf8 {
return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
}
Ok(Either::Right(byte))
Expand Down Expand Up @@ -856,7 +857,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
}

fn hir_dot(&self, span: Span) -> Result<Hir> {
if !self.flags().unicode() && !self.trans().allow_invalid_utf8 {
if !self.flags().unicode() && self.trans().utf8 {
return Err(self.error(span, ErrorKind::InvalidUtf8));
}
Ok(Hir::dot(self.flags().dot()))
Expand Down Expand Up @@ -890,7 +891,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
// It is possible for negated ASCII word boundaries to
// match at invalid UTF-8 boundaries, even when searching
// valid UTF-8.
if !self.trans().allow_invalid_utf8 {
if self.trans().utf8 {
return Err(
self.error(asst.span, ErrorKind::InvalidUtf8)
);
Expand Down Expand Up @@ -1039,7 +1040,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
// Negating a Perl byte class is likely to cause it to match invalid
// UTF-8. That's only OK if the translator is configured to allow such
// things.
if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
if self.trans().utf8 && !class.is_all_ascii() {
return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
}
Ok(class)
Expand Down Expand Up @@ -1107,7 +1108,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
if negated {
class.negate();
}
if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
if self.trans().utf8 && !class.is_all_ascii() {
return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
}
Ok(())
Expand Down Expand Up @@ -1313,23 +1314,23 @@ mod tests {

fn t(pattern: &str) -> Hir {
TranslatorBuilder::new()
.allow_invalid_utf8(false)
.utf8(true)
.build()
.translate(pattern, &parse(pattern))
.unwrap()
}

fn t_err(pattern: &str) -> hir::Error {
TranslatorBuilder::new()
.allow_invalid_utf8(false)
.utf8(true)
.build()
.translate(pattern, &parse(pattern))
.unwrap_err()
}

fn t_bytes(pattern: &str) -> Hir {
TranslatorBuilder::new()
.allow_invalid_utf8(true)
.utf8(false)
.build()
.translate(pattern, &parse(pattern))
.unwrap()
Expand Down
32 changes: 18 additions & 14 deletions regex-syntax/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,19 +74,23 @@ impl ParserBuilder {
self
}

/// When enabled, the parser will permit the construction of a regular
/// When disabled, translation will permit the construction of a regular
/// expression that may match invalid UTF-8.
///
/// When disabled (the default), the parser is guaranteed to produce
/// an expression that will only ever match valid UTF-8 (otherwise, the
/// parser will return an error).
///
/// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
/// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
/// the parser to return an error. Namely, a negated ASCII word boundary
/// can result in matching positions that aren't valid UTF-8 boundaries.
pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder {
self.hir.allow_invalid_utf8(yes);
/// When enabled (the default), the translator is guaranteed to produce an
/// expression that, for non-empty matches, will only ever produce spans
/// that are entirely valid UTF-8 (otherwise, the translator will return an
/// error).
///
/// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
/// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
/// syntax) will be allowed even though they can produce matches that split
/// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
/// matches, and it is expected that the regex engine itself must handle
/// these cases if necessary (perhaps by suppressing any zero-width matches
/// that split a codepoint).
pub fn utf8(&mut self, yes: bool) -> &mut ParserBuilder {
self.hir.utf8(yes);
self
}

Expand Down Expand Up @@ -144,9 +148,9 @@ impl ParserBuilder {
/// By default this is **enabled**. It may alternatively be selectively
/// disabled in the regular expression itself via the `u` flag.
///
/// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
/// default), a regular expression will fail to parse if Unicode mode is
/// disabled and a sub-expression could possibly match invalid UTF-8.
/// Note that unless `utf8` is disabled (it's enabled by default), a
/// regular expression will fail to parse if Unicode mode is disabled and a
/// sub-expression could possibly match invalid UTF-8.
pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
self.hir.unicode(yes);
self
Expand Down
2 changes: 1 addition & 1 deletion src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ impl ExecBuilder {
.swap_greed(self.options.swap_greed)
.ignore_whitespace(self.options.ignore_whitespace)
.unicode(self.options.unicode)
.allow_invalid_utf8(!self.only_utf8)
.utf8(self.only_utf8)
.nest_limit(self.options.nest_limit)
.build();
let expr =
Expand Down

0 comments on commit 706b07d

Please sign in to comment.