Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix boundary handling in Regex auto-atomicity optimization #79088

Merged
1 commit merged on Dec 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number | Diff line number | Diff line change
Expand Up @@ -2117,10 +2117,10 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && node.Ch != subsequent.Ch:
case RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic when subsequent.M == 0 && node.Ch == subsequent.Ch:
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
case RegexNodeKind.Boundary when RegexCharClass.IsBoundaryWordChar(node.Ch):
case RegexNodeKind.NonBoundary when !RegexCharClass.IsBoundaryWordChar(node.Ch):
case RegexNodeKind.ECMABoundary when RegexCharClass.IsECMAWordChar(node.Ch):
case RegexNodeKind.NonECMABoundary when !RegexCharClass.IsECMAWordChar(node.Ch):
case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsBoundaryWordChar(node.Ch):
case RegexNodeKind.NonBoundary when node.M > 0 && !RegexCharClass.IsBoundaryWordChar(node.Ch):
case RegexNodeKind.ECMABoundary when node.M > 0 && RegexCharClass.IsECMAWordChar(node.Ch):
case RegexNodeKind.NonECMABoundary when node.M > 0 && !RegexCharClass.IsECMAWordChar(node.Ch):
// The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
break;

Expand Down Expand Up @@ -2163,10 +2163,10 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i

case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
case RegexNodeKind.Boundary when node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
case RegexNodeKind.NonBoundary when node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
case RegexNodeKind.ECMABoundary when node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
case RegexNodeKind.NonECMABoundary when node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
case RegexNodeKind.Boundary when node.M > 0 && node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
case RegexNodeKind.NonBoundary when node.M > 0 && node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
case RegexNodeKind.ECMABoundary when node.M > 0 && node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
case RegexNodeKind.NonECMABoundary when node.M > 0 && node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
// The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
break;

Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -292,6 +292,25 @@ public static IEnumerable<object[]> Matches_TestData()
}
};

foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.ECMAScript })
{
if (RegexHelpers.IsNonBacktracking(engine))
{
continue;
}

yield return new object[]
{
engine,
@"a?\b", "ac", options,
new[]
{
new CaptureData("", 0, 0),
new CaptureData("", 2, 0),
}
};
}

if (!PlatformDetection.IsNetFramework)
{
// .NET Framework missing fix in https://github.com/dotnet/runtime/pull/1075
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -334,8 +334,10 @@ public class RegexReductionTests
[InlineData("[^\n]*\n+", "(?>[^\n]*)(?>\n+)")]
[InlineData("(a+)b", "((?>a+))b")]
[InlineData("a*(?:bcd|efg)", "(?>a*)(?:bcd|efg)")]
[InlineData("\\w*\\b", "(?>\\w*)\\b")]
[InlineData("\\d*\\b", "(?>\\d*)\\b")]
[InlineData("\\w+\\b", "(?>\\w+)\\b")]
[InlineData("\\d+\\b", "(?>\\d+)\\b")]
[InlineData("\\W+\\B", "(?>\\W+)\\B")]
[InlineData("\\D+\\B", "(?>\\D+)\\B")]
[InlineData("(?:abc*|def*)g", "(?:ab(?>c*)|de(?>f*))g")]
[InlineData("(?:a[ce]*|b*)g", "(?:a(?>[ce]*)|(?>b*))g")]
[InlineData("(?:a[ce]*|b*)c", "(?:a[ce]*|(?>b*))c")]
Expand Down Expand Up @@ -476,6 +478,11 @@ public void PatternsReduceIdentically(string actual, string expected)
[InlineData(@"\w*\b\w+", @"(?>\w*)\b\w+")]
[InlineData(@"\W+\B\W+", @"(?>\W+)\B\W")]
[InlineData(@"\W*\B\W+", @"(?>\W*)\B\W")]
[InlineData(@"a?\b", @"(?>a?)\b")]
[InlineData(@"\w*\b", @"(?>\w*)\b")]
[InlineData(@"\d*\b", @"(?>\d*)\b")]
[InlineData(@"\W*\B", @"(?>\W*)\B")]
[InlineData(@"\D*\B", @"(?>\D*)\B")]
// Loops inside alternation constructs
[InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")]
[InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]
Expand Down