Use StartsWith(..., OrdinalIgnoreCase) in RegexCompiler / source gene…

…rator (#66339) * Use StartsWith(..., OrdinalIgnoreCase) in RegexCompiler / source generator When we encounter a sequence of sets representing case-insensitive ASCII, we can simplify the code generated to just call StartsWith, which both makes it more readable but also takes advantage of the new JIT optimization to lower that into efficient vectorized comparisons based on the supplied literal. This also cleans up some formatting in the source generator emitted code to make things much more concise and less noisy. * Address PR feedback
dotnet · Mar 10, 2022 · 28580bf · 28580bf
1 parent cdb1d26
commit 28580bf
Show file tree

Hide file tree

Showing 4 changed files with 280 additions and 103 deletions.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -1021,6 +1021,9 @@ void CaseGoto(string clause, string label)
             }
 
             // Whether the node has RegexOptions.IgnoreCase set.
+            // TODO: https://github.com/dotnet/runtime/issues/61048. We should be able to delete this and all usage sites once
+            // IgnoreCase is erradicated from the tree.  The only place it should possibly be left after that work is in a Backreference,
+            // and that can do this check directly as needed.
             static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0;
 
             // Slices the inputSpan starting at pos until end and stores it into slice.
@@ -1127,7 +1130,7 @@ void EmitAlternation(RegexNode node)
                         // If it's not a One, Multi, or Set, we can't apply this optimization.
                         // If it's IgnoreCase (and wasn't reduced to a non-IgnoreCase set), also ignore it to keep the logic simple.
                         if (node.Child(i).FindBranchOneMultiOrSetStart() is not RegexNode oneMultiOrSet ||
-                            (oneMultiOrSet.Options & RegexOptions.IgnoreCase) != 0) // TODO: https://github.com/dotnet/runtime/issues/61048
+                            IsCaseInsensitive(oneMultiOrSet))
                         {
                             useSwitchedBranches = false;
                             break;
@@ -1208,7 +1211,7 @@ void EmitSwitchedBranches()
 
                             RegexNode? childStart = child.FindBranchOneMultiOrSetStart();
                             Debug.Assert(childStart is not null, "Unexpectedly couldn't find the branch starting node.");
-                            Debug.Assert((childStart.Options & RegexOptions.IgnoreCase) == 0, "Expected only to find non-IgnoreCase branch starts");
+                            Debug.Assert(!IsCaseInsensitive(childStart), "Expected only to find non-IgnoreCase branch starts");
 
                             if (childStart.Kind is RegexNodeKind.Set)
                             {
@@ -2051,14 +2054,15 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
                     TransferSliceStaticPosToPos();
                 }
 
-                // Separate out several node types that, for conciseness, don't need a header and scope written into the source.
+                // Separate out several node types that, for conciseness, don't need a header nor scope written into the source.
+                // Effectively these either evaporate, are completely self-explanatory, or only exist for their children to be rendered.
                 switch (node.Kind)
                 {
-                    // Nothing is written for an empty
+                    // Nothing is written for an empty.
                     case RegexNodeKind.Empty:
                         return;
 
-                    // A match failure doesn't need a scope.
+                    // A single-line goto for a failure doesn't need a scope or comment.
                     case RegexNodeKind.Nothing:
                         Goto(doneLabel);
                         return;
@@ -2075,79 +2079,95 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
                         return;
                 }
 
-                // Put the node's code into its own scope. If the node contains labels that may need to
-                // be visible outside of its scope, the scope is still emitted for clarity but is commented out.
-                using (EmitScope(writer, DescribeNode(node, analysis), faux: analysis.MayBacktrack(node)))
+                // For everything else, output a comment about what the node is.
+                writer.WriteLine($"// {DescribeNode(node, analysis)}");
+
+                // Separate out several node types that, for conciseness, don't need a scope written into the source as they're
+                // always a single statement / block.
+                switch (node.Kind)
                 {
-                    switch (node.Kind)
-                    {
-                        case RegexNodeKind.Beginning:
-                        case RegexNodeKind.Start:
-                        case RegexNodeKind.Bol:
-                        case RegexNodeKind.Eol:
-                        case RegexNodeKind.End:
-                        case RegexNodeKind.EndZ:
-                            EmitAnchors(node);
-                            break;
+                    case RegexNodeKind.Beginning:
+                    case RegexNodeKind.Start:
+                    case RegexNodeKind.Bol:
+                    case RegexNodeKind.Eol:
+                    case RegexNodeKind.End:
+                    case RegexNodeKind.EndZ:
+                        EmitAnchors(node);
+                        return;
 
-                        case RegexNodeKind.Boundary:
-                        case RegexNodeKind.NonBoundary:
-                        case RegexNodeKind.ECMABoundary:
-                        case RegexNodeKind.NonECMABoundary:
-                            EmitBoundary(node);
-                            break;
+                    case RegexNodeKind.Boundary:
+                    case RegexNodeKind.NonBoundary:
+                    case RegexNodeKind.ECMABoundary:
+                    case RegexNodeKind.NonECMABoundary:
+                        EmitBoundary(node);
+                        return;
 
+                    case RegexNodeKind.One:
+                    case RegexNodeKind.Notone:
+                    case RegexNodeKind.Set:
+                        EmitSingleChar(node, emitLengthChecksIfRequired);
+                        return;
+
+                    case RegexNodeKind.Multi when !IsCaseInsensitive(node) && (node.Options & RegexOptions.RightToLeft) == 0:
+                        EmitMultiChar(node, emitLengthChecksIfRequired);
+                        return;
+
+                    case RegexNodeKind.UpdateBumpalong:
+                        EmitUpdateBumpalong(node);
+                        return;
+                }
+
+                // For everything else, put the node's code into its own scope, purely for readability. If the node contains labels
+                // that may need to be visible outside of its scope, the scope is still emitted for clarity but is commented out.
+                using (EmitScope(writer, null, faux: analysis.MayBacktrack(node)))
+                {
+                    switch (node.Kind)
+                    {
                         case RegexNodeKind.Multi:
                             EmitMultiChar(node, emitLengthChecksIfRequired);
-                            break;
-
-                        case RegexNodeKind.One:
-                        case RegexNodeKind.Notone:
-                        case RegexNodeKind.Set:
-                            EmitSingleChar(node, emitLengthChecksIfRequired);
-                            break;
+                            return;
 
                         case RegexNodeKind.Oneloop:
                         case RegexNodeKind.Notoneloop:
                         case RegexNodeKind.Setloop:
                             EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired);
-                            break;
+                            return;
 
                         case RegexNodeKind.Onelazy:
                         case RegexNodeKind.Notonelazy:
                         case RegexNodeKind.Setlazy:
                             EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
-                            break;
+                            return;
 
                         case RegexNodeKind.Oneloopatomic:
                         case RegexNodeKind.Notoneloopatomic:
                         case RegexNodeKind.Setloopatomic:
                             EmitSingleCharAtomicLoop(node, emitLengthChecksIfRequired);
-                            break;
+                            return;
 
                         case RegexNodeKind.Loop:
                             EmitLoop(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.Lazyloop:
                             EmitLazy(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.Alternate:
                             EmitAlternation(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.Backreference:
                             EmitBackreference(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.BackreferenceConditional:
                             EmitBackreferenceConditional(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.ExpressionConditional:
                             EmitExpressionConditional(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.Atomic:
                             Debug.Assert(analysis.MayBacktrack(node.Child(0)));
@@ -2156,25 +2176,20 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
 
                         case RegexNodeKind.Capture:
                             EmitCapture(node, subsequent);
-                            break;
+                            return;
 
                         case RegexNodeKind.PositiveLookaround:
                             EmitPositiveLookaroundAssertion(node);
-                            break;
+                            return;
 
                         case RegexNodeKind.NegativeLookaround:
                             EmitNegativeLookaroundAssertion(node);
-                            break;
-
-                        case RegexNodeKind.UpdateBumpalong:
-                            EmitUpdateBumpalong(node);
-                            break;
-
-                        default:
-                            Debug.Fail($"Unexpected node type: {node.Kind}");
-                            break;
+                            return;
                     }
                 }
+
+                // All nodes should have been handled.
+                Debug.Fail($"Unexpected node type: {node.Kind}");
             }
 
             // Emits the node for an atomic.
@@ -2230,7 +2245,8 @@ void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChe
                 for (int i = 0; i < childCount; i++)
                 {
                     // If we can find a subsequence of fixed-length children, we can emit a length check once for that sequence
-                    // and then skip the individual length checks for each.  We also want to minimize the repetition of if blocks,
+                    // and then skip the individual length checks for each.  We can also discover case-insensitive sequences that
+                    // can be checked efficiently with methods like StartsWith. We also want to minimize the repetition of if blocks,
                     // and so we try to emit a series of clauses all part of the same if block rather than one if block per child.
                     if ((node.Options & RegexOptions.RightToLeft) == 0 &&
                         emitLengthChecksIfRequired &&
@@ -2243,7 +2259,7 @@ void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChe
                         {
                             for (; i < exclusiveEnd; i++)
                             {
-                                void WriteSingleCharChild(RegexNode child, bool includeDescription = true)
+                                void WritePrefix()
                                 {
                                     if (wroteClauses)
                                     {
@@ -2254,31 +2270,41 @@ void WriteSingleCharChild(RegexNode child, bool includeDescription = true)
                                     {
                                         writer.Write("if (");
                                     }
-                                    EmitSingleChar(child, emitLengthCheck: false, clauseOnly: true);
-                                    prevDescription = includeDescription ? DescribeNode(child, analysis) : null;
-                                    wroteClauses = true;
                                 }
 
                                 RegexNode child = node.Child(i);
-                                if (child.Kind is RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set)
+                                if (node.TryGetOrdinalCaseInsensitiveString(i, exclusiveEnd, out int nodesConsumed, out string? caseInsensitiveString))
                                 {
-                                    WriteSingleCharChild(child);
+                                    WritePrefix();
+                                    string sourceSpan = sliceStaticPos > 0 ? $"{sliceSpan}.Slice({sliceStaticPos})" : sliceSpan;
+                                    writer.Write($"!global::System.MemoryExtensions.StartsWith({sourceSpan}, {Literal(caseInsensitiveString)}, global::System.StringComparison.OrdinalIgnoreCase)");
+                                    prevDescription = $"Match the string {Literal(caseInsensitiveString)} (ordinal case-insensitive)";
+                                    wroteClauses = true;
+
+                                    sliceStaticPos += caseInsensitiveString.Length;
+                                    i += nodesConsumed - 1;
                                 }
-                                else if (child.Kind is RegexNodeKind.Oneloop or RegexNodeKind.Onelazy or RegexNodeKind.Oneloopatomic or
-                                                       RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic or
-                                                       RegexNodeKind.Notoneloop or RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloopatomic &&
+                                else if (child.Kind is RegexNodeKind.Multi && !IsCaseInsensitive(child))
+                                {
+                                    WritePrefix();
+                                    EmitMultiCharString(child.Str!, caseInsensitive: false, emitLengthCheck: false, clauseOnly: true, rightToLeft: false);
+                                    prevDescription = DescribeNode(child, analysis);
+                                    wroteClauses = true;
+                                }
+                                else if ((child.IsOneFamily || child.IsNotoneFamily || child.IsSetFamily) &&
                                          child.M == child.N &&
                                          child.M <= MaxUnrollSize)
                                 {
-                                    for (int c = 0; c < child.M; c++)
+                                    int repeatCount = child.Kind is RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set ? 1 : child.M;
+                                    for (int c = 0; c < repeatCount; c++)
                                     {
-                                        WriteSingleCharChild(child, includeDescription: c == 0);
+                                        WritePrefix();
+                                        EmitSingleChar(child, emitLengthCheck: false, clauseOnly: true);
+                                        prevDescription = c == 0 ? DescribeNode(child, analysis) : null;
+                                        wroteClauses = true;
                                     }
                                 }
-                                else
-                                {
-                                    break;
-                                }
+                                else break;
                             }
 
                             if (wroteClauses)
@@ -2486,12 +2512,13 @@ void EmitMultiChar(RegexNode node, bool emitLengthCheck)
             {
                 Debug.Assert(node.Kind is RegexNodeKind.Multi, $"Unexpected type: {node.Kind}");
                 Debug.Assert(node.Str is not null);
-                EmitMultiCharString(node.Str, IsCaseInsensitive(node), emitLengthCheck, (node.Options & RegexOptions.RightToLeft) != 0);
+                EmitMultiCharString(node.Str, IsCaseInsensitive(node), emitLengthCheck, clauseOnly: false, (node.Options & RegexOptions.RightToLeft) != 0);
             }
 
-            void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck, bool rightToLeft)
+            void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck, bool clauseOnly, bool rightToLeft)
             {
                 Debug.Assert(str.Length >= 2);
+                Debug.Assert(!clauseOnly || (!emitLengthCheck && !caseInsensitive && !rightToLeft));
 
                 if (rightToLeft)
                 {
@@ -2534,9 +2561,17 @@ void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck,
                 else
                 {
                     string sourceSpan = sliceStaticPos > 0 ? $"{sliceSpan}.Slice({sliceStaticPos})" : sliceSpan;
-                    using (EmitBlock(writer, $"if (!global::System.MemoryExtensions.StartsWith({sourceSpan}, {Literal(str)}))"))
+                    string clause = $"!global::System.MemoryExtensions.StartsWith({sourceSpan}, {Literal(str)})";
+                    if (clauseOnly)
                     {
-                        Goto(doneLabel);
+                        writer.Write(clause);
+                    }
+                    else
+                    {
+                        using (EmitBlock(writer, $"if ({clause})"))
+                        {
+                            Goto(doneLabel);
+                        }
                     }
                 }
 
@@ -3061,7 +3096,7 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true)
                     case <= RegexNode.MultiVsRepeaterLimit when node.IsOneFamily && !IsCaseInsensitive(node):
                         // This is a repeated case-sensitive character; emit it as a multi in order to get all the optimizations
                         // afforded to a multi, e.g. unrolling the loop with multi-char reads/comparisons at a time.
-                        EmitMultiCharString(new string(node.Ch, iterations), caseInsensitive: false, emitLengthCheck, rtl);
+                        EmitMultiCharString(new string(node.Ch, iterations), caseInsensitive: false, emitLengthCheck, clauseOnly: false, rtl);
                         return;
                 }
 

diff --git a/...aries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/...aries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -887,6 +887,20 @@ public static bool ParticipatesInCaseConversion(ReadOnlySpan<char> s)
             return false;
         }
 
+        /// <summary>Gets whether the specified span contains only ASCII.</summary>
+        public static bool IsAscii(ReadOnlySpan<char> s) // TODO https://github.com/dotnet/runtime/issues/28230: Replace once Ascii is available
+        {
+            foreach (char c in s)
+            {
+                if (c >= 128)
+                {
+                    return false;
+                }
+            }
+
+            return true;
+        }
+
         /// <summary>Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.</summary>
         /// <remarks>This may enumerate negated characters if the set is negated.</remarks>
         private static bool CanEasilyEnumerateSetContents(string set) =>