Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable Regex to use SearchValues<string> in compiled / source generator TryFindNextStartingPosition #88400

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -735,11 +735,15 @@ private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter w
case FindNextStartingPositionMode.LeadingString_LeftToRight:
case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight:
case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
EmitIndexOf_LeftToRight();
EmitIndexOfString_LeftToRight();
break;

case FindNextStartingPositionMode.LeadingString_RightToLeft:
EmitIndexOf_RightToLeft();
EmitIndexOfString_RightToLeft();
break;

case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
EmitIndexOfStrings_LeftToRight();
break;

case FindNextStartingPositionMode.LeadingSet_LeftToRight:
Expand Down Expand Up @@ -964,7 +968,7 @@ bool EmitAnchors()
}

// Emits a case-sensitive left-to-right search for a substring.
void EmitIndexOf_LeftToRight()
void EmitIndexOfString_LeftToRight()
{
RegexFindOptimizations opts = regexTree.FindOptimizations;

Expand Down Expand Up @@ -1010,8 +1014,43 @@ void EmitIndexOf_LeftToRight()
}
}

// Emits a left-to-right, ordinal (case-sensitive) search for the first occurrence of any of the
// pattern's candidate leading prefix strings. The search goes through a SearchValues<string>
// helper field that is registered in requiredHelpers the first time its name is needed.
void EmitIndexOfStrings_LeftToRight()
{
    RegexFindOptimizations findOptimizations = regexTree.FindOptimizations;
    Debug.Assert(findOptimizations.FindMode is FindNextStartingPositionMode.LeadingStrings_LeftToRight);

    // Render each prefix as a literal. The joined text is used both in the emitted
    // helper's initializer and as the input to the hash that names the helper field.
    string prefixLiterals = string.Join(", ", findOptimizations.LeadingPrefixes.Select(literal => Literal(literal)));

    // Name the helper field after the SHA-256 hash (hex, without dashes) of the joined literals.
    byte[] prefixHash;
    using (SHA256 hasher = SHA256.Create())
    {
#pragma warning disable CA1850 // SHA256.HashData isn't available on netstandard2.0
        prefixHash = hasher.ComputeHash(Encoding.UTF8.GetBytes(prefixLiterals));
#pragma warning restore CA1850
    }
    string helperFieldName = "s_indexOfAnyStrings_" + BitConverter.ToString(prefixHash).Replace("-", "");

    // Register the helper's declaration only if no helper with this name has been added yet.
    if (!requiredHelpers.ContainsKey(helperFieldName))
    {
        string[] helperDeclaration =
        {
            $"/// <summary>Supports searching for any of the strings {EscapeXmlComment(prefixLiterals)}.</summary>",
            $"internal static readonly SearchValues<string> {helperFieldName} = SearchValues.Create(new[] {{ {prefixLiterals} }}, StringComparison.Ordinal);",
        };
        requiredHelpers.Add(helperFieldName, helperDeclaration);
    }

    // Emit the search itself: on a hit, record the found position and report success.
    writer.WriteLine("// The pattern has multiple strings that could begin the match. Search for any of them.");
    writer.WriteLine("// If none can be found, there's no match.");
    writer.WriteLine($"int i = inputSpan.Slice(pos).IndexOfAny({HelpersTypeName}.{helperFieldName});");
    using (EmitBlock(writer, "if (i >= 0)"))
    {
        writer.WriteLine("base.runtextpos = pos + i;");
        writer.WriteLine("return true;");
    }
}

// Emits a case-sensitive right-to-left search for a substring.
void EmitIndexOf_RightToLeft()
void EmitIndexOfString_RightToLeft()
{
string prefix = regexTree.FindOptimizations.LeadingPrefix;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,34 +1,35 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Buffers;
using System.Globalization;

#pragma warning disable CA1823, CS0169, IDE0044 // Fields used via reflection

namespace System.Text.RegularExpressions
{
internal sealed class CompiledRegexRunner : RegexRunner
{
private readonly ScanDelegate _scanMethod;

private readonly SearchValues<char>[]? _searchValues;
/// <summary>Set if the regex uses any SearchValues instances. Accessed via reflection.</summary>
/// <remarks>If the array is non-null, this contains instances of SearchValues{char} or SearchValues{string}.</remarks>
private readonly object[]? _searchValues;

/// <summary>This field will only be set if the pattern contains backreferences and has RegexOptions.IgnoreCase</summary>
/// <summary>Set if the pattern contains backreferences and has RegexOptions.IgnoreCase. Accessed via reflection.</summary>
private readonly CultureInfo? _culture;

#pragma warning disable CA1823, CS0169, IDE0044 // Used via reflection to cache the Case behavior if needed.
/// <summary>Caches a RegexCaseBehavior. Accessed via reflection.</summary>
private RegexCaseBehavior _caseBehavior;
#pragma warning restore CA1823, CS0169, IDE0044

internal delegate void ScanDelegate(RegexRunner runner, ReadOnlySpan<char> text);

public CompiledRegexRunner(ScanDelegate scan, SearchValues<char>[]? searchValues, CultureInfo? culture)
public CompiledRegexRunner(ScanDelegate scan, object[]? searchValues, CultureInfo? culture)
{
_scanMethod = scan;
_searchValues = searchValues;
_culture = culture;
}

protected internal override void Scan(ReadOnlySpan<char> text)
=> _scanMethod(this, text);
protected internal override void Scan(ReadOnlySpan<char> text) => _scanMethod(this, text);
}
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Buffers;
using System.Globalization;
using System.Reflection.Emit;

Expand All @@ -10,14 +9,14 @@ namespace System.Text.RegularExpressions
internal sealed class CompiledRegexRunnerFactory : RegexRunnerFactory
{
private readonly DynamicMethod _scanMethod;
private readonly SearchValues<char>[]? _searchValues;
private readonly object[]? _searchValues;
/// <summary>This field will only be set if the pattern has backreferences and uses RegexOptions.IgnoreCase</summary>
private readonly CultureInfo? _culture;

// Delegate is lazily created to avoid forcing JIT'ing until the regex is actually executed.
private CompiledRegexRunner.ScanDelegate? _scan;

public CompiledRegexRunnerFactory(DynamicMethod scanMethod, SearchValues<char>[]? searchValues, CultureInfo? culture)
public CompiledRegexRunnerFactory(DynamicMethod scanMethod, object[]? searchValues, CultureInfo? culture)
{
_scanMethod = scanMethod;
_searchValues = searchValues;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ internal abstract class RegexCompiler
private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnySearchValues = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(SearchValues<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnySearchValuesString = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<char>), typeof(SearchValues<string>) })!;
private static readonly MethodInfo s_spanIndexOfAnyExceptChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyExceptCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyExceptCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAnyExcept", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
Expand Down Expand Up @@ -114,8 +115,8 @@ internal abstract class RegexCompiler
/// <summary>Whether this expression has a non-infinite timeout.</summary>
protected bool _hasTimeout;

/// <summary><see cref="SearchValues{T}"/> instances used by the expression. For now these are only ASCII sets.</summary>
protected List<SearchValues<char>>? _searchValues;
/// <summary><see cref="SearchValues{T}"/> instances used by the expression.</summary>
protected List<object>? _searchValues;

/// <summary>Pool of Int32 LocalBuilders.</summary>
private Stack<LocalBuilder>? _int32LocalsPool;
Expand Down Expand Up @@ -459,6 +460,7 @@ protected void EmitTryFindNextPossibleStartingPosition()
{
case FindNextStartingPositionMode.LeadingString_LeftToRight:
case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight:
case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
EmitIndexOf_LeftToRight();
break;
Expand Down Expand Up @@ -744,15 +746,15 @@ bool EmitAnchors()
return false;
}

// Emits a case-sensitive left-to-right search for a substring.
// Emits a case-sensitive left-to-right search for a substring or substrings.
void EmitIndexOf_LeftToRight()
{
RegexFindOptimizations opts = _regexTree.FindOptimizations;
Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight);
Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight or FindNextStartingPositionMode.LeadingStrings_LeftToRight);

using RentedLocalBuilder i = RentInt32Local();

// int i = inputSpan.Slice(pos).IndexOf(prefix);
// int i = inputSpan.Slice(pos)...
Ldloca(inputSpan);
Ldloc(pos);
if (opts.FindMode is FindNextStartingPositionMode.FixedDistanceString_LeftToRight &&
Expand All @@ -762,18 +764,28 @@ void EmitIndexOf_LeftToRight()
Add();
}
Call(s_spanSliceIntMethod);
Ldstr(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ?
opts.LeadingPrefix :
opts.FixedDistanceLiteral.String!);
Call(s_stringAsSpanMethod);
if (opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight)

// ...IndexOf(prefix);
if (opts.FindMode == FindNextStartingPositionMode.LeadingStrings_LeftToRight)
{
Ldc((int)StringComparison.OrdinalIgnoreCase);
Call(s_spanIndexOfSpanStringComparison);
LoadSearchValues(opts.LeadingPrefixes);
Call(s_spanIndexOfAnySearchValuesString);
}
else
{
Call(s_spanIndexOfSpan);
Ldstr(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ?
opts.LeadingPrefix :
opts.FixedDistanceLiteral.String!);
Call(s_stringAsSpanMethod);
if (opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight)
{
Ldc((int)StringComparison.OrdinalIgnoreCase);
Call(s_spanIndexOfSpanStringComparison);
}
else
{
Call(s_spanIndexOfSpan);
}
}
Stloc(i);

Expand Down Expand Up @@ -967,12 +979,12 @@ void EmitFixedSet_LeftToRight()
// a sequential walk). In order to do that search, we actually build up a set for all of the ASCII
// characters _not_ contained in the set, and then do a search for the inverse of that, which will be
// all of the target ASCII characters and all of non-ASCII.
var asciiChars = new List<char>();
for (int i = 0; i <= 0x7f; i++)
using var asciiChars = new ValueListBuilder<char>(stackalloc char[128]);
for (int i = 0; i < 128; i++)
{
if (!RegexCharClass.CharInClass((char)i, primarySet.Set))
{
asciiChars.Add((char)i);
asciiChars.Append((char)i);
}
}

Expand All @@ -984,7 +996,7 @@ void EmitFixedSet_LeftToRight()

// int i = span.
Ldloc(span);
if (asciiChars.Count == 128)
if (asciiChars.Length == 128)
{
// IndexOfAnyExceptInRange('\0', '\u007f');
Ldc(0);
Expand All @@ -994,7 +1006,7 @@ void EmitFixedSet_LeftToRight()
else
{
// IndexOfAnyExcept(searchValuesArray[...]);
LoadSearchValues(CollectionsMarshal.AsSpan(asciiChars));
LoadSearchValues(asciiChars.AsSpan().ToArray());
Call(s_spanIndexOfAnyExceptSearchValues);
}
Stloc(i);
Expand Down Expand Up @@ -6112,13 +6124,16 @@ private void EmitTimeoutCheckIfNeeded()
}

/// <summary>
/// Adds an entry in <see cref="CompiledRegexRunner._searchValues"/> for the given <paramref name="chars"/> and emits a load of that initialized value.
/// Adds an entry in <see cref="CompiledRegexRunner._searchValues"/> for the given <paramref name="values"/> and emits a load of that initialized value.
/// </summary>
private void LoadSearchValues(ReadOnlySpan<char> chars)
private void LoadSearchValues<T>(T[] values)
{
List<SearchValues<char>> list = _searchValues ??= new();
List<object> list = _searchValues ??= new();
int index = list.Count;
list.Add(SearchValues.Create(chars));
list.Add(
typeof(T) == typeof(char) ? SearchValues.Create((char[])(object)values) :
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
typeof(T) == typeof(string) ? SearchValues.Create((string[])(object)values, StringComparison.Ordinal) :
throw new UnreachableException());

// Logically do _searchValues[index], but avoid the bounds check on accessing the array,
// and cast to the known derived sealed type to enable devirtualization.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,17 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)

// We're now left-to-right only and looking for sets.

// If there are multiple leading strings, we can search for any of them.
if (compiled)
{
if (RegexPrefixAnalyzer.FindPrefixes(root) is { Length: > 1 } prefixes)
{
LeadingPrefixes = prefixes;
FindMode = FindNextStartingPositionMode.LeadingStrings_LeftToRight;
return;
}
}

// Build up a list of all of the sets that are a fixed distance from the start of the expression.
List<FixedDistanceSet>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(root, thorough: !interpreter);
Debug.Assert(fixedDistanceSets is null || fixedDistanceSets.Count != 0);
Expand Down Expand Up @@ -244,6 +255,9 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
/// <summary>Gets the leading prefix. May be an empty string.</summary>
public string LeadingPrefix { get; } = string.Empty;

/// <summary>Gets the leading prefixes. May be an empty array.</summary>
public string[] LeadingPrefixes { get; } = Array.Empty<string>();

/// <summary>When in fixed distance literal mode, gets the literal and how far it is from the start of the pattern.</summary>
public (char Char, string? String, int Distance) FixedDistanceLiteral { get; }

Expand Down Expand Up @@ -773,10 +787,15 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
return false;
}

// Not supported in the interpreter, but we could end up here for patterns so complex the compiler gave up on them.

case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
return true;

// Nothing special to look for. Just return true indicating this is a valid position to try to match.

default:
Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch);
Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch, $"Unexpected FindMode {FindMode}");
return true;
}
}
Expand Down Expand Up @@ -816,6 +835,9 @@ internal enum FindNextStartingPositionMode
/// <summary>A multi-character ordinal case-insensitive substring at the beginning of the pattern.</summary>
LeadingString_OrdinalIgnoreCase_LeftToRight,

/// <summary>Multiple ordinal case-sensitive leading prefix strings, any one of which may begin a left-to-right match.</summary>
LeadingStrings_LeftToRight,

/// <summary>A set starting the pattern.</summary>
LeadingSet_LeftToRight,
/// <summary>A set starting the right-to-left pattern.</summary>
Expand Down
Loading
Loading