Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Regex performance (mainly interpreted) #449

Merged
merged 6 commits into from
Dec 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,10 @@ internal void Reset(Regex regex, string text, int textbeg, int textend, int text
_textend = textend;
_textstart = textstart;

for (int i = 0; i < _matchcount.Length; i++)
int[] matchcount = _matchcount;
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
for (int i = 0; i < matchcount.Length; i++)
{
_matchcount[i] = 0;
matchcount[i] = 0;
}

_balancing = false;
Expand Down Expand Up @@ -170,21 +171,23 @@ public static Match Synchronized(Match inner)
internal void AddMatch(int cap, int start, int len)
{
    // Make sure the group has a backing array before anything indexes into it.
    _matches[cap] ??= new int[2];

    // Read each field once; everything below works on the locals.
    int[][] matches = _matches;
    int[] matchcount = _matchcount;
    int capcount = matchcount[cap];

    // Every capture occupies two consecutive ints (start, len). Grow the array
    // when there is no room left for one more pair.
    if (capcount * 2 + 2 > matches[cap].Length)
    {
        int[] oldmatches = matches[cap];
        int[] newmatches = new int[capcount * 8];
        for (int j = 0; j < capcount * 2; j++)
        {
            newmatches[j] = oldmatches[j];
        }

        matches[cap] = newmatches;
    }

    // Append the new pair and bump the capture count for this group.
    matches[cap][capcount * 2] = start;
    matches[cap][capcount * 2 + 1] = len;
    matchcount[cap] = capcount + 1;
}

/*
Expand All @@ -204,15 +207,16 @@ internal void BalanceMatch(int cap)

// first see if it is negative, and therefore is a reference to the next available
// capture group for balancing. If it is, we'll reset target to point to that capture.
if (_matches[cap][target] < 0)
target = -3 - _matches[cap][target];
int[][] matches = _matches;
if (matches[cap][target] < 0)
target = -3 - matches[cap][target];

// move back to the previous capture
target -= 2;

// if the previous capture is a reference, just copy that reference to the end. Otherwise, point to it.
if (target >= 0 && _matches[cap][target] < 0)
AddMatch(cap, _matches[cap][target], _matches[cap][target + 1]);
if (target >= 0 && matches[cap][target] < 0)
AddMatch(cap, matches[cap][target], matches[cap][target + 1]);
else
AddMatch(cap, -3 - target, -4 - target /* == -3 - (target + 1) */ );
}
Expand All @@ -230,43 +234,52 @@ internal void RemoveMatch(int cap)
/// </summary>
internal bool IsMatched(int cap)
{
    int[] matchcount = _matchcount;

    // The unsigned comparison rejects both negative values and values past the end
    // of the array in a single test, so an out-of-range cap yields false instead of
    // reaching the indexers below.
    // NOTE(review): (-3 + 1) is the sentinel the last length slot is compared against;
    // its exact meaning is defined outside this view - confirm before changing it.
    return (uint)cap < (uint)matchcount.Length
        && matchcount[cap] > 0
        && _matches[cap][matchcount[cap] * 2 - 1] != (-3 + 1);
}

/// <summary>
/// Returns the index of the last specified matched group by capnum
/// </summary>
internal int MatchIndex(int cap)
{
    int[][] matches = _matches;

    // The start of the last capture lives in the first slot of the last (start, len) pair.
    int i = matches[cap][_matchcount[cap] * 2 - 2];
    if (i >= 0)
    {
        return i;
    }

    // A negative entry is not a position in the text: it encodes another slot of the
    // same array as (-3 - slot), which is the form BalanceMatch stores. Follow it.
    return matches[cap][-3 - i];
}

/// <summary>
/// Returns the length of the last specified matched group by capnum
/// </summary>
internal int MatchLength(int cap)
{
    int[][] matches = _matches;

    // The length of the last capture lives in the second slot of the last (start, len) pair.
    int i = matches[cap][_matchcount[cap] * 2 - 1];
    if (i >= 0)
    {
        return i;
    }

    // A negative entry encodes another slot of the same array as (-3 - slot),
    // which is the form BalanceMatch stores. Follow it to the real value.
    return matches[cap][-3 - i];
}

/// <summary>
/// Tidy the match so that it can be used as an immutable result
/// </summary>
internal void Tidy(int textpos)
{
int[] interval = _matches[0];
int[][] matches = _matches;

int[] interval = matches[0];
Index = interval[0];
Length = interval[1];
_textpos = textpos;
_capcount = _matchcount[0];

int[] matchcount = _matchcount;
_capcount = matchcount[0];

if (_balancing)
{
Expand All @@ -276,13 +289,13 @@ internal void Tidy(int textpos)
// until we find a balance captures. Then we check each subsequent entry. If it's a balance
// capture (it's negative), we decrement j. If it's a real capture, we increment j and copy
// it down to the last free position.
for (int cap = 0; cap < _matchcount.Length; cap++)
for (int cap = 0; cap < matchcount.Length; cap++)
{
int limit;
int[] matcharray;

limit = _matchcount[cap] * 2;
matcharray = _matches[cap];
limit = matchcount[cap] * 2;
matcharray = matches[cap];

int i = 0;
int j;
Expand Down Expand Up @@ -310,7 +323,7 @@ internal void Tidy(int textpos)
}
}

_matchcount[cap] = j / 2;
matchcount[cap] = j / 2;
}

_balancing = false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -205,23 +205,6 @@ public RegexBoyerMoore(string pattern, bool caseInsensitive, bool rightToLeft, C
}
}

/// <summary>
/// Checks whether Pattern occurs in <paramref name="text"/> starting at <paramref name="index"/>,
/// using an ordinal comparison, or a culture-aware one when CaseInsensitive is set.
/// </summary>
private bool MatchPattern(string text, int index)
{
    // Case-sensitive: plain ordinal comparison of Pattern.Length characters.
    if (!CaseInsensitive)
    {
        return string.CompareOrdinal(Pattern, 0, text, index, Pattern.Length) == 0;
    }

    // Case-insensitive: bail out when fewer characters remain than the pattern needs.
    int remaining = text.Length - index;
    if (remaining < Pattern.Length)
    {
        return false;
    }

    int comparison = string.Compare(Pattern, 0, text, index, Pattern.Length, CaseInsensitive, _culture);
    return comparison == 0;
}

/// <summary>
/// When a regex is anchored, we can do a quick IsMatch test instead of a Scan
/// </summary>
Expand All @@ -231,16 +214,21 @@ public bool IsMatch(string text, int index, int beglimit, int endlimit)
{
if (index < beglimit || endlimit - index < Pattern.Length)
return false;

return MatchPattern(text, index);
}
else
{
if (index > endlimit || index - beglimit < Pattern.Length)
return false;

return MatchPattern(text, index - Pattern.Length);
index -= Pattern.Length;
}

if (CaseInsensitive)
{
return string.Compare(Pattern, 0, text, index, Pattern.Length, ignoreCase: true, _culture) == 0;
}

return Pattern.AsSpan().SequenceEqual(text.AsSpan(index, Pattern.Length));
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.Threading;

namespace System.Text.RegularExpressions
{
Expand Down Expand Up @@ -732,7 +733,7 @@ public static string ConvertOldStringsToClass(string set, string category)
/// </summary>
public static char SingletonChar(string set)
{
// NOTE(review): two assertions are present. The first accepts a set that satisfies
// either IsSingleton or IsSingletonInverse; the second requires IsSingletonInverse.
// Any set passing the second also passes the first, so the second is the effective
// check in debug builds - confirm which precondition callers are meant to satisfy.
Debug.Assert(IsSingleton(set) || IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");
Debug.Assert(IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");
// The character is read from the position given by SetStartIndex.
return set[SetStartIndex];
}

Expand All @@ -747,14 +748,6 @@ public static bool IsEmpty(string charClass) =>
!IsNegated(charClass) &&
!IsSubtraction(charClass);

/// <summary>Determines whether the set describes exactly one character.</summary>
public static bool IsSingleton(string set)
{
    // The category-length slot must be zero and the set-length slot must be two.
    if (set[CategoryLengthIndex] != 0 || set[SetLengthIndex] != 2)
    {
        return false;
    }

    // Negated sets and sets with a subtraction do not qualify.
    if (IsNegated(set) || IsSubtraction(set))
    {
        return false;
    }

    // Either the first character is LastChar, or the character stored right after it
    // is exactly one greater.
    return set[SetStartIndex] == LastChar
        || set[SetStartIndex] + 1 == set[SetStartIndex + 1];
}

public static bool IsSingletonInverse(string set) =>
set[CategoryLengthIndex] == 0 &&
set[SetLengthIndex] == 2 &&
Expand Down Expand Up @@ -823,6 +816,68 @@ public static bool IsWordChar(char ch)
}
}

/// <summary>
/// Determines whether <paramref name="ch"/> is in the character class <paramref name="set"/>,
/// caching the answer for characters below 128 in <paramref name="asciiResultCache"/>.
/// </summary>
/// <param name="ch">The character to test.</param>
/// <param name="set">The character class string to test against.</param>
/// <param name="asciiResultCache">
/// Cache associated with <paramref name="set"/>. May be null on entry; it is allocated on the first
/// lookup of a character below 128. Characters 128 and above never touch it.
/// </param>
/// <returns><c>true</c> if the character is in the class; otherwise <c>false</c>.</returns>
public static bool CharInClass(char ch, string set, ref int[]? asciiResultCache)
{
    // The int[] contains 8 ints, or 256 bits. These are laid out as pairs, where the first bit ("known") in the pair
    // says whether the second bit ("value") in the pair has already been computed. Once a value is computed, it's never
    // changed, so since Int32s are written/read atomically, we can trust the value bit if we see that the known bit
    // has been set. If the known bit hasn't been set, then we proceed to look it up, and then swap in the result.
    const int CacheArrayLength = 8;
    Debug.Assert(asciiResultCache is null || asciiResultCache.Length == CacheArrayLength, "set lookup should be able to store two bits for each of the first 128 characters");

    if (ch < 128)
    {
        // Lazily-initialize the cache for this set. CompareExchange only stores the new
        // array if the slot is still null, so a concurrently published array is kept.
        if (asciiResultCache is null)
        {
            Interlocked.CompareExchange(ref asciiResultCache, new int[CacheArrayLength], null);
        }

        // Determine which int in the lookup array contains the known and value bits for this character,
        // and compute their bit numbers: 16 characters per int, two adjacent bits per character.
        ref int slot = ref asciiResultCache[ch >> 4];
        int knownBit = 1 << ((ch & 0xF) << 1);
        int valueBit = knownBit << 1;

        // If the value for this bit has already been computed, use it.
        int current = slot;
        if ((current & knownBit) != 0)
        {
            return (current & valueBit) != 0;
        }

        // (After warm-up, we should find ourselves rarely getting here.)

        // Otherwise, compute it normally.
        bool isInClass = CharInClass(ch, set);

        // Determine which bits to write back to the array.
        int bitsToSet = knownBit;
        if (isInClass)
        {
            bitsToSet |= valueBit;
        }

        // "or" the bits back in a thread-safe manner: retry with the freshly observed
        // value whenever the slot changed between the read and the exchange.
        while (true)
        {
            int oldValue = Interlocked.CompareExchange(ref slot, current | bitsToSet, current);
            if (oldValue == current)
            {
                break;
            }

            current = oldValue;
        }

        // Return the computed value.
        return isInClass;
    }

    // Non-ASCII. Fall back to computing the answer.
    return CharInClassRecursive(ch, set, 0);
}

/// <summary>
/// Determines whether <paramref name="ch"/> is in the character class <paramref name="set"/>
/// by evaluating the set directly, starting at offset 0.
/// </summary>
public static bool CharInClass(char ch, string set)
{
    return CharInClassRecursive(ch, set, 0);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;

namespace System.Text.RegularExpressions
{
Expand Down Expand Up @@ -91,10 +90,12 @@ internal sealed class RegexCode

public readonly int[] Codes; // the code
public readonly string[] Strings; // the string/set table
public readonly int[]?[] StringsAsciiLookup; // the ASCII lookup table optimization for the sets in Strings
public readonly int TrackCount; // how many instructions use backtracking
public readonly Hashtable? Caps; // mapping of user group numbers -> impl group slots
public readonly int CapSize; // number of impl group slots
public readonly RegexPrefix? FCPrefix; // the set of candidate first characters (may be null)
public int[]? FCPrefixAsciiLookup; // the ASCII lookup table optimization for the set of candidate first characters if there are any
public readonly RegexBoyerMoore? BMPrefix; // the fixed prefix string as a Boyer-Moore machine (may be null)
public readonly int Anchors; // the set of zero-length start anchors (RegexFCD.Bol, etc)
public readonly bool RightToLeft; // true if right to left
Expand All @@ -109,6 +110,7 @@ public RegexCode(int[] codes, List<string> stringlist, int trackcount,

Codes = codes;
Strings = stringlist.ToArray();
StringsAsciiLookup = new int[Strings.Length][];
TrackCount = trackcount;
Caps = caps;
CapSize = capsize;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1163,29 +1163,14 @@ protected void GenerateFindFirstChar()
CallToLower();
}

if (!RegexCharClass.IsSingleton(_fcPrefix.GetValueOrDefault().Prefix))
{
EmitCallCharInClass(_fcPrefix.GetValueOrDefault().Prefix, charInClassV);
BrtrueFar(l2);
}
else
{
Ldc(RegexCharClass.SingletonChar(_fcPrefix.GetValueOrDefault().Prefix));
Beq(l2);
}
EmitCallCharInClass(_fcPrefix.GetValueOrDefault().Prefix, charInClassV);
BrtrueFar(l2);

MarkLabel(l5);

Ldloc(cV);
Ldc(0);
if (!RegexCharClass.IsSingleton(_fcPrefix.GetValueOrDefault().Prefix))
{
BgtFar(l1);
}
else
{
Bgt(l1);
}
BgtFar(l1);

Ldc(0);
BrFar(l3);
Expand Down
Loading