Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.

Commit

Permalink
Intrinsicify SpanHelpers.IndexOfAny(char,char)
Browse files Browse the repository at this point in the history
  • Loading branch information
benaadams committed Nov 9, 2019
1 parent ad68763 commit 9424750
Showing 1 changed file with 207 additions and 68 deletions.
275 changes: 207 additions & 68 deletions src/System.Private.CoreLib/shared/System/SpanHelpers.Char.cs
Original file line number Diff line number Diff line change
Expand Up @@ -462,101 +462,240 @@ public static unsafe int IndexOf(ref char searchSpace, char value, int length)
}

[MethodImpl(MethodImplOptions.AggressiveOptimization)]
public static unsafe int IndexOfAny(ref char searchSpace, char value0, char value1, int length)
public static int IndexOfAny(ref char searchStart, char value0, char value1, int length)
{
Debug.Assert(length >= 0);
// If vectors are supported, we first align to Vector.
// If the searchSpan has not been fixed or pinned, the GC can relocate it during the
// execution of this method, so the alignment is only a best effort.
// The GC cost is likely to dominate over the misalignment that may occur after;
// so we give the GC a free hand to relocate and it is up to the caller
// whether they are operating over fixed data.

fixed (char* pChars = &searchSpace)
nint offset = 0;
nint lengthToExamine = length;
if (Sse2.IsSupported)
{
char* pCh = pChars;
char* pEndCh = pCh + length;

if (Vector.IsHardwareAccelerated && length >= Vector<ushort>.Count * 2)
// Avx2 branch also operates on Sse2 sizes, so check is combined.
// Needs to be double length to allow us to align the data first.
if (length >= Vector128<byte>.Count * 2)
{
// Figure out how many characters to read sequentially until we are vector aligned
// This is equivalent to:
// unaligned = ((int)pCh % Unsafe.SizeOf<Vector<ushort>>()) / elementsPerByte
// length = (Vector<ushort>.Count - unaligned) % Vector<ushort>.Count
const int elementsPerByte = sizeof(ushort) / sizeof(byte);
int unaligned = ((int)pCh & (Unsafe.SizeOf<Vector<ushort>>() - 1)) / elementsPerByte;
length = (Vector<ushort>.Count - unaligned) & (Vector<ushort>.Count - 1);
lengthToExamine = UnalignedCountVector128(ref searchStart);
}

SequentialScan:
while (length >= 4)
}
else if (Vector.IsHardwareAccelerated)
{
// Needs to be double length to allow us to align the data first.
if (length >= Vector<ushort>.Count * 2)
{
length -= 4;
lengthToExamine = UnalignedCountVector(ref searchStart);
}
}

if (pCh[0] == value0 || pCh[0] == value1)
goto Found;
if (pCh[1] == value0 || pCh[1] == value1)
goto Found1;
if (pCh[2] == value0 || pCh[2] == value1)
goto Found2;
if (pCh[3] == value0 || pCh[3] == value1)
goto Found3;
SequentialScan:
// In the non-vector case lengthToExamine is the total length.
// In the vector case lengthToExamine is first the number of elements
// to scan in order to reach Vector alignment; then, in a second pass after
// the vectorized loops, it is the remaining data that is shorter than a Vector length.
int lookUp;
while (lengthToExamine >= 4)
{
ref char current = ref Add(ref searchStart, offset);

pCh += 4;
}
lookUp = current;
if (value0 == lookUp || value1 == lookUp)
goto Found;
lookUp = Add(ref current, 1);
if (value0 == lookUp || value1 == lookUp)
goto Found1;
lookUp = Add(ref current, 2);
if (value0 == lookUp || value1 == lookUp)
goto Found2;
lookUp = Add(ref current, 3);
if (value0 == lookUp || value1 == lookUp)
goto Found3;

while (length > 0)
{
length--;
offset += 4;
lengthToExamine -= 4;
}

if (pCh[0] == value0 || pCh[0] == value1)
goto Found;
while (lengthToExamine > 0)
{
lookUp = Add(ref searchStart, offset);
if (value0 == lookUp || value1 == lookUp)
goto Found;

pCh++;
}
offset += 1;
lengthToExamine -= 1;
}

// We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow
// the JIT to see that the code is unreachable and eliminate it when the platform is not hardware accelerated.
if (Vector.IsHardwareAccelerated && pCh < pEndCh)
if (offset < length)
{
if (Sse2.IsSupported || Vector.IsHardwareAccelerated)
{
// Get the highest multiple of Vector<ushort>.Count that is within the search space.
// That will be how many times we iterate in the loop below.
// This is equivalent to: length = Vector<ushort>.Count * ((int)(pEndCh - pCh) / Vector<ushort>.Count)
length = (int)((pEndCh - pCh) & ~(Vector<ushort>.Count - 1));
goto VectorizedScan;
}
else
{
Debug.Fail("Moving to Vectorized scan when not supported");
}
}

// Get comparison Vector
Vector<ushort> values0 = new Vector<ushort>(value0);
Vector<ushort> values1 = new Vector<ushort>(value1);
NotFound:
return -1;
Found3:
return (int)(offset + 3);
Found2:
return (int)(offset + 2);
Found1:
return (int)(offset + 1);
Found:
return (int)offset;

while (length > 0)
VectorizedScan:
// We get past SequentialScan only if IsHardwareAccelerated or intrinsic .IsSupported is true. However, we still have the redundant check to allow
// the JIT to see that the code is unreachable and eliminate it when the platform is not hardware accelerated.
if (Avx2.IsSupported)
{
lengthToExamine = GetCharVector256SpanLength(offset, length);
if (lengthToExamine > 0)
{
Vector256<ushort> values0 = Vector256.Create(value0);
Vector256<ushort> values1 = Vector256.Create(value1);
do
{
// Using Unsafe.Read instead of ReadUnaligned since the search space is pinned and pCh is always vector aligned
Debug.Assert(((int)pCh & (Unsafe.SizeOf<Vector<ushort>>() - 1)) == 0);
Vector<ushort> vData = Unsafe.Read<Vector<ushort>>(pCh);
var vMatches = Vector.BitwiseOr(
Vector.Equals(vData, values0),
Vector.Equals(vData, values1));
if (Vector<ushort>.Zero.Equals(vMatches))
Debug.Assert(length - offset >= Vector256<ushort>.Count);

Vector256<ushort> search = LoadVector256(ref searchStart, offset);
// Bitwise Or to combine the flagged matches for the second value with our match flags
int matches = Avx2.MoveMask(
Avx2.Or(
Avx2.CompareEqual(values0, search),
Avx2.CompareEqual(values1, search))
.AsByte());
// Note that MoveMask has converted the equal vector elements into a set of bit flags
// (one flag per byte, since the comparison result is viewed as bytes),
// so the bit position in 'matches' corresponds to the byte offset within the vector.
if (matches == 0)
{
pCh += Vector<ushort>.Count;
length -= Vector<ushort>.Count;
// Zero flags set so no matches
offset += Vector256<ushort>.Count;
lengthToExamine -= Vector256<ushort>.Count;
continue;
}
// Find offset of first match
return (int)(pCh - pChars) + LocateFirstFoundChar(vMatches);

// Find bitflag offset of first match and add to current offset,
// flags are in bytes so divide for chars
return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
} while (lengthToExamine > 0);
}

lengthToExamine = GetCharVector128SpanLength(offset, length);
if (lengthToExamine > 0)
{
Debug.Assert(length - offset >= Vector128<ushort>.Count);

Vector128<ushort> values0 = Vector128.Create(value0);
Vector128<ushort> values1 = Vector128.Create(value1);
Vector128<ushort> search = LoadVector128(ref searchStart, offset);

// Same method as above
int matches = Sse2.MoveMask(
Sse2.Or(
Sse2.CompareEqual(values0, search),
Sse2.CompareEqual(values1, search))
.AsByte());
if (matches == 0)
{
// Zero flags set so no matches
offset += Vector128<ushort>.Count;
// Don't need to change lengthToExamine here as we don't use its current value again.
}
else
{
// Find bitflag offset of first match and add to current offset,
// flags are in bytes so divide for chars
return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
}
}

if (pCh < pEndCh)
lengthToExamine = length - offset;
if (lengthToExamine > 0)
{
goto SequentialScan;
}
}
else if (Sse2.IsSupported)
{
lengthToExamine = GetCharVector128SpanLength(offset, length);
Debug.Assert(lengthToExamine > 0);

Vector128<ushort> values0 = Vector128.Create(value0);
Vector128<ushort> values1 = Vector128.Create(value1);
do
{
Debug.Assert(length - offset >= Vector128<ushort>.Count);

Vector128<ushort> search = LoadVector128(ref searchStart, offset);

// Same method as above
int matches = Sse2.MoveMask(
Sse2.Or(
Sse2.CompareEqual(values0, search),
Sse2.CompareEqual(values1, search))
.AsByte());
if (matches == 0)
{
length = (int)(pEndCh - pCh);
goto SequentialScan;
// Zero flags set so no matches
offset += Vector128<ushort>.Count;
lengthToExamine -= Vector128<ushort>.Count;
continue;
}

// Find bitflag offset of first match and add to current offset,
// flags are in bytes so divide for chars
return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
} while (lengthToExamine > 0);

lengthToExamine = length - offset;
if (lengthToExamine > 0)
{
goto SequentialScan;
}
}
else if (Vector.IsHardwareAccelerated)
{
lengthToExamine = GetCharVectorSpanLength(offset, length);
Debug.Assert(lengthToExamine > 0);

return -1;
Found3:
pCh++;
Found2:
pCh++;
Found1:
pCh++;
Found:
return (int)(pCh - pChars);
Vector<ushort> values0 = new Vector<ushort>(value0);
Vector<ushort> values1 = new Vector<ushort>(value1);
do
{
Debug.Assert(length - offset >= Vector<ushort>.Count);

Vector<ushort> search = LoadVector(ref searchStart, offset);
var matches = Vector.BitwiseOr(
Vector.Equals(search, values0),
Vector.Equals(search, values1));
if (Vector<ushort>.Zero.Equals(matches))
{
offset += Vector<ushort>.Count;
lengthToExamine -= Vector<ushort>.Count;
continue;
}

// Find offset of first match
return (int)(offset + LocateFirstFoundChar(matches));
} while (lengthToExamine > 0);

lengthToExamine = length - offset;
if (lengthToExamine > 0)
{
goto SequentialScan;
}
}

goto NotFound;
}

[MethodImpl(MethodImplOptions.AggressiveOptimization)]
Expand Down

0 comments on commit 9424750

Please sign in to comment.