From 13eae1c22070a26860bf9e2a4e237ab35af985f9 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Aug 2020 18:00:16 +0100 Subject: [PATCH] Use Intrinsics.X86 for SpanHelpers.IndexOfAny(char,char,char,char) --- .../src/System/SpanHelpers.Char.cs | 289 +++++++++++++----- 1 file changed, 216 insertions(+), 73 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs index 16b680ca53da1b..e2fdbdbfdf31e0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs @@ -692,106 +692,249 @@ public static unsafe int IndexOfAny(ref char searchSpace, char value0, char valu } [MethodImpl(MethodImplOptions.AggressiveOptimization)] - public static unsafe int IndexOfAny(ref char searchSpace, char value0, char value1, char value2, char value3, int length) + public static int IndexOfAny(ref char searchStart, char value0, char value1, char value2, char value3, int length) { Debug.Assert(length >= 0); + // If vectors are supported, we first align to Vector. + // If the searchSpan has not been fixed or pinned the GC can relocate it during the + // execution of this method, so the alignment only acts as best endeavour. + // The GC cost is likely to dominate over the misalignment that may occur after; + // so we give the GC a free hand to relocate and it is up to the caller + // whether they are operating over fixed data. + + nint offset = 0; // Use nint for arithmetic to avoid unnecessary 64->32->64 truncations + nint lengthToExamine = length; - fixed (char* pChars = &searchSpace) + if (Sse2.IsSupported) { - char* pCh = pChars; - char* pEndCh = pCh + length; - - if (Vector.IsHardwareAccelerated && length >= Vector.Count * 2) + // Avx2 branch also operates on Sse2 sizes, so check is combined. 
+ if (length >= Vector128.Count * 2) { - // Figure out how many characters to read sequentially until we are vector aligned - // This is equivalent to: - // unaligned = ((int)pCh % Unsafe.SizeOf>()) / elementsPerByte - // length = (Vector.Count - unaligned) % Vector.Count - const int elementsPerByte = sizeof(ushort) / sizeof(byte); - int unaligned = ((int)pCh & (Unsafe.SizeOf>() - 1)) / elementsPerByte; - length = (Vector.Count - unaligned) & (Vector.Count - 1); + lengthToExamine = UnalignedCountVector128(ref searchStart); } - - SequentialScan: - while (length >= 4) + } + else if (Vector.IsHardwareAccelerated) + { + if (length >= Vector.Count * 2) { - length -= 4; + lengthToExamine = UnalignedCountVector(ref searchStart); + } + } - if (pCh[0] == value0 || pCh[0] == value1 || pCh[0] == value2 || pCh[0] == value3) - goto Found; - if (pCh[1] == value0 || pCh[1] == value1 || pCh[1] == value2 || pCh[1] == value3) - goto Found1; - if (pCh[2] == value0 || pCh[2] == value1 || pCh[2] == value2 || pCh[2] == value3) - goto Found2; - if (pCh[3] == value0 || pCh[3] == value1 || pCh[3] == value2 || pCh[3] == value3) - goto Found3; + SequentialScan: + int lookUp; + while (lengthToExamine >= 4) + { + ref char current = ref Unsafe.Add(ref searchStart, offset); - pCh += 4; - } + lookUp = current; + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found; + lookUp = Unsafe.Add(ref current, 1); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found1; + lookUp = Unsafe.Add(ref current, 2); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found2; + lookUp = Unsafe.Add(ref current, 3); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found3; - while (length > 0) - { - length--; + offset += 4; + lengthToExamine -= 4; + } - if (pCh[0] == value0 || pCh[0] == value1 || pCh[0] == value2 || pCh[0] == value3) - goto Found; + while 
(lengthToExamine > 0) + { + lookUp = Unsafe.Add(ref searchStart, offset); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found; - pCh++; - } + offset += 1; + lengthToExamine -= 1; + } - // We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow - // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated. - if (Vector.IsHardwareAccelerated && pCh < pEndCh) + if (offset < length) + { + if (Sse2.IsSupported || Vector.IsHardwareAccelerated) { - // Get the highest multiple of Vector.Count that is within the search space. - // That will be how many times we iterate in the loop below. - // This is equivalent to: length = Vector.Count * ((int)(pEndCh - pCh) / Vector.Count) - length = (int)((pEndCh - pCh) & ~(Vector.Count - 1)); + goto VectorizedScan; + } + else + { + Debug.Fail("Moving to Vectorized scan when not supported"); + } + } - // Get comparison Vector - Vector values0 = new Vector(value0); - Vector values1 = new Vector(value1); - Vector values2 = new Vector(value2); - Vector values3 = new Vector(value3); + NotFound: + return -1; + Found3: + return (int)(offset + 3); + Found2: + return (int)(offset + 2); + Found1: + return (int)(offset + 1); + Found: + return (int)offset; - while (length > 0) + VectorizedScan: + // We get past SequentialScan only if IsHardwareAccelerated or intrinsic .IsSupported is true. However, we still have the redundant check to allow + // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated. 
+ if (Avx2.IsSupported) + { + lengthToExamine = GetCharVector256SpanLength(offset, length); + if (lengthToExamine > 0) + { + Debug.Assert(length - offset >= Vector256.Count); + Vector256 values0 = Vector256.Create(value0); + Vector256 values1 = Vector256.Create(value1); + Vector256 values2 = Vector256.Create(value2); + Vector256 values3 = Vector256.Create(value3); + do { - // Using Unsafe.Read instead of ReadUnaligned since the search space is pinned and pCh is always vector aligned - Debug.Assert(((int)pCh & (Unsafe.SizeOf>() - 1)) == 0); - Vector vData = Unsafe.Read>(pCh); - var vMatches = Vector.BitwiseOr( - Vector.BitwiseOr( - Vector.BitwiseOr(Vector.Equals(vData, values0), Vector.Equals(vData, values1)), - Vector.Equals(vData, values2)), - Vector.Equals(vData, values3)); - - if (Vector.Zero.Equals(vMatches)) + Vector256 search = LoadVector256(ref searchStart, offset); + // We perform the Or at non-Vector level as we are using the maximum number of non-preserved registers, + // and more causes them first to be pushed to stack and then popped on exit to preserve their values. + int matches = Avx2.MoveMask(Avx2.CompareEqual(values0, search).AsByte()); + // Bitwise Or to combine the flagged matches for the second, third and fourth values to our match flags + matches |= Avx2.MoveMask(Avx2.CompareEqual(values1, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values2, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values3, search).AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. 
+ if (matches == 0) { - pCh += Vector.Count; - length -= Vector.Count; + // Zero flags set so no matches + offset += Vector256.Count; + lengthToExamine -= Vector256.Count; continue; } - // Find offset of first match - return (int)(pCh - pChars) + LocateFirstFoundChar(vMatches); + + // Find bitflag offset of first match and add to current offset, + // flags are in bytes so divide for chars + return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } while (lengthToExamine > 0); + } + + lengthToExamine = GetCharVector128SpanLength(offset, length); + if (lengthToExamine > 0) + { + Debug.Assert(length - offset >= Vector128.Count); + + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + Vector128 values2 = Vector128.Create(value2); + Vector128 values3 = Vector128.Create(value3); + + Vector128 search = LoadVector128(ref searchStart, offset); + + // We can do the ORs in vectors here as we don't need to keep the values as we are not looping + Vector128 matches0 = Sse2.CompareEqual(values0, search); + Vector128 matches1 = Sse2.CompareEqual(values1, search); + Vector128 matches2 = Sse2.CompareEqual(values2, search); + Vector128 matches3 = Sse2.CompareEqual(values3, search); + + Vector128 matches01 = Sse2.Or(matches0, matches1); + Vector128 matches23 = Sse2.Or(matches2, matches3); + + // Same method as above + int matches = Sse2.MoveMask(Sse2.Or(matches01, matches23).AsByte()); + if (matches == 0) + { + // Zero flags set so no matches + offset += Vector128.Count; + // Don't need to change lengthToExamine here as we don't use its current value again. 
+ } + else + { + // Find bitflag offset of first match and add to current offset, + // flags are in bytes so divide for chars + return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char))); } + } - if (pCh < pEndCh) + lengthToExamine = length - offset; + if (lengthToExamine > 0) + { + goto SequentialScan; + } + } + else if (Sse2.IsSupported) + { + Debug.Assert(length - offset >= Vector128.Count); + lengthToExamine = GetCharVector128SpanLength(offset, length); + Debug.Assert(lengthToExamine > 0); + + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + Vector128 values2 = Vector128.Create(value2); + Vector128 values3 = Vector128.Create(value3); + do + { + Vector128 search = LoadVector128(ref searchStart, offset); + + // Same method as above + int matches = Sse2.MoveMask(Sse2.CompareEqual(values0, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values1, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values2, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values3, search).AsByte()); + if (matches == 0) { - length = (int)(pEndCh - pCh); - goto SequentialScan; + // Zero flags set so no matches + offset += Vector128.Count; + lengthToExamine -= Vector128.Count; + continue; } + + // Find bitflag offset of first match and add to current offset, + // flags are in bytes so divide for chars + return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } while (lengthToExamine > 0); + + lengthToExamine = length - offset; + if (lengthToExamine > 0) + { + goto SequentialScan; } + } + else if (Vector.IsHardwareAccelerated) + { + Debug.Assert(length - offset >= Vector.Count); + lengthToExamine = GetCharVectorSpanLength(offset, length); + Debug.Assert(lengthToExamine > 0); - return -1; - Found3: - pCh++; - Found2: - pCh++; - Found1: - pCh++; - Found: - return (int)(pCh - pChars); + Vector values0 = new Vector(value0); + Vector values1 = new 
Vector(value1); + Vector values2 = new Vector(value2); + Vector values3 = new Vector(value3); + + do + { + Vector search = LoadVector(ref searchStart, offset); + var matches = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.BitwiseOr(Vector.Equals(search, values0), Vector.Equals(search, values1)), + Vector.Equals(search, values2)), + Vector.Equals(search, values3)); + if (Vector.Zero.Equals(matches)) + { + offset += Vector.Count; + lengthToExamine -= Vector.Count; + continue; + } + + // Find offset of first match + return (int)(offset + LocateFirstFoundChar(matches)); + } while (lengthToExamine > 0); + + lengthToExamine = length - offset; + if (lengthToExamine > 0) + { + goto SequentialScan; + } } + + goto NotFound; } [MethodImpl(MethodImplOptions.AggressiveOptimization)]