diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems index e4d61cc09464a..41da05793059e 100644 --- a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems +++ b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems @@ -2457,4 +2457,7 @@ + + + diff --git a/src/libraries/System.Private.CoreLib/src/System/MemoryExtensions.cs b/src/libraries/System.Private.CoreLib/src/System/MemoryExtensions.cs index 5ae66529eb20d..fd6229f70c1b0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/MemoryExtensions.cs +++ b/src/libraries/System.Private.CoreLib/src/System/MemoryExtensions.cs @@ -1617,6 +1617,7 @@ ref Unsafe.As(ref MemoryMarshal.GetReference(span)), Unsafe.Add(ref valueRef, 2), span.Length); +#if !MONO // We don't have a mono overload for 4 values case 4: return SpanHelpers.LastIndexOfAnyValueType( ref spanRef, @@ -1625,6 +1626,7 @@ ref Unsafe.As(ref MemoryMarshal.GetReference(span)), Unsafe.Add(ref valueRef, 2), Unsafe.Add(ref valueRef, 3), span.Length); +#endif default: return LastIndexOfAnyProbabilistic(ref Unsafe.As(ref spanRef), span.Length, ref Unsafe.As(ref valueRef), values.Length); diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Mono.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Mono.cs new file mode 100644 index 0000000000000..d6a7f09e7465b --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Mono.cs @@ -0,0 +1,2697 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; + +namespace System +{ + internal static partial class SpanHelpers // helpers used by Mono + { + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + internal static unsafe int IndexOfValueType(ref byte searchSpace, byte value, int length) + { + Debug.Assert(length >= 0); + + uint uValue = value; // Use uint for comparisons to avoid unnecessary 8->32 extensions + nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations + nuint lengthToExamine = (nuint)(uint)length; + + if (Vector128.IsHardwareAccelerated) + { + // Avx2 branch also operates on Sse2 sizes, so check is combined. + if (length >= Vector128.Count * 2) + { + lengthToExamine = UnalignedCountVector128(ref searchSpace); + } + } + else if (Vector.IsHardwareAccelerated) + { + if (length >= Vector.Count * 2) + { + lengthToExamine = UnalignedCountVector(ref searchSpace); + } + } + SequentialScan: + while (lengthToExamine >= 8) + { + lengthToExamine -= 8; + + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset)) + goto Found; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 1)) + goto Found1; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 2)) + goto Found2; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 3)) + goto Found3; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 4)) + goto Found4; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 5)) + goto Found5; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 6)) + goto Found6; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 7)) + goto Found7; + + offset += 8; + } + + if (lengthToExamine >= 4) + { + lengthToExamine -= 4; + + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset)) + goto Found; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 1)) + goto Found1; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 2)) + goto Found2; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 3)) + goto Found3; + + offset += 4; + } + + while (lengthToExamine > 0) + { + lengthToExamine -= 1; + + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset)) + goto Found; + + offset += 1; + } + + // We get past SequentialScan only if IsHardwareAccelerated is true; and remain length is greater than Vector length. + // However, we still have the redundant check to allow the JIT to see that the code is unreachable and eliminate it when the platform does not + // have hardware accelerated. After processing Vector lengths we return to SequentialScan to finish any remaining. + if (Vector256.IsHardwareAccelerated) + { + if (offset < (nuint)(uint)length) + { + if ((((nuint)(uint)Unsafe.AsPointer(ref searchSpace) + offset) & (nuint)(Vector256.Count - 1)) != 0) + { + // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches + // with no upper bound e.g. String.strlen. + // Start with a check on Vector128 to align to Vector256, before moving to processing Vector256. + // This ensures we do not fault across memory pages while searching for an end of string. + Vector128 values = Vector128.Create(value); + Vector128 search = Vector128.LoadUnsafe(ref searchSpace, offset); + + // Same method as below + uint matches = Vector128.Equals(values, search).ExtractMostSignificantBits(); + if (matches == 0) + { + // Zero flags set so no matches + offset += (nuint)Vector128.Count; + } + else + { + // Find bitflag offset of first match and add to current offset + return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); + } + } + + lengthToExamine = GetByteVector256SpanLength(offset, length); + if (lengthToExamine > offset) + { + Vector256 values = Vector256.Create(value); + do + { + Vector256 search = Vector256.LoadUnsafe(ref searchSpace, offset); + uint matches = Vector256.Equals(values, search).ExtractMostSignificantBits(); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // Zero flags set so no matches + offset += (nuint)Vector256.Count; + continue; + } + + // Find bitflag offset of first match and add to current offset + return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); + } while (lengthToExamine > offset); + } + + lengthToExamine = GetByteVector128SpanLength(offset, length); + if (lengthToExamine > offset) + { + Vector128 values = Vector128.Create(value); + Vector128 search = Vector128.LoadUnsafe(ref searchSpace, offset); + + // Same method as above + uint matches = Vector128.Equals(values, search).ExtractMostSignificantBits(); + if (matches == 0) + { + // Zero flags set so no matches + offset += (nuint)Vector128.Count; + } + else + { + // Find bitflag offset of first match and add to current offset + return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); + } + } + + if (offset < (nuint)(uint)length) + { + lengthToExamine = ((nuint)(uint)length - offset); + goto SequentialScan; + } + } + } + else if (Vector128.IsHardwareAccelerated) + { + if (offset < (nuint)(uint)length) + { + lengthToExamine = GetByteVector128SpanLength(offset, length); + + Vector128 values = Vector128.Create(value); + while (lengthToExamine > offset) + { + Vector128 search = Vector128.LoadUnsafe(ref searchSpace, offset); + + // Same method as above + Vector128 compareResult = Vector128.Equals(values, search); + if (compareResult == Vector128.Zero) + { + // Zero flags set so no matches + offset += (nuint)Vector128.Count; + continue; + } + + // Find bitflag offset of first match and add to current offset + uint matches = compareResult.ExtractMostSignificantBits(); + return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); + } + + if (offset < (nuint)(uint)length) + { + lengthToExamine = ((nuint)(uint)length - offset); + goto SequentialScan; + } + } + } + else if (Vector.IsHardwareAccelerated) + { + if (offset < (nuint)(uint)length) + { + lengthToExamine = GetByteVectorSpanLength(offset, length); + + Vector values = new Vector(value); + + while (lengthToExamine > offset) + { + var matches = Vector.Equals(values, LoadVector(ref searchSpace, offset)); + if (Vector.Zero.Equals(matches)) + { + offset += (nuint)Vector.Count; + continue; + } + + // Find offset of first match and add to current offset + return (int)offset + LocateFirstFoundByte(matches); + } + + if (offset < (nuint)(uint)length) + { + lengthToExamine = ((nuint)(uint)length - offset); + goto SequentialScan; + } + } + } + return -1; + Found: // Workaround for https://github.com/dotnet/runtime/issues/8795 + return (int)offset; + Found1: + return (int)(offset + 1); + Found2: + return (int)(offset + 2); + Found3: + return (int)(offset + 3); + Found4: + return (int)(offset + 4); + Found5: + return (int)(offset + 5); + Found6: + return (int)(offset + 6); + Found7: + return (int)(offset + 7); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe int IndexOfValueType(ref short searchSpace, short value, int length) + => IndexOfChar(ref Unsafe.As(ref searchSpace), Unsafe.As(ref value), length); + + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + internal static unsafe int IndexOfChar(ref char searchSpace, char value, int length) + { + Debug.Assert(length >= 0); + + nint offset = 0; + nint lengthToExamine = length; + + if (((int)Unsafe.AsPointer(ref searchSpace) & 1) != 0) + { + // Input isn't char aligned, we won't be able to align it to a Vector + } + else if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported) + { + // Avx2 branch also operates on Sse2 sizes, so check is combined. + // Needs to be double length to allow us to align the data first. + if (length >= Vector128.Count * 2) + { + lengthToExamine = UnalignedCountVector128(ref searchSpace); + } + } + else if (Vector.IsHardwareAccelerated) + { + // Needs to be double length to allow us to align the data first. + if (length >= Vector.Count * 2) + { + lengthToExamine = UnalignedCountVector(ref searchSpace); + } + } + + SequentialScan: + // In the non-vector case lengthToExamine is the total length. + // In the vector case lengthToExamine first aligns to Vector, + // then in a second pass after the Vector lengths is the + // remaining data that is shorter than a Vector length. + while (lengthToExamine >= 4) + { + ref char current = ref Unsafe.Add(ref searchSpace, offset); + + if (value == current) + goto Found; + if (value == Unsafe.Add(ref current, 1)) + goto Found1; + if (value == Unsafe.Add(ref current, 2)) + goto Found2; + if (value == Unsafe.Add(ref current, 3)) + goto Found3; + + offset += 4; + lengthToExamine -= 4; + } + + while (lengthToExamine > 0) + { + if (value == Unsafe.Add(ref searchSpace, offset)) + goto Found; + + offset++; + lengthToExamine--; + } + + // We get past SequentialScan only if IsHardwareAccelerated or intrinsic .IsSupported is true. However, we still have the redundant check to allow + // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated. + if (Avx2.IsSupported) + { + if (offset < length) + { + Debug.Assert(length - offset >= Vector128.Count); + if (((nint)Unsafe.AsPointer(ref Unsafe.Add(ref searchSpace, (nint)offset)) & (nint)(Vector256.Count - 1)) != 0) + { + // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches + // with no upper bound e.g. String.wcslen. Start with a check on Vector128 to align to Vector256, + // before moving to processing Vector256. + + // If the input searchSpan has been fixed or pinned, this ensures we do not fault across memory pages + // while searching for an end of string. Specifically that this assumes that the length is either correct + // or that the data is pinned otherwise it may cause an AccessViolation from crossing a page boundary into an + // unowned page. If the search is unbounded (e.g. null terminator in wcslen) and the search value is not found, + // again this will likely cause an AccessViolation. However, correctly bounded searches will return -1 rather + // than ever causing an AV. + + // If the searchSpan has not been fixed or pinned the GC can relocate it during the execution of this + // method, so the alignment only acts as best endeavour. The GC cost is likely to dominate over + // the misalignment that may occur after; to we default to giving the GC a free hand to relocate and + // its up to the caller whether they are operating over fixed data. + Vector128 values = Vector128.Create((ushort)value); + Vector128 search = LoadVector128(ref searchSpace, offset); + + // Same method as below + int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte()); + if (matches == 0) + { + // Zero flags set so no matches + offset += Vector128.Count; + } + else + { + // Find bitflag offset of first match and add to current offset + return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } + } + + lengthToExamine = GetCharVector256SpanLength(offset, length); + if (lengthToExamine > 0) + { + Vector256 values = Vector256.Create((ushort)value); + do + { + Debug.Assert(lengthToExamine >= Vector256.Count); + + Vector256 search = LoadVector256(ref searchSpace, offset); + int matches = Avx2.MoveMask(Avx2.CompareEqual(values, search).AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // Zero flags set so no matches + offset += Vector256.Count; + lengthToExamine -= Vector256.Count; + continue; + } + + // Find bitflag offset of first match and add to current offset, + // flags are in bytes so divide for chars + return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } while (lengthToExamine > 0); + } + + lengthToExamine = GetCharVector128SpanLength(offset, length); + if (lengthToExamine > 0) + { + Debug.Assert(lengthToExamine >= Vector128.Count); + + Vector128 values = Vector128.Create((ushort)value); + Vector128 search = LoadVector128(ref searchSpace, offset); + + // Same method as above + int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte()); + if (matches == 0) + { + // Zero flags set so no matches + offset += Vector128.Count; + // Don't need to change lengthToExamine here as we don't use its current value again. + } + else + { + // Find bitflag offset of first match and add to current offset, + // flags are in bytes so divide for chars + return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } + } + + if (offset < length) + { + lengthToExamine = length - offset; + goto SequentialScan; + } + } + } + else if (Sse2.IsSupported) + { + if (offset < length) + { + Debug.Assert(length - offset >= Vector128.Count); + + lengthToExamine = GetCharVector128SpanLength(offset, length); + if (lengthToExamine > 0) + { + Vector128 values = Vector128.Create((ushort)value); + do + { + Debug.Assert(lengthToExamine >= Vector128.Count); + + Vector128 search = LoadVector128(ref searchSpace, offset); + + // Same method as above + int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte()); + if (matches == 0) + { + // Zero flags set so no matches + offset += Vector128.Count; + lengthToExamine -= Vector128.Count; + continue; + } + + // Find bitflag offset of first match and add to current offset, + // flags are in bytes so divide for chars + return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } while (lengthToExamine > 0); + } + + if (offset < length) + { + lengthToExamine = length - offset; + goto SequentialScan; + } + } + } + else if (AdvSimd.Arm64.IsSupported) + { + if (offset < length) + { + Debug.Assert(length - offset >= Vector128.Count); + + lengthToExamine = GetCharVector128SpanLength(offset, length); + if (lengthToExamine > 0) + { + Vector128 values = Vector128.Create((ushort)value); + do + { + Debug.Assert(lengthToExamine >= Vector128.Count); + + Vector128 search = LoadVector128(ref searchSpace, offset); + Vector128 compareResult = AdvSimd.CompareEqual(values, search); + + if (compareResult == Vector128.Zero) + { + offset += Vector128.Count; + lengthToExamine -= Vector128.Count; + continue; + } + + return (int)(offset + FindFirstMatchedLane(compareResult)); + } while (lengthToExamine > 0); + } + + if (offset < length) + { + lengthToExamine = length - offset; + goto SequentialScan; + } + } + } + else if (Vector.IsHardwareAccelerated) + { + if (offset < length) + { + Debug.Assert(length - offset >= Vector.Count); + + lengthToExamine = GetCharVectorSpanLength(offset, length); + + if (lengthToExamine > 0) + { + Vector values = new Vector((ushort)value); + do + { + Debug.Assert(lengthToExamine >= Vector.Count); + + var matches = Vector.Equals(values, LoadVector(ref searchSpace, offset)); + if (Vector.Zero.Equals(matches)) + { + offset += Vector.Count; + lengthToExamine -= Vector.Count; + continue; + } + + // Find offset of first match + return (int)(offset + LocateFirstFoundChar(matches)); + } while (lengthToExamine > 0); + } + + if (offset < length) + { + lengthToExamine = length - offset; + goto SequentialScan; + } + } + } + return -1; + Found3: + return (int)(offset + 3); + Found2: + return (int)(offset + 2); + Found1: + return (int)(offset + 1); + Found: + return (int)(offset); + } + + internal static unsafe int IndexOfValueType(ref T searchSpace, T value, int length) where T : struct, IEquatable + { + Debug.Assert(length >= 0); + + nint index = 0; // Use nint for arithmetic to avoid unnecessary 64->32->64 truncations + if (Vector.IsHardwareAccelerated && Vector.IsSupported && (Vector.Count * 2) <= length) + { + Vector valueVector = new Vector(value); + Vector compareVector; + Vector matchVector; + if ((uint)length % (uint)Vector.Count != 0) + { + // Number of elements is not a multiple of Vector.Count, so do one + // check and shift only enough for the remaining set to be a multiple + // of Vector.Count. + compareVector = Unsafe.As>(ref Unsafe.Add(ref searchSpace, index)); + matchVector = Vector.Equals(valueVector, compareVector); + if (matchVector != Vector.Zero) + { + goto VectorMatch; + } + index += length % Vector.Count; + length -= length % Vector.Count; + } + while (length > 0) + { + compareVector = Unsafe.As>(ref Unsafe.Add(ref searchSpace, index)); + matchVector = Vector.Equals(valueVector, compareVector); + if (matchVector != Vector.Zero) + { + goto VectorMatch; + } + index += Vector.Count; + length -= Vector.Count; + } + goto NotFound; + VectorMatch: + for (int i = 0; i < Vector.Count; i++) + if (compareVector[i].Equals(value)) + return (int)(index + i); + } + + while (length >= 8) + { + if (value.Equals(Unsafe.Add(ref searchSpace, index))) + goto Found; + if (value.Equals(Unsafe.Add(ref searchSpace, index + 1))) + goto Found1; + if (value.Equals(Unsafe.Add(ref searchSpace, index + 2))) + goto Found2; + if (value.Equals(Unsafe.Add(ref searchSpace, index + 3))) + goto Found3; + if (value.Equals(Unsafe.Add(ref searchSpace, index + 4))) + goto Found4; + if (value.Equals(Unsafe.Add(ref searchSpace, index + 5))) + goto Found5; + if (value.Equals(Unsafe.Add(ref searchSpace, index + 6))) + goto Found6; + if (value.Equals(Unsafe.Add(ref searchSpace, index + 7))) + goto Found7; + + length -= 8; + index += 8; + } + + while (length >= 4) + { + if (value.Equals(Unsafe.Add(ref searchSpace, index))) + goto Found; + if (value.Equals(Unsafe.Add(ref searchSpace, index + 1))) + goto Found1; + if (value.Equals(Unsafe.Add(ref searchSpace, index + 2))) + goto Found2; + if (value.Equals(Unsafe.Add(ref searchSpace, index + 3))) + goto Found3; + + length -= 4; + index += 4; + } + + while (length > 0) + { + if (value.Equals(Unsafe.Add(ref searchSpace, index))) + goto Found; + + index += 1; + length--; + } + NotFound: + return -1; + + Found: // Workaround for https://github.com/dotnet/runtime/issues/8795 + return (int)index; + Found1: + return (int)(index + 1); + Found2: + return (int)(index + 2); + Found3: + return (int)(index + 3); + Found4: + return (int)(index + 4); + Found5: + return (int)(index + 5); + Found6: + return (int)(index + 6); + Found7: + return (int)(index + 7); + } + + internal static int IndexOfAnyExceptValueType(ref T searchSpace, T value0, int length) where T : struct, IEquatable + { + Debug.Assert(length >= 0, "Expected non-negative length"); + Debug.Assert(value0 is byte or short or int or long, "Expected caller to normalize to one of these types"); + + if (!Vector128.IsHardwareAccelerated || length < Vector128.Count) + { + for (int i = 0; i < length; i++) + { + if (!Unsafe.Add(ref searchSpace, i).Equals(value0)) + { + return i; + } + } + } + else + { + Vector128 notEquals, value0Vector = Vector128.Create(value0); + ref T current = ref searchSpace; + ref T oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector128.Count); + + // Loop until either we've finished all elements or there's less than a vector's-worth remaining. + do + { + notEquals = ~Vector128.Equals(value0Vector, Vector128.LoadUnsafe(ref current)); + if (notEquals != Vector128.Zero) + { + return ComputeIndex(ref searchSpace, ref current, notEquals); + } + + current = ref Unsafe.Add(ref current, Vector128.Count); + } + while (!Unsafe.IsAddressGreaterThan(ref current, ref oneVectorAwayFromEnd)); + + // If any elements remain, process the last vector in the search space. + if ((uint)length % Vector128.Count != 0) + { + notEquals = ~Vector128.Equals(value0Vector, Vector128.LoadUnsafe(ref oneVectorAwayFromEnd)); + if (notEquals != Vector128.Zero) + { + return ComputeIndex(ref searchSpace, ref oneVectorAwayFromEnd, notEquals); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static int ComputeIndex(ref T searchSpace, ref T current, Vector128 notEquals) + { + uint notEqualsElements = notEquals.ExtractMostSignificantBits(); + int index = BitOperations.TrailingZeroCount(notEqualsElements); + return index + (int)(Unsafe.ByteOffset(ref searchSpace, ref current) / Unsafe.SizeOf()); + } + } + + return -1; + } + + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + internal static int LastIndexOfValueType(ref byte searchSpace, byte value, int length) + { + Debug.Assert(length >= 0); + + uint uValue = value; // Use uint for comparisons to avoid unnecessary 8->32 extensions + nuint offset = (nuint)(uint)length; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations + nuint lengthToExamine = (nuint)(uint)length; + + if (Vector.IsHardwareAccelerated && length >= Vector.Count * 2) + { + lengthToExamine = UnalignedCountVectorFromEnd(ref searchSpace, length); + } + SequentialScan: + while (lengthToExamine >= 8) + { + lengthToExamine -= 8; + offset -= 8; + + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 7)) + goto Found7; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 6)) + goto Found6; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 5)) + goto Found5; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 4)) + goto Found4; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 3)) + goto Found3; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 2)) + goto Found2; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 1)) + goto Found1; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset)) + goto Found; + } + + if (lengthToExamine >= 4) + { + lengthToExamine -= 4; + offset -= 4; + + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 3)) + goto Found3; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 2)) + goto Found2; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset + 1)) + goto Found1; + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset)) + goto Found; + } + + while (lengthToExamine > 0) + { + lengthToExamine -= 1; + offset -= 1; + + if (uValue == Unsafe.AddByteOffset(ref searchSpace, offset)) + goto Found; + } + + if (Vector.IsHardwareAccelerated && (offset > 0)) + { + lengthToExamine = (offset & (nuint)~(Vector.Count - 1)); + + Vector values = new Vector(value); + + while (lengthToExamine > (nuint)(Vector.Count - 1)) + { + var matches = Vector.Equals(values, LoadVector(ref searchSpace, offset - (nuint)Vector.Count)); + if (Vector.Zero.Equals(matches)) + { + offset -= (nuint)Vector.Count; + lengthToExamine -= (nuint)Vector.Count; + continue; + } + + // Find offset of first match and add to current offset + return (int)(offset) - Vector.Count + LocateLastFoundByte(matches); + } + if (offset > 0) + { + lengthToExamine = offset; + goto SequentialScan; + } + } + return -1; + Found: // Workaround for https://github.com/dotnet/runtime/issues/8795 + return (int)offset; + Found1: + return (int)(offset + 1); + Found2: + return (int)(offset + 2); + Found3: + return (int)(offset + 3); + Found4: + return (int)(offset + 4); + Found5: + return (int)(offset + 5); + Found6: + return (int)(offset + 6); + Found7: + return (int)(offset + 7); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe int LastIndexOfValueType(ref short searchSpace, short value, int length) + => LastIndexOfValueType(ref Unsafe.As(ref searchSpace), Unsafe.As(ref value), length); + + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + internal static unsafe int LastIndexOfValueType(ref char searchSpace, char value, int length) + { + Debug.Assert(length >= 0); + + fixed (char* pChars = &searchSpace) + { + char* pCh = pChars + length; + char* pEndCh = pChars; + + if (Vector.IsHardwareAccelerated && length >= Vector.Count * 2) + { + // Figure out how many characters to read sequentially from the end until we are vector aligned + // This is equivalent to: length = ((int)pCh % Unsafe.SizeOf>()) / elementsPerByte + const int elementsPerByte = sizeof(ushort) / sizeof(byte); + length = ((int)pCh & (Unsafe.SizeOf>() - 1)) / elementsPerByte; + } + + SequentialScan: + while (length >= 4) + { + length -= 4; + pCh -= 4; + + if (*(pCh + 3) == value) + goto Found3; + if (*(pCh + 2) == value) + goto Found2; + if (*(pCh + 1) == value) + goto Found1; + if (*pCh == value) + goto Found; + } + + while (length > 0) + { + length--; + pCh--; + + if (*pCh == value) + goto Found; + } + + // We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow + // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated. + if (Vector.IsHardwareAccelerated && pCh > pEndCh) + { + // Get the highest multiple of Vector.Count that is within the search space. + // That will be how many times we iterate in the loop below. + // This is equivalent to: length = Vector.Count * ((int)(pCh - pEndCh) / Vector.Count) + length = (int)((pCh - pEndCh) & ~(Vector.Count - 1)); + + // Get comparison Vector + Vector vComparison = new Vector(value); + + while (length > 0) + { + char* pStart = pCh - Vector.Count; + // Using Unsafe.Read instead of ReadUnaligned since the search space is pinned and pCh (and hence pSart) is always vector aligned + Debug.Assert(((int)pStart & (Unsafe.SizeOf>() - 1)) == 0); + Vector vMatches = Vector.Equals(vComparison, Unsafe.Read>(pStart)); + if (Vector.Zero.Equals(vMatches)) + { + pCh -= Vector.Count; + length -= Vector.Count; + continue; + } + // Find offset of last match + return (int)(pStart - pEndCh) + LocateLastFoundChar(vMatches); + } + + if (pCh > pEndCh) + { + length = (int)(pCh - pEndCh); + goto SequentialScan; + } + } + + return -1; + Found: + return (int)(pCh - pEndCh); + Found1: + return (int)(pCh - pEndCh) + 1; + Found2: + return (int)(pCh - pEndCh) + 2; + Found3: + return (int)(pCh - pEndCh) + 3; + } + } + + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + internal static unsafe int LastIndexOfValueType(ref T searchSpace, T value, int length) where T : IEquatable? + => LastIndexOf(ref searchSpace, value, length); + + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + internal static int IndexOfAnyValueType(ref byte searchSpace, byte value0, byte value1, int length) + { + Debug.Assert(length >= 0); + + uint uValue0 = value0; // Use uint for comparisons to avoid unnecessary 8->32 extensions + uint uValue1 = value1; // Use uint for comparisons to avoid unnecessary 8->32 extensions + nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations + nuint lengthToExamine = (nuint)(uint)length; + + if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported) + { + // Avx2 branch also operates on Sse2 sizes, so check is combined. + nint vectorDiff = (nint)length - Vector128.Count; + if (vectorDiff >= 0) + { + // >= Sse2 intrinsics are supported, and length is enough to use them so use that path. + // We jump forward to the intrinsics at the end of the method so a naive branch predict + // will choose the non-intrinsic path so short lengths which don't gain anything aren't + // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths + // more than make this back from the intrinsics. + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; + } + } + else if (Vector.IsHardwareAccelerated) + { + // Calculate lengthToExamine here for test, as it is used later + nint vectorDiff = (nint)length - Vector.Count; + if (vectorDiff >= 0) + { + // Similar as above for Vector version + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; + } + } + + uint lookUp; + while (lengthToExamine >= 8) + { + lengthToExamine -= 8; + + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found1; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found2; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found3; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 4); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found4; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 5); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found5; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 6); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found6; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 7); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found7; + + offset += 8; + } + + if (lengthToExamine >= 4) + { + lengthToExamine -= 4; + + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found1; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found2; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found3; + + offset += 4; + } + + while (lengthToExamine > 0) + { + + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found; + + offset += 1; + lengthToExamine -= 1; + } + + NotFound: + return -1; + Found: // Workaround for https://github.com/dotnet/runtime/issues/8795 + return (int)offset; + Found1: + return (int)(offset + 1); + Found2: + return (int)(offset + 2); + Found3: + return (int)(offset + 3); + Found4: + return (int)(offset + 4); + Found5: + return (int)(offset + 5); + Found6: + return (int)(offset + 6); + Found7: + return (int)(offset + 7); + + IntrinsicsCompare: + // When we move into a Vectorized block, we process everything of Vector size; + // and then for any remainder we do a final compare of Vector size but starting at + // the end and forwards, which may overlap on an earlier compare. + + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (Sse2.IsSupported) + { + int matches; + if (Avx2.IsSupported) + { + Vector256 search; + // Guard as we may only have a valid size for Vector128; when we will move to the Sse2 + // We have already subtracted Vector128.Count from lengthToExamine so compare against that + // to see if we have double the size for Vector256.Count + if (lengthToExamine >= (nuint)Vector128.Count) + { + Vector256 values0 = Vector256.Create(value0); + Vector256 values1 = Vector256.Create(value1); + + // Subtract Vector128.Count so we have now subtracted Vector256.Count + lengthToExamine -= (nuint)Vector128.Count; + // First time this checks again against 0, however we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector256(ref searchSpace, offset); + // Bitwise Or to combine the flagged matches for the second value to our match flags + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search))); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector256.Count; + continue; + } + + goto IntrinsicsMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector256(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search))); + if (matches == 0) + { + // None matched + goto NotFound; + } + + goto IntrinsicsMatch; + } + } + + // Initial size check was done on method entry. + Debug.Assert(length >= Vector128.Count); + { + Vector128 search; + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector128(ref searchSpace, offset); + + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search)) + .AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector128.Count; + continue; + } + + goto IntrinsicsMatch; + } + // Move to Vector length from end for final compare + search = LoadVector128(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search))); + if (matches == 0) + { + // None matched + goto NotFound; + } + } + + IntrinsicsMatch: + // Find bitflag offset of first difference and add to current offset + offset += (nuint)BitOperations.TrailingZeroCount(matches); + goto Found; + } + else if (AdvSimd.Arm64.IsSupported) + { + Vector128 search; + Vector128 matches; + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector128(ref searchSpace, offset); + + matches = AdvSimd.Or( + AdvSimd.CompareEqual(values0, search), + AdvSimd.CompareEqual(values1, search)); + + if (matches == Vector128.Zero) + { + offset += (nuint)Vector128.Count; + continue; + } + + // Find bitflag offset of first match and add to current offset + offset += FindFirstMatchedLane(matches); + + goto Found; + } + + // Move to Vector length from end for final compare + search = LoadVector128(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = AdvSimd.Or( + AdvSimd.CompareEqual(values0, search), + AdvSimd.CompareEqual(values1, search)); + + if (matches == Vector128.Zero) + { + // None matched + goto NotFound; + } + + // Find bitflag offset of first match and add to current offset + offset += FindFirstMatchedLane(matches); + + goto Found; + } + else if (Vector.IsHardwareAccelerated) + { + Vector values0 = new Vector(value0); + Vector values1 = new Vector(value1); + + Vector search; + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector(ref searchSpace, offset); + search = Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)); + if (Vector.Zero.Equals(search)) + { + // None matched + offset += (nuint)Vector.Count; + continue; + } + + goto VectorMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + search = Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)); + if (Vector.Zero.Equals(search)) + { + // None matched + goto NotFound; + } + + VectorMatch: + offset += (nuint)LocateFirstFoundByte(search); + goto Found; + } + + Debug.Fail("Unreachable"); + goto NotFound; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe int IndexOfAnyValueType(ref short searchSpace, short value0, short value1, int length) + => IndexOfAnyChar(ref Unsafe.As(ref searchSpace), Unsafe.As(ref value0), Unsafe.As(ref value1), length); + + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + internal static unsafe int IndexOfAnyChar(ref char searchStart, char value0, char value1, int length) + { + Debug.Assert(length >= 0); + + nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations + nuint lengthToExamine = (nuint)(uint)length; + + if (Sse2.IsSupported) + { + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + nint vectorDiff = (nint)length - Vector128.Count; + if (vectorDiff >= 0) + { + // >= Sse2 intrinsics are supported and length is enough to use them, so use that path. + // We jump forward to the intrinsics at the end of them method so a naive branch predict + // will choose the non-intrinsic path so short lengths which don't gain anything aren't + // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths + // more than make this back from the intrinsics. + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; + } + } + else if (Vector.IsHardwareAccelerated) + { + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + nint vectorDiff = (nint)length - Vector.Count; + if (vectorDiff >= 0) + { + // Similar as above for Vector version + lengthToExamine = (nuint)vectorDiff; + goto VectorCompare; + } + } + + int lookUp; + while (lengthToExamine >= 4) + { + ref char current = ref Add(ref searchStart, offset); + + lookUp = current; + if (value0 == lookUp || value1 == lookUp) + goto Found; + lookUp = Unsafe.Add(ref current, 1); + if (value0 == lookUp || value1 == lookUp) + goto Found1; + lookUp = Unsafe.Add(ref current, 2); + if (value0 == lookUp || value1 == lookUp) + goto Found2; + lookUp = Unsafe.Add(ref current, 3); + if (value0 == lookUp || value1 == lookUp) + goto Found3; + + offset += 4; + lengthToExamine -= 4; + } + + while (lengthToExamine > 0) + { + lookUp = Add(ref searchStart, offset); + if (value0 == lookUp || value1 == lookUp) + goto Found; + + offset += 1; + lengthToExamine -= 1; + } + + NotFound: + return -1; + Found3: + return (int)(offset + 3); + Found2: + return (int)(offset + 2); + Found1: + return (int)(offset + 1); + Found: + return (int)offset; + + IntrinsicsCompare: + // When we move into a Vectorized block, we process everything of Vector size; + // and then for any remainder we do a final compare of Vector size but starting at + // the end and forwards, which may overlap on an earlier compare. + + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (Sse2.IsSupported) + { + int matches; + if (Avx2.IsSupported) + { + Vector256 search; + // Guard as we may only have a valid size for Vector128; when we will move to the Sse2 + // We have already subtracted Vector128.Count from lengthToExamine so compare against that + // to see if we have double the size for Vector256.Count + if (lengthToExamine >= (nuint)Vector128.Count) + { + Vector256 values0 = Vector256.Create((ushort)value0); + Vector256 values1 = Vector256.Create((ushort)value1); + + // Subtract Vector128.Count so we have now subtracted Vector256.Count + lengthToExamine -= (nuint)Vector128.Count; + // First time this checks again against 0, however we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector256(ref searchStart, offset); + // Bitwise Or to combine the flagged matches for the second value to our match flags + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search)) + .AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector256.Count; + continue; + } + + goto IntrinsicsMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector256(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search)) + .AsByte()); + if (matches == 0) + { + // None matched + goto NotFound; + } + + goto IntrinsicsMatch; + } + } + + // Initial size check was done on method entry. + Debug.Assert(length >= Vector128.Count); + { + Vector128 search; + Vector128 values0 = Vector128.Create((ushort)value0); + Vector128 values1 = Vector128.Create((ushort)value1); + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector128(ref searchStart, offset); + + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search)) + .AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector128.Count; + continue; + } + + goto IntrinsicsMatch; + } + // Move to Vector length from end for final compare + search = LoadVector128(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search)) + .AsByte()); + if (matches == 0) + { + // None matched + goto NotFound; + } + } + + IntrinsicsMatch: + // Find bitflag offset of first difference and add to current offset, + // flags are in bytes so divide by 2 for chars (shift right by 1) + offset += (nuint)(uint)BitOperations.TrailingZeroCount(matches) >> 1; + goto Found; + } + + VectorCompare: + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (!Sse2.IsSupported && Vector.IsHardwareAccelerated) + { + Vector values0 = new Vector(value0); + Vector values1 = new Vector(value1); + + Vector search; + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector(ref searchStart, offset); + search = Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)); + if (Vector.Zero.Equals(search)) + { + // None matched + offset += (nuint)Vector.Count; + continue; + } + + goto VectorMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector(ref searchStart, lengthToExamine); + offset = lengthToExamine; + search = Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)); + if (Vector.Zero.Equals(search)) + { + // None matched + goto NotFound; + } + + VectorMatch: + offset += (nuint)(uint)LocateFirstFoundChar(search); + goto Found; + } + + Debug.Fail("Unreachable"); + goto NotFound; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int IndexOfAnyExceptValueType(ref T searchSpace, T value0, T value1, int length) + => IndexOfAnyExcept(ref searchSpace, value0, value1, length); + + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + internal static int IndexOfAnyValueType(ref byte searchSpace, byte value0, byte value1, byte value2, int length) + { + Debug.Assert(length >= 0); + + uint uValue0 = value0; // Use uint for comparisons to avoid unnecessary 8->32 extensions + uint uValue1 = value1; // Use uint for comparisons to avoid unnecessary 8->32 extensions + uint uValue2 = value2; // Use uint for comparisons to avoid unnecessary 8->32 extensions + nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations + nuint lengthToExamine = (nuint)(uint)length; + + if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported) + { + // Avx2 branch also operates on Sse2 sizes, so check is combined. + nint vectorDiff = (nint)length - Vector128.Count; + if (vectorDiff >= 0) + { + // >= Sse2 intrinsics are supported, and length is enough to use them so use that path. + // We jump forward to the intrinsics at the end of the method so a naive branch predict + // will choose the non-intrinsic path so short lengths which don't gain anything aren't + // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths + // more than make this back from the intrinsics. + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; + } + } + else if (Vector.IsHardwareAccelerated) + { + // Calculate lengthToExamine here for test, as it is used later + nint vectorDiff = (nint)length - Vector.Count; + if (vectorDiff >= 0) + { + // Similar as above for Vector version + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; + } + } + + uint lookUp; + while (lengthToExamine >= 8) + { + lengthToExamine -= 8; + + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found1; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found2; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found3; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 4); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found4; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 5); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found5; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 6); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found6; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 7); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found7; + + offset += 8; + } + + if (lengthToExamine >= 4) + { + lengthToExamine -= 4; + + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found1; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found2; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found3; + + offset += 4; + } + + while (lengthToExamine > 0) + { + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found; + + offset += 1; + lengthToExamine -= 1; + } + + NotFound: + return -1; + Found: // Workaround for https://github.com/dotnet/runtime/issues/8795 + return (int)offset; + Found1: + return (int)(offset + 1); + Found2: + return (int)(offset + 2); + Found3: + return (int)(offset + 3); + Found4: + return (int)(offset + 4); + Found5: + return (int)(offset + 5); + Found6: + return (int)(offset + 6); + Found7: + return (int)(offset + 7); + + IntrinsicsCompare: + // When we move into a Vectorized block, we process everything of Vector size; + // and then for any remainder we do a final compare of Vector size but starting at + // the end and forwards, which may overlap on an earlier compare. + + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (Sse2.IsSupported) + { + int matches; + if (Avx2.IsSupported) + { + Vector256 search; + // Guard as we may only have a valid size for Vector128; when we will move to the Sse2 + // We have already subtracted Vector128.Count from lengthToExamine so compare against that + // to see if we have double the size for Vector256.Count + if (lengthToExamine >= (nuint)Vector128.Count) + { + Vector256 values0 = Vector256.Create(value0); + Vector256 values1 = Vector256.Create(value1); + Vector256 values2 = Vector256.Create(value2); + + // Subtract Vector128.Count so we have now subtracted Vector256.Count + lengthToExamine -= (nuint)Vector128.Count; + // First time this checks again against 0, however we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector256(ref searchSpace, offset); + // Bitwise Or to combine the flagged matches for the second value to our match flags + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search)), + Avx2.CompareEqual(values2, search))); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector256.Count; + continue; + } + + goto IntrinsicsMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector256(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search)), + Avx2.CompareEqual(values2, search))); + if (matches == 0) + { + // None matched + goto NotFound; + } + + goto IntrinsicsMatch; + } + } + + // Initial size check was done on method entry. + Debug.Assert(length >= Vector128.Count); + { + Vector128 search; + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + Vector128 values2 = Vector128.Create(value2); + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector128(ref searchSpace, offset); + + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search)), + Sse2.CompareEqual(values2, search))); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector128.Count; + continue; + } + + goto IntrinsicsMatch; + } + // Move to Vector length from end for final compare + search = LoadVector128(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search)), + Sse2.CompareEqual(values2, search))); + if (matches == 0) + { + // None matched + goto NotFound; + } + } + + IntrinsicsMatch: + // Find bitflag offset of first difference and add to current offset + offset += (nuint)BitOperations.TrailingZeroCount(matches); + goto Found; + } + else if (AdvSimd.Arm64.IsSupported) + { + Vector128 search; + Vector128 matches; + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + Vector128 values2 = Vector128.Create(value2); + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector128(ref searchSpace, offset); + + matches = AdvSimd.Or( + AdvSimd.Or( + AdvSimd.CompareEqual(values0, search), + AdvSimd.CompareEqual(values1, search)), + AdvSimd.CompareEqual(values2, search)); + + if (matches == Vector128.Zero) + { + offset += (nuint)Vector128.Count; + continue; + } + + // Find bitflag offset of first match and add to current offset + offset += FindFirstMatchedLane(matches); + + goto Found; + } + + // Move to Vector length from end for final compare + search = LoadVector128(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = AdvSimd.Or( + AdvSimd.Or( + AdvSimd.CompareEqual(values0, search), + AdvSimd.CompareEqual(values1, search)), + AdvSimd.CompareEqual(values2, search)); + + if (matches == Vector128.Zero) + { + // None matched + goto NotFound; + } + + // Find bitflag offset of first match and add to current offset + offset += FindFirstMatchedLane(matches); + + goto Found; + } + else if (Vector.IsHardwareAccelerated) + { + Vector values0 = new Vector(value0); + Vector values1 = new Vector(value1); + Vector values2 = new Vector(value2); + + Vector search; + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector(ref searchSpace, offset); + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)); + if (Vector.Zero.Equals(search)) + { + // None matched + offset += (nuint)Vector.Count; + continue; + } + + goto VectorMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)); + if (Vector.Zero.Equals(search)) + { + // None matched + goto NotFound; + } + + VectorMatch: + offset += (nuint)LocateFirstFoundByte(search); + goto Found; + } + + Debug.Fail("Unreachable"); + goto NotFound; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe int IndexOfAnyValueType(ref short searchSpace, short value0, short value1, short value2, int length) + => IndexOfAnyValueType( + ref Unsafe.As(ref searchSpace), + Unsafe.As(ref value0), + Unsafe.As(ref value1), + Unsafe.As(ref value2), + length); + + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + internal static unsafe int IndexOfAnyValueType(ref char searchStart, char value0, char value1, char value2, int length) + { + Debug.Assert(length >= 0); + + nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations + nuint lengthToExamine = (nuint)(uint)length; + + if (Sse2.IsSupported) + { + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + nint vectorDiff = (nint)length - Vector128.Count; + if (vectorDiff >= 0) + { + // >= Sse2 intrinsics are supported and length is enough to use them, so use that path. + // We jump forward to the intrinsics at the end of them method so a naive branch predict + // will choose the non-intrinsic path so short lengths which don't gain anything aren't + // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths + // more than make this back from the intrinsics. + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; + } + } + else if (Vector.IsHardwareAccelerated) + { + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + nint vectorDiff = (nint)length - Vector.Count; + if (vectorDiff >= 0) + { + // Similar as above for Vector version + lengthToExamine = (nuint)vectorDiff; + goto VectorCompare; + } + } + + int lookUp; + while (lengthToExamine >= 4) + { + ref char current = ref Add(ref searchStart, offset); + + lookUp = current; + if (value0 == lookUp || value1 == lookUp || value2 == lookUp) + goto Found; + lookUp = Unsafe.Add(ref current, 1); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp) + goto Found1; + lookUp = Unsafe.Add(ref current, 2); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp) + goto Found2; + lookUp = Unsafe.Add(ref current, 3); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp) + goto Found3; + + offset += 4; + lengthToExamine -= 4; + } + + while (lengthToExamine > 0) + { + lookUp = Add(ref searchStart, offset); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp) + goto Found; + + offset += 1; + lengthToExamine -= 1; + } + + NotFound: + return -1; + Found3: + return (int)(offset + 3); + Found2: + return (int)(offset + 2); + Found1: + return (int)(offset + 1); + Found: + return (int)offset; + + IntrinsicsCompare: + // When we move into a Vectorized block, we process everything of Vector size; + // and then for any remainder we do a final compare of Vector size but starting at + // the end and forwards, which may overlap on an earlier compare. + + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (Sse2.IsSupported) + { + int matches; + if (Avx2.IsSupported) + { + Vector256 search; + // Guard as we may only have a valid size for Vector128; when we will move to the Sse2 + // We have already subtracted Vector128.Count from lengthToExamine so compare against that + // to see if we have double the size for Vector256.Count + if (lengthToExamine >= (nuint)Vector128.Count) + { + Vector256 values0 = Vector256.Create((ushort)value0); + Vector256 values1 = Vector256.Create((ushort)value1); + Vector256 values2 = Vector256.Create((ushort)value2); + + // Subtract Vector128.Count so we have now subtracted Vector256.Count + lengthToExamine -= (nuint)Vector128.Count; + // First time this checks again against 0, however we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector256(ref searchStart, offset); + // Bitwise Or to combine the flagged matches for the second value to our match flags + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search)), + Avx2.CompareEqual(values2, search)) + .AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector256.Count; + continue; + } + + goto IntrinsicsMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector256(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search)), + Avx2.CompareEqual(values2, search)) + .AsByte()); + if (matches == 0) + { + // None matched + goto NotFound; + } + + goto IntrinsicsMatch; + } + } + + // Initial size check was done on method entry. + Debug.Assert(length >= Vector128.Count); + { + Vector128 search; + Vector128 values0 = Vector128.Create((ushort)value0); + Vector128 values1 = Vector128.Create((ushort)value1); + Vector128 values2 = Vector128.Create((ushort)value2); + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector128(ref searchStart, offset); + + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search)), + Sse2.CompareEqual(values2, search)) + .AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector128.Count; + continue; + } + + goto IntrinsicsMatch; + } + // Move to Vector length from end for final compare + search = LoadVector128(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search)), + Sse2.CompareEqual(values2, search)) + .AsByte()); + if (matches == 0) + { + // None matched + goto NotFound; + } + } + + IntrinsicsMatch: + // Find bitflag offset of first difference and add to current offset, + // flags are in bytes so divide by 2 for chars (shift right by 1) + offset += (nuint)(uint)BitOperations.TrailingZeroCount(matches) >> 1; + goto Found; + } + + VectorCompare: + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (!Sse2.IsSupported && Vector.IsHardwareAccelerated) + { + Vector values0 = new Vector(value0); + Vector values1 = new Vector(value1); + Vector values2 = new Vector(value2); + + Vector search; + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector(ref searchStart, offset); + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)); + if (Vector.Zero.Equals(search)) + { + // None matched + offset += (nuint)Vector.Count; + continue; + } + + goto VectorMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector(ref searchStart, lengthToExamine); + offset = lengthToExamine; + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)); + if (Vector.Zero.Equals(search)) + { + // None matched + goto NotFound; + } + + VectorMatch: + offset += (nuint)(uint)LocateFirstFoundChar(search); + goto Found; + } + + Debug.Fail("Unreachable"); + goto NotFound; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int IndexOfAnyExceptValueType(ref T searchSpace, T value0, T value1, T value2, int length) + => IndexOfAnyExcept(ref searchSpace, value0, value1, value2, length); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe int IndexOfAnyValueType(ref short searchSpace, short value0, short value1, short value2, short value3, int length) + => IndexOfAnyValueType( + ref Unsafe.As(ref searchSpace), + Unsafe.As(ref value0), + Unsafe.As(ref value1), + Unsafe.As(ref value2), + Unsafe.As(ref value3), + length); + + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + internal static unsafe int IndexOfAnyValueType(ref char searchStart, char value0, char value1, char value2, char value3, int length) + { + Debug.Assert(length >= 0); + + nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations + nuint lengthToExamine = (nuint)(uint)length; + + if (Sse2.IsSupported) + { + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + nint vectorDiff = (nint)length - Vector128.Count; + if (vectorDiff >= 0) + { + // >= Sse2 intrinsics are supported and length is enough to use them, so use that path. + // We jump forward to the intrinsics at the end of them method so a naive branch predict + // will choose the non-intrinsic path so short lengths which don't gain anything aren't + // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths + // more than make this back from the intrinsics. + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; + } + } + else if (Vector.IsHardwareAccelerated) + { + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + nint vectorDiff = (nint)length - Vector.Count; + if (vectorDiff >= 0) + { + // Similar as above for Vector version + lengthToExamine = (nuint)vectorDiff; + goto VectorCompare; + } + } + + int lookUp; + while (lengthToExamine >= 4) + { + ref char current = ref Add(ref searchStart, offset); + + lookUp = current; + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found; + lookUp = Unsafe.Add(ref current, 1); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found1; + lookUp = Unsafe.Add(ref current, 2); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found2; + lookUp = Unsafe.Add(ref current, 3); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found3; + + offset += 4; + lengthToExamine -= 4; + } + + while (lengthToExamine > 0) + { + lookUp = Add(ref searchStart, offset); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found; + + offset += 1; + lengthToExamine -= 1; + } + + NotFound: + return -1; + Found3: + return (int)(offset + 3); + Found2: + return (int)(offset + 2); + Found1: + return (int)(offset + 1); + Found: + return (int)offset; + + IntrinsicsCompare: + // When we move into a Vectorized block, we process everything of Vector size; + // and then for any remainder we do a final compare of Vector size but starting at + // the end and forwards, which may overlap on an earlier compare. + + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (Sse2.IsSupported) + { + int matches; + if (Avx2.IsSupported) + { + Vector256 search; + // Guard as we may only have a valid size for Vector128; when we will move to the Sse2 + // We have already subtracted Vector128.Count from lengthToExamine so compare against that + // to see if we have double the size for Vector256.Count + if (lengthToExamine >= (nuint)Vector128.Count) + { + Vector256 values0 = Vector256.Create((ushort)value0); + Vector256 values1 = Vector256.Create((ushort)value1); + Vector256 values2 = Vector256.Create((ushort)value2); + Vector256 values3 = Vector256.Create((ushort)value3); + + // Subtract Vector128.Count so we have now subtracted Vector256.Count + lengthToExamine -= (nuint)Vector128.Count; + // First time this checks again against 0, however we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector256(ref searchStart, offset); + // We preform the Or at non-Vector level as we are using the maximum number of non-preserved registers, + // and more causes them first to be pushed to stack and then popped on exit to preseve their values. + matches = Avx2.MoveMask(Avx2.CompareEqual(values0, search).AsByte()); + // Bitwise Or to combine the flagged matches for the second, third and fourth values to our match flags + matches |= Avx2.MoveMask(Avx2.CompareEqual(values1, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values2, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values3, search).AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector256.Count; + continue; + } + + goto IntrinsicsMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector256(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Avx2.MoveMask(Avx2.CompareEqual(values0, search).AsByte()); + // Bitwise Or to combine the flagged matches for the second, third and fourth values to our match flags + matches |= Avx2.MoveMask(Avx2.CompareEqual(values1, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values2, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values3, search).AsByte()); + if (matches == 0) + { + // None matched + goto NotFound; + } + + goto IntrinsicsMatch; + } + } + + // Initial size check was done on method entry. + Debug.Assert(length >= Vector128.Count); + { + Vector128 search; + Vector128 values0 = Vector128.Create((ushort)value0); + Vector128 values1 = Vector128.Create((ushort)value1); + Vector128 values2 = Vector128.Create((ushort)value2); + Vector128 values3 = Vector128.Create((ushort)value3); + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector128(ref searchStart, offset); + + matches = Sse2.MoveMask(Sse2.CompareEqual(values0, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values1, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values2, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values3, search).AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector128.Count; + continue; + } + + goto IntrinsicsMatch; + } + // Move to Vector length from end for final compare + search = LoadVector128(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Sse2.MoveMask(Sse2.CompareEqual(values0, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values1, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values2, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values3, search).AsByte()); + if (matches == 0) + { + // None matched + goto NotFound; + } + } + + IntrinsicsMatch: + // Find bitflag offset of first difference and add to current offset, + // flags are in bytes so divide by 2 for chars (shift right by 1) + offset += (nuint)(uint)BitOperations.TrailingZeroCount(matches) >> 1; + goto Found; + } + + VectorCompare: + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (!Sse2.IsSupported && Vector.IsHardwareAccelerated) + { + Vector values0 = new Vector(value0); + Vector values1 = new Vector(value1); + Vector values2 = new Vector(value2); + Vector values3 = new Vector(value3); + + Vector search; + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector(ref searchStart, offset); + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)), + Vector.Equals(search, values3)); + if (Vector.Zero.Equals(search)) + { + // None matched + offset += (nuint)Vector.Count; + continue; + } + + goto VectorMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector(ref searchStart, lengthToExamine); + offset = lengthToExamine; + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)), + Vector.Equals(search, values3)); + if (Vector.Zero.Equals(search)) + { + // None matched + goto NotFound; + } + + VectorMatch: + offset += (nuint)(uint)LocateFirstFoundChar(search); + goto Found; + } + + Debug.Fail("Unreachable"); + goto NotFound; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int IndexOfAnyExceptValueType(ref T searchSpace, T value0, T value1, T value2, T value3, int length) + => IndexOfAnyExcept(ref searchSpace, value0, value1, value2, value3, length); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int LastIndexOfAnyExceptValueType(ref T searchSpace, T value, int length) + => LastIndexOfAnyExcept(ref searchSpace, value, length); + + internal static int LastIndexOfAnyValueType(ref byte searchSpace, byte value0, byte value1, int length) + { + Debug.Assert(length >= 0); + + uint uValue0 = value0; // Use uint for comparisons to avoid unnecessary 8->32 extensions + uint uValue1 = value1; + nuint offset = (nuint)(uint)length; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations + nuint lengthToExamine = (nuint)(uint)length; + + if (Vector.IsHardwareAccelerated && length >= Vector.Count * 2) + { + lengthToExamine = UnalignedCountVectorFromEnd(ref searchSpace, length); + } + SequentialScan: + uint lookUp; + while (lengthToExamine >= 8) + { + lengthToExamine -= 8; + offset -= 8; + + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 7); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found7; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 6); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found6; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 5); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found5; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 4); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found4; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found3; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found2; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found1; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found; + } + + if (lengthToExamine >= 4) + { + lengthToExamine -= 4; + offset -= 4; + + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found3; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found2; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found1; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found; + } + + while (lengthToExamine > 0) + { + lengthToExamine -= 1; + offset -= 1; + + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp) + goto Found; + } + + if (Vector.IsHardwareAccelerated && (offset > 0)) + { + lengthToExamine = (offset & (nuint)~(Vector.Count - 1)); + + Vector values0 = new Vector(value0); + Vector values1 = new Vector(value1); + + while (lengthToExamine > (nuint)(Vector.Count - 1)) + { + Vector search = LoadVector(ref searchSpace, offset - (nuint)Vector.Count); + var matches = Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)); + if (Vector.Zero.Equals(matches)) + { + offset -= (nuint)Vector.Count; + lengthToExamine -= (nuint)Vector.Count; + continue; + } + + // Find offset of first match and add to current offset + return (int)(offset) - Vector.Count + LocateLastFoundByte(matches); + } + + if (offset > 0) + { + lengthToExamine = offset; + goto SequentialScan; + } + } + return -1; + Found: // Workaround for https://github.com/dotnet/runtime/issues/8795 + return (int)offset; + Found1: + return (int)(offset + 1); + Found2: + return (int)(offset + 2); + Found3: + return (int)(offset + 3); + Found4: + return (int)(offset + 4); + Found5: + return (int)(offset + 5); + Found6: + return (int)(offset + 6); + Found7: + return (int)(offset + 7); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int LastIndexOfAnyValueType(ref short searchSpace, short value0, short value1, int length) + => LastIndexOfAny(ref searchSpace, value0, value1, length); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int LastIndexOfAnyExceptValueType(ref T searchSpace, T value0, T value1, int length) + => LastIndexOfAnyExcept(ref searchSpace, value0, value1, length); + + internal static int LastIndexOfAnyValueType(ref byte searchSpace, byte value0, byte value1, byte value2, int length) + { + Debug.Assert(length >= 0); + + uint uValue0 = value0; // Use uint for comparisons to avoid unnecessary 8->32 extensions + uint uValue1 = value1; + uint uValue2 = value2; + nuint offset = (nuint)(uint)length; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations + nuint lengthToExamine = (nuint)(uint)length; + + if (Vector.IsHardwareAccelerated && length >= Vector.Count * 2) + { + lengthToExamine = UnalignedCountVectorFromEnd(ref searchSpace, length); + } + SequentialScan: + uint lookUp; + while (lengthToExamine >= 8) + { + lengthToExamine -= 8; + offset -= 8; + + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 7); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found7; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 6); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found6; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 5); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found5; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 4); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found4; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found3; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found2; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found1; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found; + } + + if (lengthToExamine >= 4) + { + lengthToExamine -= 4; + offset -= 4; + + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found3; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found2; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found1; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found; + } + + while (lengthToExamine > 0) + { + lengthToExamine -= 1; + offset -= 1; + + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) + goto Found; + } + + if (Vector.IsHardwareAccelerated && (offset > 0)) + { + lengthToExamine = (offset & (nuint)~(Vector.Count - 1)); + + Vector values0 = new Vector(value0); + Vector values1 = new Vector(value1); + Vector values2 = new Vector(value2); + + while (lengthToExamine > (nuint)(Vector.Count - 1)) + { + Vector search = LoadVector(ref searchSpace, offset - (nuint)Vector.Count); + + var matches = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)); + + if (Vector.Zero.Equals(matches)) + { + offset -= (nuint)Vector.Count; + lengthToExamine -= (nuint)Vector.Count; + continue; + } + + // Find offset of first match and add to current offset + return (int)(offset) - Vector.Count + LocateLastFoundByte(matches); + } + + if (offset > 0) + { + lengthToExamine = offset; + goto SequentialScan; + } + } + return -1; + Found: // Workaround for https://github.com/dotnet/runtime/issues/8795 + return (int)offset; + Found1: + return (int)(offset + 1); + Found2: + return (int)(offset + 2); + Found3: + return (int)(offset + 3); + Found4: + return (int)(offset + 4); + Found5: + return (int)(offset + 5); + Found6: + return (int)(offset + 6); + Found7: + return (int)(offset + 7); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int LastIndexOfAnyValueType(ref short searchSpace, short value0, short value1, short value2, int length) + => LastIndexOfAny(ref searchSpace, value0, value1, value2, length); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int LastIndexOfAnyExceptValueType(ref T searchSpace, T value0, T value1, T value2, int length) + => LastIndexOfAnyExcept(ref searchSpace, value0, value1, value2, length); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int LastIndexOfAnyExceptValueType(ref T searchSpace, T value0, T value1, T value2, T value3, int length) + => LastIndexOfAnyExcept(ref searchSpace, value0, value1, value2, value3, length); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 LoadVector128(ref char start, nint offset) + => Unsafe.ReadUnaligned>(ref Unsafe.As(ref Unsafe.Add(ref start, offset))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 LoadVector128(ref char start, nuint offset) + => Unsafe.ReadUnaligned>(ref Unsafe.As(ref Unsafe.Add(ref start, (nint)offset))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 LoadVector256(ref char start, nint offset) + => Unsafe.ReadUnaligned>(ref Unsafe.As(ref Unsafe.Add(ref start, offset))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 LoadVector256(ref char start, nuint offset) + => Unsafe.ReadUnaligned>(ref Unsafe.As(ref Unsafe.Add(ref start, (nint)offset))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ref char Add(ref char start, nuint offset) => ref Unsafe.Add(ref start, (nint)offset); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint FindFirstMatchedLane(Vector128 compareResult) + { + Debug.Assert(AdvSimd.Arm64.IsSupported); + + // Mask to help find the first lane in compareResult that is set. + // MSB 0x10 corresponds to 1st lane, 0x01 corresponds to 0th lane and so forth. + Vector128 mask = Vector128.Create((ushort)0x1001).AsByte(); + + // Find the first lane that is set inside compareResult. + Vector128 maskedSelectedLanes = AdvSimd.And(compareResult, mask); + Vector128 pairwiseSelectedLane = AdvSimd.Arm64.AddPairwise(maskedSelectedLanes, maskedSelectedLanes); + ulong selectedLanes = pairwiseSelectedLane.AsUInt64().ToScalar(); + + // It should be handled by compareResult != Vector.Zero + Debug.Assert(selectedLanes != 0); + + // Find the first lane that is set inside compareResult. + return (uint)BitOperations.TrailingZeroCount(selectedLanes) >> 2; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int FindFirstMatchedLane(Vector128 compareResult) + { + Debug.Assert(AdvSimd.Arm64.IsSupported); + + Vector128 pairwiseSelectedLane = AdvSimd.Arm64.AddPairwise(compareResult.AsByte(), compareResult.AsByte()); + ulong selectedLanes = pairwiseSelectedLane.AsUInt64().ToScalar(); + + // It should be handled by compareResult != Vector.Zero + Debug.Assert(selectedLanes != 0); + + return BitOperations.TrailingZeroCount(selectedLanes) >> 3; + } + + // Vector sub-search adapted from https://github.com/aspnet/KestrelHttpServer/pull/1138 + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int LocateLastFoundChar(Vector match) + { + var vector64 = Vector.AsVectorUInt64(match); + ulong candidate = 0; + int i = Vector.Count - 1; + + // This pattern is only unrolled by the Jit if the limit is Vector.Count + // As such, we need a dummy iteration variable for that condition to be satisfied + for (int j = 0; j < Vector.Count; j++) + { + candidate = vector64[i]; + if (candidate != 0) + { + break; + } + + i--; + } + + // Single LEA instruction with jitted const (using function result) + return i * 4 + LocateLastFoundChar(candidate); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int LocateLastFoundChar(ulong match) + => BitOperations.Log2(match) >> 4; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe nuint UnalignedCountVectorFromEnd(ref byte searchSpace, int length) + { + nint unaligned = (nint)Unsafe.AsPointer(ref searchSpace) & (Vector.Count - 1); + return (nuint)(uint)(((length & (Vector.Count - 1)) + unaligned) & (Vector.Count - 1)); + } + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs index 2cb5c28fef430..5a4f52b4e679c 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs @@ -1442,6 +1442,7 @@ internal static bool ContainsValueType(ref T searchSpace, T value, int length return false; } +#if !MONO [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int IndexOfChar(ref char searchSpace, char value, int length) => IndexOfValueType(ref Unsafe.As(ref searchSpace), (short)value, length); @@ -1453,6 +1454,7 @@ internal static int IndexOfValueType(ref T searchSpace, T value, int length) [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int IndexOfAnyExceptValueType(ref T searchSpace, T value, int length) where T : struct, INumber => IndexOfValueType>(ref searchSpace, value, length); +#endif [MethodImpl(MethodImplOptions.AggressiveOptimization)] private static int IndexOfValueType(ref TValue searchSpace, TValue value, int length) @@ -1567,6 +1569,7 @@ private static int IndexOfValueType(ref TValue searchSpace, TV return -1; } +#if !MONO [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int IndexOfAnyChar(ref char searchSpace, char value0, char value1, int length) => IndexOfAnyValueType(ref Unsafe.As(ref searchSpace), (short)value0, (short)value1, length); @@ -1578,6 +1581,7 @@ internal static int IndexOfAnyValueType(ref T searchSpace, T value0, T value1 [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int IndexOfAnyExceptValueType(ref T searchSpace, T value0, T value1, int length) where T : struct, INumber => IndexOfAnyValueType>(ref searchSpace, value0, value1, length); +#endif // having INumber constraint here allows to use == operator and get better perf compared to .Equals [MethodImpl(MethodImplOptions.AggressiveOptimization)] @@ -1716,6 +1720,7 @@ private static int IndexOfAnyValueType(ref TValue searchSpace, return -1; } +#if !MONO [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int IndexOfAnyValueType(ref T searchSpace, T value0, T value1, T value2, int length) where T : struct, INumber => IndexOfAnyValueType>(ref searchSpace, value0, value1, value2, length); @@ -1723,6 +1728,7 @@ internal static int IndexOfAnyValueType(ref T searchSpace, T value0, T value1 [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int IndexOfAnyExceptValueType(ref T searchSpace, T value0, T value1, T value2, int length) where T : struct, INumber => IndexOfAnyValueType>(ref searchSpace, value0, value1, value2, length); +#endif [MethodImpl(MethodImplOptions.AggressiveOptimization)] private static int IndexOfAnyValueType(ref TValue searchSpace, TValue value0, TValue value1, TValue value2, int length) @@ -1860,6 +1866,7 @@ private static int IndexOfAnyValueType(ref TValue searchSpace, return -1; } +#if !MONO [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int IndexOfAnyValueType(ref T searchSpace, T value0, T value1, T value2, T value3, int length) where T : struct, INumber => IndexOfAnyValueType>(ref searchSpace, value0, value1, value2, value3, length); @@ -1867,6 +1874,7 @@ internal static int IndexOfAnyValueType(ref T searchSpace, T value0, T value1 [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int IndexOfAnyExceptValueType(ref T searchSpace, T value0, T value1, T value2, T value3, int length) where T : struct, INumber => IndexOfAnyValueType>(ref searchSpace, value0, value1, value2, value3, length); +#endif [MethodImpl(MethodImplOptions.AggressiveOptimization)] private static int IndexOfAnyValueType(ref TValue searchSpace, TValue value0, TValue value1, TValue value2, TValue value3, int length) @@ -2093,6 +2101,7 @@ internal static int IndexOfAnyValueType(ref T searchSpace, T value0, T value1 return -1; } +#if !MONO [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int LastIndexOfValueType(ref T searchSpace, T value, int length) where T : struct, INumber => LastIndexOfValueType>(ref searchSpace, value, length); @@ -2100,6 +2109,7 @@ internal static int LastIndexOfValueType(ref T searchSpace, T value, int leng [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int LastIndexOfAnyExceptValueType(ref T searchSpace, T value, int length) where T : struct, INumber => LastIndexOfValueType>(ref searchSpace, value, length); +#endif [MethodImpl(MethodImplOptions.AggressiveOptimization)] private static int LastIndexOfValueType(ref TValue searchSpace, TValue value, int length) @@ -2211,6 +2221,7 @@ private static int LastIndexOfValueType(ref TValue searchSpace return -1; } +#if !MONO [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int LastIndexOfAnyValueType(ref T searchSpace, T value0, T value1, int length) where T : struct, INumber => LastIndexOfAnyValueType>(ref searchSpace, value0, value1, length); @@ -2218,6 +2229,7 @@ internal static int LastIndexOfAnyValueType(ref T searchSpace, T value0, T va [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int LastIndexOfAnyExceptValueType(ref T searchSpace, T value0, T value1, int length) where T : struct, INumber => LastIndexOfAnyValueType>(ref searchSpace, value0, value1, length); +#endif [MethodImpl(MethodImplOptions.AggressiveOptimization)] private static int LastIndexOfAnyValueType(ref TValue searchSpace, TValue value0, TValue value1, int length) @@ -2350,6 +2362,7 @@ private static int LastIndexOfAnyValueType(ref TValue searchSp return -1; } +#if !MONO [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int LastIndexOfAnyValueType(ref T searchSpace, T value0, T value1, T value2, int length) where T : struct, INumber => LastIndexOfAnyValueType>(ref searchSpace, value0, value1, value2, length); @@ -2357,6 +2370,7 @@ internal static int LastIndexOfAnyValueType(ref T searchSpace, T value0, T va [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int LastIndexOfAnyExceptValueType(ref T searchSpace, T value0, T value1, T value2, int length) where T : struct, INumber => LastIndexOfAnyValueType>(ref searchSpace, value0, value1, value2, length); +#endif [MethodImpl(MethodImplOptions.AggressiveOptimization)] private static int LastIndexOfAnyValueType(ref TValue searchSpace, TValue value0, TValue value1, TValue value2, int length) @@ -2490,6 +2504,7 @@ private static int LastIndexOfAnyValueType(ref TValue searchSp return -1; } +#if !MONO [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int LastIndexOfAnyValueType(ref T searchSpace, T value0, T value1, T value2, T value3, int length) where T : struct, INumber => LastIndexOfAnyValueType>(ref searchSpace, value0, value1, value2, value3, length); @@ -2497,6 +2512,7 @@ internal static int LastIndexOfAnyValueType(ref T searchSpace, T value0, T va [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int LastIndexOfAnyExceptValueType(ref T searchSpace, T value0, T value1, T value2, T value3, int length) where T : struct, INumber => LastIndexOfAnyValueType>(ref searchSpace, value0, value1, value2, value3, length); +#endif [MethodImpl(MethodImplOptions.AggressiveOptimization)] private static int LastIndexOfAnyValueType(ref TValue searchSpace, TValue value0, TValue value1, TValue value2, TValue value3, int length)