From 69a7473bbaede7be2c3ad2a518b3b892765a50a0 Mon Sep 17 00:00:00 2001 From: neon-sunset Date: Sat, 6 Jul 2024 09:25:23 +0300 Subject: [PATCH 01/11] Rewrite GetIndexOfFirstNonAsciiByte - Deduplicate SIMD paths - there is no more need for that - Remove at least one branch from short length path - Use up to 128x4/256x2 SIMD unrolling --- .../src/System/Text/ASCIIEncoding.cs | 3 +- .../src/System/Text/Ascii.Utility.cs | 700 ++++-------------- .../Text/Unicode/Utf8Utility.Validation.cs | 3 +- 3 files changed, 162 insertions(+), 544 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs index 99fec9d87362e6..360e7a08d1e2e8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs @@ -532,7 +532,8 @@ private protected sealed override unsafe int GetCharCountFast(byte* pBytes, int { // Unrecognized fallback mechanism - count bytes manually. - charCount = (int)Ascii.GetIndexOfFirstNonAsciiByte(pBytes, (uint)bytesLength); + // TODO: Migrate to byref + charCount = (int)Ascii.GetIndexOfFirstNonAsciiByte(ref Unsafe.AsRef(pBytes), (uint)bytesLength); } bytesConsumed = charCount; diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index aaeec394495fd8..ee8d423b06119e 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -53,29 +53,6 @@ private static bool AllCharsInUInt64AreAscii(ulong value) : AllCharsInUInt64AreAscii(value); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] - private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128 value, Vector128 bitmask) - { - if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian) - { - throw new PlatformNotSupportedException(); - } - - // extractedBits[i] = (value[i] >> 7) & (1 << (12 * (i % 2))); - Vector128 mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte(); - Vector128 extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitmask); - - // collapse mask to lower bits - extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); - ulong mask = extractedBits.AsUInt64().ToScalar(); - - // calculate the index - int index = BitOperations.TrailingZeroCount(mask) >> 2; - Debug.Assert((mask != 0) ? index < 16 : index >= 16); - return index; - } - /// /// Given a DWORD which represents two packed chars in machine-endian order, /// iff the first char (in machine-endian order) is ASCII. @@ -94,613 +71,252 @@ private static bool FirstCharInUInt32IsAscii(uint value) /// Returns if the buffer is empty or all-ASCII. /// /// An ASCII byte is defined as 0x00 - 0x7F, inclusive. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength) + internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint bufferLength) { - // If 256/512-bit aren't supported but SSE2 is supported, use those specific intrinsics instead of - // the generic vectorized code. This has two benefits: (a) we can take advantage of specific instructions - // like pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while - // this method is running. + uint found4; + nuint index = 0; - if (!Vector512.IsHardwareAccelerated && - !Vector256.IsHardwareAccelerated && - (Sse2.IsSupported || AdvSimd.IsSupported)) + if (bufferLength >= 8) { - return GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength); + goto Above7; } - else - { - // Handles Vector512, Vector256, Vector128, and scalar. - return GetIndexOfFirstNonAsciiByte_Vector(pBuffer, bufferLength); - } - } - - private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nuint bufferLength) - { - // Squirrel away the original buffer reference. This method works by determining the exact - // byte reference where non-ASCII data begins, so we need this base value to perform the - // final subtraction at the end of the method to get the index into the original buffer. - - byte* pOriginalBuffer = pBuffer; - - // Before we drain off byte-by-byte, try a generic vectorized loop. - // Only run the loop if we have at least two vectors we can pull out. - // Note use of SBYTE instead of BYTE below; we're using the two's-complement - // representation of negative integers to act as a surrogate for "is ASCII?". - - if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) - { - if (Vector512.Load(pBuffer).ExtractMostSignificantBits() == 0) - { - // The first several elements of the input buffer were ASCII. Bump up the pointer to the - // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII - // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. - - byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector512.Size; - pBuffer = (byte*)(((nuint)pBuffer + Vector512.Size) & ~(nuint)(Vector512.Size - 1)); - -#if DEBUG - long numBytesRead = pBuffer - pOriginalBuffer; - Debug.Assert(0 < numBytesRead && numBytesRead <= Vector512.Size, "We should've made forward progress of at least one byte."); - Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); -#endif - - Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); - - do - { - Debug.Assert((nuint)pBuffer % Vector512.Size == 0, "Vector read should be aligned."); - if (Vector512.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0) - { - break; // found non-ASCII data - } - - pBuffer += Vector512.Size; - } while (pBuffer <= pFinalVectorReadPos); - - // Adjust the remaining buffer length for the number of elements we just consumed. - - bufferLength -= (nuint)pBuffer; - bufferLength += (nuint)pOriginalBuffer; - } - } - else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256.Count) - { - if (Vector256.Load(pBuffer).ExtractMostSignificantBits() == 0) - { - // The first several elements of the input buffer were ASCII. Bump up the pointer to the - // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII - // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. - - byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector256.Size; - pBuffer = (byte*)(((nuint)pBuffer + Vector256.Size) & ~(nuint)(Vector256.Size - 1)); - -#if DEBUG - long numBytesRead = pBuffer - pOriginalBuffer; - Debug.Assert(0 < numBytesRead && numBytesRead <= Vector256.Size, "We should've made forward progress of at least one byte."); - Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); -#endif - - Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); - - do - { - Debug.Assert((nuint)pBuffer % Vector256.Size == 0, "Vector read should be aligned."); - if (Vector256.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0) - { - break; // found non-ASCII data - } - - pBuffer += Vector256.Size; - } while (pBuffer <= pFinalVectorReadPos); - - // Adjust the remaining buffer length for the number of elements we just consumed. - - bufferLength -= (nuint)pBuffer; - bufferLength += (nuint)pOriginalBuffer; - } - } - else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128.Count) - { - if (!VectorContainsNonAsciiChar(Vector128.Load(pBuffer))) - { - // The first several elements of the input buffer were ASCII. Bump up the pointer to the - // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII - // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. - - byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector128.Size; - pBuffer = (byte*)(((nuint)pBuffer + Vector128.Size) & ~(nuint)(Vector128.Size - 1)); - -#if DEBUG - long numBytesRead = pBuffer - pOriginalBuffer; - Debug.Assert(0 < numBytesRead && numBytesRead <= Vector128.Size, "We should've made forward progress of at least one byte."); - Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); -#endif - - Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); - - do - { - Debug.Assert((nuint)pBuffer % Vector128.Size == 0, "Vector read should be aligned."); - if (VectorContainsNonAsciiChar(Vector128.LoadAligned(pBuffer))) - { - break; // found non-ASCII data - } - - pBuffer += Vector128.Size; - } while (pBuffer <= pFinalVectorReadPos); - - // Adjust the remaining buffer length for the number of elements we just consumed. - - bufferLength -= (nuint)pBuffer; - bufferLength += (nuint)pOriginalBuffer; - } - } - - // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform - // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code - // path to drain any remaining ASCII bytes. - // - // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads. - // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII bytes. - - uint currentUInt32; - - // Try reading 64 bits at a time in a loop. - - for (; bufferLength >= 8; bufferLength -= 8) - { - currentUInt32 = Unsafe.ReadUnaligned(pBuffer); - uint nextUInt32 = Unsafe.ReadUnaligned(pBuffer + 4); - - if (!AllBytesInUInt32AreAscii(currentUInt32 | nextUInt32)) - { - // One of these two values contains non-ASCII bytes. - // Figure out which one it is, then put it in 'current' so that we can drain the ASCII bytes. - - if (AllBytesInUInt32AreAscii(currentUInt32)) - { - currentUInt32 = nextUInt32; - pBuffer += 4; - } - - goto FoundNonAsciiData; - } - - pBuffer += 8; // consumed 8 ASCII bytes - } - - // From this point forward we don't need to update bufferLength. - // Try reading 32 bits. if ((bufferLength & 4) != 0) { - currentUInt32 = Unsafe.ReadUnaligned(pBuffer); - if (!AllBytesInUInt32AreAscii(currentUInt32)) + uint test4 = Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, index)) & 0x80808080; + if (test4 != 0) { - goto FoundNonAsciiData; + found4 = test4; + goto Found1to7; } - - pBuffer += 4; + index += 4; } - // Try reading 16 bits. - if ((bufferLength & 2) != 0) { - currentUInt32 = Unsafe.ReadUnaligned(pBuffer); - if (!AllBytesInUInt32AreAscii(currentUInt32)) + uint test2 = (uint)Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, index)) & 0x8080; + if (test2 != 0) { - if (!BitConverter.IsLittleEndian) - { - currentUInt32 <<= 16; - } - goto FoundNonAsciiData; + found4 = test2; + goto Found1to7; } - - pBuffer += 2; + index += 2; } - // Try reading 8 bits - if ((bufferLength & 1) != 0) { - // If the buffer contains non-ASCII data, the comparison below will fail, and - // we'll end up not incrementing the buffer reference. - - if (*(sbyte*)pBuffer >= 0) + if (Unsafe.Add(ref pBuffer, index) < 0x80) { - pBuffer++; + index++; } } - Finish: - - nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer; - return totalNumBytesRead; - - FoundNonAsciiData: - - Debug.Assert(!AllBytesInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input."); - - // The method being called doesn't bother looking at whether the high byte is ASCII. There are only - // two scenarios: (a) either one of the earlier bytes is not ASCII and the search terminates before - // we get to the high byte; or (b) all of the earlier bytes are ASCII, so the high byte must be - // non-ASCII. In both cases we only care about the low 24 bits. - - pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentUInt32); - goto Finish; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool ContainsNonAsciiByte_Sse2(uint sseMask) - { - Debug.Assert(sseMask != uint.MaxValue); - Debug.Assert(Sse2.IsSupported); - return sseMask != 0; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool ContainsNonAsciiByte_AdvSimd(uint advSimdIndex) - { - Debug.Assert(advSimdIndex != uint.MaxValue); - Debug.Assert(AdvSimd.IsSupported); - return advSimdIndex < 16; - } - - private static unsafe nuint GetIndexOfFirstNonAsciiByte_Intrinsified(byte* pBuffer, nuint bufferLength) - { - // JIT turns the below into constants - - uint SizeOfVector128 = (uint)sizeof(Vector128); - nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1); - - Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported, "Sse2 or AdvSimd64 required."); - Debug.Assert(BitConverter.IsLittleEndian, "This SSE2/Arm64 implementation assumes little-endian."); - - Vector128 bitmask = BitConverter.IsLittleEndian ? - Vector128.Create((ushort)0x1001).AsByte() : - Vector128.Create((ushort)0x0110).AsByte(); - - uint currentSseMask = uint.MaxValue, secondSseMask = uint.MaxValue; - uint currentAdvSimdIndex = uint.MaxValue, secondAdvSimdIndex = uint.MaxValue; - byte* pOriginalBuffer = pBuffer; + Done: + return index; - // This method is written such that control generally flows top-to-bottom, avoiding - // jumps as much as possible in the optimistic case of a large enough buffer and - // "all ASCII". If we see non-ASCII data, we jump out of the hot paths to targets - // after all the main logic. + Found1to7: + index += uint.TrailingZeroCount(found4) / 8; + goto Done; - if (bufferLength < SizeOfVector128) + Above7: + ulong found8; + if (bufferLength <= 16) { - goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead - } + ulong first = Unsafe.ReadUnaligned(ref pBuffer); + ulong last = Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, bufferLength - 8)); - // Read the first vector unaligned. + first &= 0x8080808080808080; + last &= 0x8080808080808080; - if (Sse2.IsSupported) - { - currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load - if (ContainsNonAsciiByte_Sse2(currentSseMask)) + if (first != 0) { - goto FoundNonAsciiDataInCurrentChunk; + found8 = first; } - } - else if (AdvSimd.Arm64.IsSupported) - { - currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); // unaligned load - if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex)) + else if (last != 0) { - goto FoundNonAsciiDataInCurrentChunk; + found8 = last; + index = bufferLength - 8; } - } - else - { - throw new PlatformNotSupportedException(); - } - - // If we have less than 32 bytes to process, just go straight to the final unaligned - // read. There's no need to mess with the loop logic in the middle of this method. - - if (bufferLength < 2 * SizeOfVector128) - { - goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead; - } - - // Now adjust the read pointer so that future reads are aligned. - - pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector128) & ~(nuint)MaskOfAllBitsInVector128); - -#if DEBUG - long numBytesRead = pBuffer - pOriginalBuffer; - Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector128, "We should've made forward progress of at least one byte."); - Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); -#endif - - // Adjust the remaining length to account for what we just read. - - bufferLength += (nuint)pOriginalBuffer; - bufferLength -= (nuint)pBuffer; - - // The buffer is now properly aligned. - // Read 2 vectors at a time if possible. - - if (bufferLength >= 2 * SizeOfVector128) - { - byte* pFinalVectorReadPos = (byte*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128); - - // After this point, we no longer need to update the bufferLength value. - - do + else { - if (Sse2.IsSupported) - { - Vector128 firstVector = Sse2.LoadAlignedVector128(pBuffer); - Vector128 secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128); - - currentSseMask = (uint)Sse2.MoveMask(firstVector); - secondSseMask = (uint)Sse2.MoveMask(secondVector); - if (ContainsNonAsciiByte_Sse2(currentSseMask | secondSseMask)) - { - goto FoundNonAsciiDataInInnerLoop; - } - } - else if (AdvSimd.Arm64.IsSupported) - { - Vector128 firstVector = AdvSimd.LoadVector128(pBuffer); - Vector128 secondVector = AdvSimd.LoadVector128(pBuffer + SizeOfVector128); - - currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(firstVector, bitmask); - secondAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(secondVector, bitmask); - if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex) || ContainsNonAsciiByte_AdvSimd(secondAdvSimdIndex)) - { - goto FoundNonAsciiDataInInnerLoop; - } - } - else - { - throw new PlatformNotSupportedException(); - } + return bufferLength; + } - pBuffer += 2 * SizeOfVector128; - } while (pBuffer <= pFinalVectorReadPos); + return index += (nuint)(ulong.TrailingZeroCount(found8) / 8); } - // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from. - // Since the above loop doesn't update bufferLength, we can't rely on its absolute value. - // But we _can_ rely on it to tell us how much remaining data must be drained by looking - // at what bits of it are set. This works because had we updated it within the loop above, - // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about - // bits which are less significant than those that the addition would've acted on. - - // If there is fewer than one vector length remaining, skip the next aligned read. + return GetIndexOfFirstNonAsciiByte_Vector(ref pBuffer, bufferLength); + } - if ((bufferLength & SizeOfVector128) == 0) + [MethodImpl(MethodImplOptions.NoInlining)] + private static nuint GetIndexOfFirstNonAsciiByte_Vector(ref byte pBuffer, nuint bufferLength) + { + Debug.Assert(bufferLength >= 16); + if (bufferLength < 32) { - goto DoFinalUnalignedVectorRead; - } + Vector128 first = Vector128.GreaterThanOrEqual( + Vector128.LoadUnsafe(ref pBuffer), + Vector128.Create((byte)0x80)); + Vector128 last = Vector128.GreaterThanOrEqual( + Vector128.LoadUnsafe(ref pBuffer, bufferLength - 16), + Vector128.Create((byte)0x80)); - // At least one full vector's worth of data remains, so we can safely read it. - // Remember, at this point pBuffer is still aligned. + bool foundFirst = first != Vector128.Zero; + bool foundLast = last != Vector128.Zero; - if (Sse2.IsSupported) - { - currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer)); - if (ContainsNonAsciiByte_Sse2(currentSseMask)) + nuint offset; + Vector128 found; + if (foundFirst) { - goto FoundNonAsciiDataInCurrentChunk; + offset = 0; + found = first; + goto Found128; } - } - else if (AdvSimd.Arm64.IsSupported) - { - currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); - if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex)) + if (foundLast) { - goto FoundNonAsciiDataInCurrentChunk; + offset = bufferLength - 16; + found = last; + goto Found128; } - } - else - { - throw new PlatformNotSupportedException(); - } - - IncrementCurrentOffsetBeforeFinalUnalignedVectorRead: - pBuffer += SizeOfVector128; + return bufferLength; - DoFinalUnalignedVectorRead: + Found128: + return ComputeIndex(found) + offset; + } - if (((byte)bufferLength & MaskOfAllBitsInVector128) != 0) + if (bufferLength < 64) { - // Perform an unaligned read of the last vector. - // We need to adjust the pointer because we're re-reading data. - - pBuffer += (bufferLength & MaskOfAllBitsInVector128) - SizeOfVector128; - - if (Sse2.IsSupported) - { - currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load - if (ContainsNonAsciiByte_Sse2(currentSseMask)) - { - goto FoundNonAsciiDataInCurrentChunk; - } + Vector256 first = Vector256.LoadUnsafe(ref pBuffer); + Vector256 last = Vector256.LoadUnsafe(ref pBuffer, bufferLength - 32); - } - else if (AdvSimd.Arm64.IsSupported) + nuint offset; + Vector256 found; + if (Test256(first)) { - currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); // unaligned load - if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex)) - { - goto FoundNonAsciiDataInCurrentChunk; - } - + offset = 0; + found = first; + goto Found256; } - else + if (Test256(last)) { - throw new PlatformNotSupportedException(); + offset = bufferLength - 32; + found = last; + goto Found256; } - pBuffer += SizeOfVector128; - } - - Finish: - return (nuint)pBuffer - (nuint)pOriginalBuffer; // and we're done! - - FoundNonAsciiDataInInnerLoop: + return bufferLength; - // If the current (first) mask isn't the mask that contains non-ASCII data, then it must - // instead be the second mask. If so, skip the entire first mask and drain ASCII bytes - // from the second mask. - - if (Sse2.IsSupported) - { - if (!ContainsNonAsciiByte_Sse2(currentSseMask)) - { - pBuffer += SizeOfVector128; - currentSseMask = secondSseMask; - } + Found256: + found = Vector256.GreaterThanOrEqual(found, Vector256.Create((byte)0x80)); + return ComputeIndex(found) + offset; } - else if (AdvSimd.IsSupported) + + ref byte pCurrent = ref pBuffer; + ref byte pLast = ref Unsafe.Add(ref pBuffer, bufferLength - 64); + Vector512 found512; + do { - if (!ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex)) + Vector512 current512 = Vector512.LoadUnsafe(ref pCurrent); + if (Test512(current512)) { - pBuffer += SizeOfVector128; - currentAdvSimdIndex = secondAdvSimdIndex; + found512 = current512; + goto Found512; } - } - else - { - throw new PlatformNotSupportedException(); - } - FoundNonAsciiDataInCurrentChunk: + pCurrent = ref Unsafe.Add(ref pCurrent, 64); + } while (!Unsafe.IsAddressGreaterThan(ref pCurrent, ref pLast)); - if (Sse2.IsSupported) - { - // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte. - // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't - // available, we'll fall back to a normal loop. - Debug.Assert(ContainsNonAsciiByte_Sse2(currentSseMask), "Shouldn't be here unless we see non-ASCII data."); - pBuffer += (uint)BitOperations.TrailingZeroCount(currentSseMask); - } - else if (AdvSimd.Arm64.IsSupported) + Vector512 last512 = Vector512.LoadUnsafe(ref pLast); + if (Test512(last512)) { - Debug.Assert(ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex), "Shouldn't be here unless we see non-ASCII data."); - pBuffer += currentAdvSimdIndex; - } - else - { - throw new PlatformNotSupportedException(); + pCurrent = ref pLast; + found512 = last512; + goto Found512; } - goto Finish; + return bufferLength; - FoundNonAsciiDataInCurrentDWord: + Found512: + found512 = Vector512.GreaterThanOrEqual(found512, Vector512.Create((byte)0x80)); + return ComputeIndex(found512) + (nuint)Unsafe.ByteOffset(ref pBuffer, ref pCurrent); - uint currentDWord; - Debug.Assert(!AllBytesInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data."); - pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentDWord); - - goto Finish; - - InputBufferLessThanOneVectorInLength: - - // These code paths get hit if the original input length was less than one vector in size. - // We can't perform vectorized reads at this point, so we'll fall back to reading primitives - // directly. Note that all of these reads are unaligned. - - Debug.Assert(bufferLength < SizeOfVector128); - - // QWORD drain - - if ((bufferLength & 8) != 0) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static bool Test256(Vector256 value) { - if (UIntPtr.Size == sizeof(ulong)) - { - // If we can use 64-bit tzcnt to count the number of leading ASCII bytes, prefer it. - - ulong candidateUInt64 = Unsafe.ReadUnaligned(pBuffer); - if (!AllBytesInUInt64AreAscii(candidateUInt64)) - { - // Clear everything but the high bit of each byte, then tzcnt. - // Remember to divide by 8 at the end to convert bit count to byte count. - - candidateUInt64 &= UInt64HighBitsOnlyMask; - pBuffer += (nuint)(BitOperations.TrailingZeroCount(candidateUInt64) >> 3); - goto Finish; - } - } - else + if (Vector256.IsHardwareAccelerated) { - // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead. - - currentDWord = Unsafe.ReadUnaligned(pBuffer); - uint nextDWord = Unsafe.ReadUnaligned(pBuffer + 4); - - if (!AllBytesInUInt32AreAscii(currentDWord | nextDWord)) - { - // At least one of the values wasn't all-ASCII. - // We need to figure out which one it was and stick it in the currentMask local. - - if (AllBytesInUInt32AreAscii(currentDWord)) - { - currentDWord = nextDWord; // this one is the culprit - pBuffer += 4; - } - - goto FoundNonAsciiDataInCurrentDWord; - } + return (value & Vector256.Create((byte)0x80)) != Vector256.Zero; } - pBuffer += 8; // successfully consumed 8 ASCII bytes + return ((value.GetLower() | value.GetUpper()) & Vector128.Create((byte)0x80)) != Vector128.Zero; } - // DWORD drain - - if ((bufferLength & 4) != 0) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static bool Test512(Vector512 value) { - currentDWord = Unsafe.ReadUnaligned(pBuffer); + if (Vector512.IsHardwareAccelerated) + { + return (value & Vector512.Create((byte)0x80)) != Vector512.Zero; + } - if (!AllBytesInUInt32AreAscii(currentDWord)) + if (Vector256.IsHardwareAccelerated) { - goto FoundNonAsciiDataInCurrentDWord; + return ((value.GetLower() | value.GetUpper()) & Vector256.Create((byte)0x80)) != Vector256.Zero; } - pBuffer += 4; // successfully consumed 4 ASCII bytes + Vector128 reduced = ( + value.GetLower().GetLower() | + value.GetLower().GetUpper() | + value.GetUpper().GetLower() | + value.GetUpper().GetUpper()) & Vector128.Create((byte)0x80); + return reduced != Vector128.Zero; } + } - // WORD drain - // (We movzx to a DWORD for ease of manipulation.) - - if ((bufferLength & 2) != 0) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static nuint ComputeIndex(Vector128 value) + { + if (AdvSimd.IsSupported) { - currentDWord = Unsafe.ReadUnaligned(pBuffer); - - if (!AllBytesInUInt32AreAscii(currentDWord)) - { - // We only care about the 0x0080 bit of the value. If it's not set, then we - // increment currentOffset by 1. If it's set, we don't increment it at all. + ulong shrn = AdvSimd + .ShiftRightLogicalNarrowingLower(value.AsUInt16(), 4) + .AsUInt64() + .ToScalar(); + return (nuint)(ulong.TrailingZeroCount(shrn) / 4); + } - pBuffer += (nuint)((nint)(sbyte)currentDWord >> 7) + 1; - goto Finish; - } + return nuint.TrailingZeroCount(value.ExtractMostSignificantBits()); + } - pBuffer += 2; // successfully consumed 2 ASCII bytes + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static nuint ComputeIndex(Vector256 value) + { + if (!Vector256.IsHardwareAccelerated) + { + Vector128 lower = value.GetLower(); + Vector128 upper = value.GetUpper(); + return lower != Vector128.Zero + ? ComputeIndex(lower) + : ComputeIndex(upper) + 16; } - // BYTE drain + return nuint.TrailingZeroCount(value.ExtractMostSignificantBits()); + } - if ((bufferLength & 1) != 0) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static nuint ComputeIndex(Vector512 value) + { + if (!Vector512.IsHardwareAccelerated) { - // sbyte has non-negative value if byte is ASCII. - - if (*(sbyte*)(pBuffer) >= 0) - { - pBuffer++; // successfully consumed a single byte - } + Vector256 lower = value.GetLower(); + Vector256 upper = value.GetUpper(); + return lower != Vector256.Zero + ? ComputeIndex(lower) + : ComputeIndex(upper) + 32; } - goto Finish; + return (nuint)ulong.TrailingZeroCount(value.ExtractMostSignificantBits()); } /// diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index a542dad72b5c33..b4558be5d3116d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -26,8 +26,9 @@ internal static unsafe partial class Utf8Utility Debug.Assert(inputLength >= 0, "Input length must not be negative."); Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null."); + // TODO: Migrate to byref // First, try to drain off as many ASCII bytes as we can from the beginning. - nuint numAsciiBytesCounted = Ascii.GetIndexOfFirstNonAsciiByte(pInputBuffer, (uint)inputLength); + nuint numAsciiBytesCounted = Ascii.GetIndexOfFirstNonAsciiByte(ref Unsafe.AsRef(pInputBuffer), (uint)inputLength); pInputBuffer += numAsciiBytesCounted; // Quick check - did we just end up consuming the entire input buffer? From f04b90ce16372036a35a90e0821bfdf2f83deec6 Mon Sep 17 00:00:00 2001 From: neon-sunset Date: Sat, 6 Jul 2024 09:53:43 +0300 Subject: [PATCH 02/11] Improve lengths 32 and 64 --- .../System.Private.CoreLib/src/System/Text/Ascii.Utility.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index ee8d423b06119e..c60e9a6ac3bb49 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -152,7 +152,7 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer private static nuint GetIndexOfFirstNonAsciiByte_Vector(ref byte pBuffer, nuint bufferLength) { Debug.Assert(bufferLength >= 16); - if (bufferLength < 32) + if (bufferLength <= 32) { Vector128 first = Vector128.GreaterThanOrEqual( Vector128.LoadUnsafe(ref pBuffer), @@ -185,7 +185,7 @@ private static nuint GetIndexOfFirstNonAsciiByte_Vector(ref byte pBuffer, nuint return ComputeIndex(found) + offset; } - if (bufferLength < 64) + if (bufferLength <= 64) { Vector256 first = Vector256.LoadUnsafe(ref pBuffer); Vector256 last = Vector256.LoadUnsafe(ref pBuffer, bufferLength - 32); From 20336923dac98cf0fe0f99709cdb15b6cbad722a Mon Sep 17 00:00:00 2001 From: neon-sunset Date: Sun, 7 Jul 2024 21:17:09 +0300 Subject: [PATCH 03/11] Reduce scalar path code size, use cheaper instructions on x86, align 512b loop, mitigate Dynamic PGO impact --- .../src/System/Text/Ascii.Utility.cs | 159 ++++++++++-------- 1 file changed, 91 insertions(+), 68 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index c60e9a6ac3bb49..258258746f9eb4 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -73,52 +73,45 @@ private static bool FirstCharInUInt32IsAscii(uint value) /// An ASCII byte is defined as 0x00 - 0x7F, inclusive. internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint bufferLength) { - uint found4; nuint index = 0; - - if (bufferLength >= 8) + if (bufferLength < 8) { - goto Above7; - } - - if ((bufferLength & 4) != 0) - { - uint test4 = Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, index)) & 0x80808080; - if (test4 != 0) + uint found4; + if ((bufferLength & 4) != 0) { - found4 = test4; - goto Found1to7; + uint test4 = Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, index)) & 0x80808080; + if (test4 != 0) + { + found4 = test4; + goto Found1to7; + } + index = 4; } - index += 4; - } - - if ((bufferLength & 2) != 0) - { - uint test2 = (uint)Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, index)) & 0x8080; - if (test2 != 0) + if ((bufferLength & 2) != 0) { - found4 = test2; - goto Found1to7; + uint test2 = (uint)Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, index)) & 0x8080; + if (test2 != 0) + { + found4 = test2; + goto Found1to7; + } + index += 2; } - index += 2; - } - - if ((bufferLength & 1) != 0) - { - if (Unsafe.Add(ref pBuffer, index) < 0x80) + if ((bufferLength & 1) != 0) { - index++; + if (Unsafe.Add(ref pBuffer, index) < 0x80) + { + index++; + } } - } - Done: - return index; + goto Done; - Found1to7: - index += uint.TrailingZeroCount(found4) / 8; - goto Done; + Found1to7: + index += uint.TrailingZeroCount(found4) / 8; + goto Done; + } - Above7: ulong found8; if (bufferLength <= 16) { @@ -139,30 +132,33 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer } else { - return bufferLength; + index = bufferLength; + goto Done; } - return index += (nuint)(ulong.TrailingZeroCount(found8) / 8); + return index + (nuint)(ulong.TrailingZeroCount(found8) / 8); } return GetIndexOfFirstNonAsciiByte_Vector(ref pBuffer, bufferLength); + + Done: + return index; } - [MethodImpl(MethodImplOptions.NoInlining)] - private static nuint GetIndexOfFirstNonAsciiByte_Vector(ref byte pBuffer, nuint bufferLength) + // Another case where we want to bypass Dynamic PGO - it is very easy to hit + // a condition where a profile with longer path blocks marked as cold leads + // to TestXXX helpers prevented from being inlined, which is disastrous for performance. + [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)] + private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(ref byte pBuffer, nuint bufferLength) { Debug.Assert(bufferLength >= 16); if (bufferLength <= 32) { - Vector128 first = Vector128.GreaterThanOrEqual( - Vector128.LoadUnsafe(ref pBuffer), - Vector128.Create((byte)0x80)); - Vector128 last = Vector128.GreaterThanOrEqual( - Vector128.LoadUnsafe(ref pBuffer, bufferLength - 16), - Vector128.Create((byte)0x80)); + Vector128 first = Vector128.LoadUnsafe(ref pBuffer); + Vector128 last = Vector128.LoadUnsafe(ref pBuffer, bufferLength - 16); - bool foundFirst = first != Vector128.Zero; - bool foundLast = last != Vector128.Zero; + bool foundFirst = Test128(first); + bool foundLast = Test128(last); nuint offset; Vector128 found; @@ -182,6 +178,7 @@ private static nuint GetIndexOfFirstNonAsciiByte_Vector(ref byte pBuffer, nuint return bufferLength; Found128: + found = Vector128.GreaterThanOrEqual(found, Vector128.Create((byte)0x80)); return ComputeIndex(found) + offset; } @@ -212,20 +209,40 @@ private static nuint GetIndexOfFirstNonAsciiByte_Vector(ref byte pBuffer, nuint return ComputeIndex(found) + offset; } + Vector512 found512; ref byte pCurrent = ref pBuffer; ref byte pLast = ref Unsafe.Add(ref pBuffer, bufferLength - 64); - Vector512 found512; - do + + // SAFETY: We only use the value of .AsPointer to calculate the byte offset + // to the next 64B boundary. Should GC relocate the buffer, we will lose the + // alignment which is an acceptable trade-off over pinning the buffer, once + // the callers of this method are updated to use byrefs. + nuint toAlign = (nuint)Unsafe.AsPointer(ref pCurrent) % 64; + if (toAlign != 0) { - Vector512 current512 = Vector512.LoadUnsafe(ref pCurrent); - if (Test512(current512)) + Vector512 first512 = Vector512.LoadUnsafe(ref pCurrent); + if (Test512(first512)) { - found512 = current512; + found512 = first512; goto Found512; } - pCurrent = ref Unsafe.Add(ref pCurrent, 64); - } while (!Unsafe.IsAddressGreaterThan(ref pCurrent, ref pLast)); + pCurrent = ref Unsafe.Add(ref pCurrent, toAlign); + } + + while (!Unsafe.IsAddressGreaterThan(ref pCurrent, ref pLast)) + { + Vector512 current512 = Vector512.LoadUnsafe(ref pCurrent); + if (!Test512(current512)) + { + // Better loop ordering + pCurrent = ref Unsafe.Add(ref pCurrent, 64); + continue; + } + + found512 = current512; + goto Found512; + } Vector512 last512 = Vector512.LoadUnsafe(ref pLast); if (Test512(last512)) @@ -242,35 +259,41 @@ private static nuint GetIndexOfFirstNonAsciiByte_Vector(ref byte pBuffer, nuint return ComputeIndex(found512) + (nuint)Unsafe.ByteOffset(ref pBuffer, ref pCurrent); [MethodImpl(MethodImplOptions.AggressiveInlining)] - static bool Test256(Vector256 value) + static bool Test128(Vector128 value) { - if (Vector256.IsHardwareAccelerated) + if (Sse2.IsSupported) { - return (value & Vector256.Create((byte)0x80)) != Vector256.Zero; + return value.ExtractMostSignificantBits() != 0; } - return ((value.GetLower() | value.GetUpper()) & Vector128.Create((byte)0x80)) != Vector128.Zero; + return Vector128.GreaterThanOrEqualAny(value, Vector128.Create((byte)0x80)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - static bool Test512(Vector512 value) + static bool Test256(Vector256 value) { - if (Vector512.IsHardwareAccelerated) + if (Avx.IsSupported) { - return (value & Vector512.Create((byte)0x80)) != Vector512.Zero; + return value.ExtractMostSignificantBits() != 0; } if (Vector256.IsHardwareAccelerated) { - return ((value.GetLower() | value.GetUpper()) & Vector256.Create((byte)0x80)) != Vector256.Zero; + return Vector256.GreaterThanOrEqualAny(value, Vector256.Create((byte)0x80)); + } + + return Test128(value.GetLower() | value.GetUpper()); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static bool Test512(Vector512 value) + { + if (Avx512F.IsSupported) + { + return value.ExtractMostSignificantBits() != 0; } - Vector128 reduced = ( - value.GetLower().GetLower() | - value.GetLower().GetUpper() | - value.GetUpper().GetLower() | - value.GetUpper().GetUpper()) & Vector128.Create((byte)0x80); - return reduced != Vector128.Zero; + return Test256(value.GetLower() | value.GetUpper()); } } From 3e3e5274335e92c74eb0fff8c5a640a510a2edb6 Mon Sep 17 00:00:00 2001 From: neon-sunset Date: Mon, 15 Jul 2024 14:32:46 +0300 Subject: [PATCH 04/11] Address feedback, add scalar and big endian fallback --- .../src/System/Text/Ascii.Utility.cs | 430 +++++++++++++----- 1 file changed, 311 insertions(+), 119 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 258258746f9eb4..13c875f2c00de9 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -139,207 +139,399 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer return index + (nuint)(ulong.TrailingZeroCount(found8) / 8); } - return GetIndexOfFirstNonAsciiByte_Vector(ref pBuffer, bufferLength); + if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian) + { + // TODO: Handle length < 32 here. + if (Vector256.IsHardwareAccelerated) + { + if (bufferLength <= (nuint)Vector256.Count) + { + return Search16To32(ref pBuffer, bufferLength); + } + if (Vector512.IsHardwareAccelerated) + { + if (bufferLength <= (nuint)Vector512.Count) + { + return Search32To64(ref pBuffer, bufferLength); + } + + // This is as wide as newest CPU cores can go at the time of writing. + return GetIndexOfFirstNonAsciiByte_Unrolled2>(ref pBuffer, bufferLength); + } + + return GetIndexOfFirstNonAsciiByte_Unrolled4>(ref pBuffer, bufferLength); + } + + return GetIndexOfFirstNonAsciiByte_Unrolled4>(ref pBuffer, bufferLength); + } + else + { + return GetIndexOfFirstNonAsciiByte_Scalar(ref pBuffer, bufferLength); + } Done: return index; - } - // Another case where we want to bypass Dynamic PGO - it is very easy to hit - // a condition where a profile with longer path blocks marked as cold leads - // to TestXXX helpers prevented from being inlined, which is disastrous for performance. - [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)] - private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(ref byte pBuffer, nuint bufferLength) - { - Debug.Assert(bufferLength >= 16); - if (bufferLength <= 32) + static nuint Search16To32(ref byte pBuffer, nuint bufferLength) { + Debug.Assert(bufferLength is > 16 and <= 32); + Debug.Assert(Vector128.IsHardwareAccelerated); + + Vector128 mask = Vector128.Create((byte)0x80); Vector128 first = Vector128.LoadUnsafe(ref pBuffer); Vector128 last = Vector128.LoadUnsafe(ref pBuffer, bufferLength - 16); - bool foundFirst = Test128(first); - bool foundLast = Test128(last); + bool foundFirst = Vector128.GreaterThanOrEqualAny(first, mask); + bool foundLast = Vector128.GreaterThanOrEqualAny(last, mask); - nuint offset; - Vector128 found; - if (foundFirst) + return (foundFirst, foundLast) switch { - offset = 0; - found = first; - goto Found128; - } - if (foundLast) + (true, _) => IndexOfWhereAllBitsSet(Vector128.GreaterThanOrEqual(first, mask)), + (_, true) => IndexOfWhereAllBitsSet(Vector128.GreaterThanOrEqual(last, mask)) + bufferLength - 16, + _ => bufferLength, + }; + } + + static nuint Search32To64(ref byte pBuffer, nuint bufferLength) + { + Debug.Assert(bufferLength is > 32 and <= 64); + Debug.Assert(Vector256.IsHardwareAccelerated); + + Vector256 mask = Vector256.Create((byte)0x80); + Vector256 first = Vector256.LoadUnsafe(ref pBuffer); + Vector256 last = Vector256.LoadUnsafe(ref pBuffer, bufferLength - 32); + + bool foundFirst = Vector256.GreaterThanOrEqualAny(first, mask); + bool foundLast = Vector256.GreaterThanOrEqualAny(last, mask); + + return (foundFirst, foundLast) switch { - offset = bufferLength - 16; - found = last; - goto Found128; + (true, _) => IndexOfWhereAllBitsSet(Vector256.GreaterThanOrEqual(first, mask)), + (_, true) => IndexOfWhereAllBitsSet(Vector256.GreaterThanOrEqual(last, mask)) + bufferLength - 32, + _ => bufferLength, + }; + } + } + + private static unsafe nuint GetIndexOfFirstNonAsciiByte_Scalar(ref byte pBuffer, nuint bufferLength) + { + Debug.Assert(bufferLength > 16); + Debug.Assert(!Vector128.IsHardwareAccelerated || !BitConverter.IsLittleEndian); + + ulong value; + nuint offset = 0; + const ulong mask = 0x8080808080808080; + + nuint align = (nuint)Unsafe.AsPointer(ref pBuffer) % 8; + if (align != 0) + { + value = Unsafe.ReadUnaligned(ref pBuffer) & mask; + if (value != 0) + { + goto Found; } + offset += 8 - align; + } - return bufferLength; + nuint last = bufferLength - 8; + while (offset <= last) + { + value = Unsafe.As(ref Unsafe.Add(ref pBuffer, offset)) & mask; + if (value != 0) + { + goto Found; + } + offset += 8; + } - Found128: - found = Vector128.GreaterThanOrEqual(found, Vector128.Create((byte)0x80)); - return ComputeIndex(found) + offset; + value = Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, last)) & mask; + if (value != 0) + { + offset = last; + goto Found; } - if (bufferLength <= 64) + return bufferLength; + + Found: + return offset + (BitConverter.IsLittleEndian + ? (nuint)BitOperations.TrailingZeroCount(value) / 8 + : (nuint)BitOperations.LeadingZeroCount(value) / 8); + } + + // Another case where we want to bypass Dynamic PGO - it is very easy to hit + // a condition where a profile with longer path blocks marked as cold leads + // to TestXXX helpers prevented from being inlined, which is disastrous for performance. + [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)] + private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled2(ref byte pBuffer, nuint bufferLength) + where T : ISimdVector + { + Debug.Assert(T.IsHardwareAccelerated); + Debug.Assert(BitConverter.IsLittleEndian); + Debug.Assert(bufferLength > (nuint)T.Count); + + nuint offset = 0; + nuint width = (nuint)T.Count; + + T v0, v1; + + if (bufferLength <= width * 2) { - Vector256 first = Vector256.LoadUnsafe(ref pBuffer); - Vector256 last = Vector256.LoadUnsafe(ref pBuffer, bufferLength - 32); + v0 = T.LoadUnsafe(ref pBuffer); + v1 = T.LoadUnsafe(ref pBuffer, bufferLength - width); - nuint offset; - Vector256 found; - if (Test256(first)) + bool foundFirst = T.GreaterThanOrEqualAny(v0, T.Create(0x80)); + bool foundLast = T.GreaterThanOrEqualAny(v1, T.Create(0x80)); + + if (foundFirst) { - offset = 0; - found = first; - goto Found256; + v1 = v0; + goto FoundV1; } - if (Test256(last)) + if (foundLast) { - offset = bufferLength - 32; - found = last; - goto Found256; + offset = bufferLength - width; + goto FoundV1; } return bufferLength; - - Found256: - found = Vector256.GreaterThanOrEqual(found, Vector256.Create((byte)0x80)); - return ComputeIndex(found) + offset; } - Vector512 found512; - ref byte pCurrent = ref pBuffer; - ref byte pLast = ref Unsafe.Add(ref pBuffer, bufferLength - 64); - // SAFETY: We only use the value of .AsPointer to calculate the byte offset - // to the next 64B boundary. Should GC relocate the buffer, we will lose the + // to the next vector boundary. Should GC relocate the buffer, we will lose the // alignment which is an acceptable trade-off over pinning the buffer, once // the callers of this method are updated to use byrefs. - nuint toAlign = (nuint)Unsafe.AsPointer(ref pCurrent) % 64; - if (toAlign != 0) + nuint align = (nuint)Unsafe.AsPointer(ref pBuffer) % width; + if (align != 0) { - Vector512 first512 = Vector512.LoadUnsafe(ref pCurrent); - if (Test512(first512)) + v1 = T.LoadUnsafe(ref pBuffer); + if (T.GreaterThanOrEqualAny(v1, T.Create(0x80))) { - found512 = first512; - goto Found512; + goto FoundV1; } - - pCurrent = ref Unsafe.Add(ref pCurrent, toAlign); + offset += align; } - while (!Unsafe.IsAddressGreaterThan(ref pCurrent, ref pLast)) + nuint last = bufferLength - width * 2; + while (offset <= last) { - Vector512 current512 = Vector512.LoadUnsafe(ref pCurrent); - if (!Test512(current512)) + v0 = T.LoadUnsafe(ref pBuffer, offset); + v1 = T.LoadUnsafe(ref pBuffer, offset + width); + if (!Test2(v0, v1)) { - // Better loop ordering - pCurrent = ref Unsafe.Add(ref pCurrent, 64); + offset += width * 2; continue; } - found512 = current512; - goto Found512; + goto FoundV01; } - Vector512 last512 = Vector512.LoadUnsafe(ref pLast); - if (Test512(last512)) + v0 = T.LoadUnsafe(ref pBuffer, last); + v1 = T.LoadUnsafe(ref pBuffer, last + width); + if (Test2(v0, v1)) { - pCurrent = ref pLast; - found512 = last512; - goto Found512; + offset = last; + goto FoundV01; } return bufferLength; - Found512: - found512 = Vector512.GreaterThanOrEqual(found512, Vector512.Create((byte)0x80)); - return ComputeIndex(found512) + (nuint)Unsafe.ByteOffset(ref pBuffer, ref pCurrent); + FoundV01: + nuint index = IndexOfWhereAllBitsSet( + T.GreaterThanOrEqual(v0, T.Create(0x80))); + + if (index < width) + { + return index + offset; + } + offset += width; + + FoundV1: + return offset + IndexOfWhereAllBitsSet( + T.GreaterThanOrEqual(v1, T.Create(0x80))); [MethodImpl(MethodImplOptions.AggressiveInlining)] - static bool Test128(Vector128 value) + static bool Test2(T v0, T v1) + { + return ((v0 | v1) & T.Create(0x80)) != T.Zero; + } + } + + [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)] + private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled4(ref byte pBuffer, nuint bufferLength) + where T : ISimdVector + { + Debug.Assert(T.IsHardwareAccelerated); + Debug.Assert(BitConverter.IsLittleEndian); + Debug.Assert(bufferLength > (nuint)T.Count); + + nuint offset = 0; + nuint width = (nuint)T.Count; + + T v0, v1, v2, v3; + + if (bufferLength <= width * 2) { - if (Sse2.IsSupported) + v0 = T.LoadUnsafe(ref pBuffer); + v1 = T.LoadUnsafe(ref pBuffer, bufferLength - width); + + bool foundFirst = T.GreaterThanOrEqualAny(v0, T.Create(0x80)); + bool foundLast = T.GreaterThanOrEqualAny(v1, T.Create(0x80)); + + if (foundFirst) { - return value.ExtractMostSignificantBits() != 0; + goto FoundV0; + } + if (foundLast) + { + offset = bufferLength - width; + v0 = v1; + goto FoundV0; } - return Vector128.GreaterThanOrEqualAny(value, Vector128.Create((byte)0x80)); + return bufferLength; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static bool Test256(Vector256 value) + if (bufferLength <= width * 4) { - if (Avx.IsSupported) + // SAFETY: Use byref arithmetic over offset-based load to ensure + // that the last two vectors are loaded with a single LDP on ARM64. + ref byte pLastTwo = ref Unsafe.Add(ref pBuffer, bufferLength - width * 2); + v0 = T.LoadUnsafe(ref pBuffer); + v1 = T.LoadUnsafe(ref pBuffer, width); + v2 = T.LoadUnsafe(ref pLastTwo); + v3 = T.LoadUnsafe(ref pLastTwo, width); + + T pair0 = v0 | v1; + T pair1 = v2 | v3; + + if ((pair0 & T.Create(0x80)) != T.Zero) { - return value.ExtractMostSignificantBits() != 0; + v2 = v0; + v3 = v1; + goto FoundV23; } - - if (Vector256.IsHardwareAccelerated) + if ((pair1 & T.Create(0x80)) != T.Zero) { - return Vector256.GreaterThanOrEqualAny(value, Vector256.Create((byte)0x80)); + offset = bufferLength - (width * 2); + goto FoundV23; } - return Test128(value.GetLower() | value.GetUpper()); + return bufferLength; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static bool Test512(Vector512 value) + // SAFETY: We only use the value of .AsPointer to calculate the byte offset + // to the next vector boundary. Should GC relocate the buffer, we will lose the + // alignment which is an acceptable trade-off over pinning the buffer, once + // the callers of this method are updated to use byrefs. + nuint align = (nuint)Unsafe.AsPointer(ref pBuffer) % width; + if (align != 0) { - if (Avx512F.IsSupported) + v0 = T.LoadUnsafe(ref pBuffer); + if (T.GreaterThanOrEqualAny(v0, T.Create(0x80))) { - return value.ExtractMostSignificantBits() != 0; + goto FoundV0; } + offset += align; + } - return Test256(value.GetLower() | value.GetUpper()); + nuint last = bufferLength - width * 4; + while (offset <= last) + { + (v0, v1, v2, v3) = Load4(ref pBuffer, offset); + if (!Test4(v0, v1, v2, v3)) + { + offset += width * 4; + continue; + } + + goto FoundV0123; } - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static nuint ComputeIndex(Vector128 value) - { - if (AdvSimd.IsSupported) + (v0, v1, v2, v3) = Load4(ref pBuffer, last); + if (Test4(v0, v1, v2, v3)) { - ulong shrn = AdvSimd - .ShiftRightLogicalNarrowingLower(value.AsUInt16(), 4) - .AsUInt64() - .ToScalar(); - return (nuint)(ulong.TrailingZeroCount(shrn) / 4); + offset = last; + goto FoundV0123; } - return nuint.TrailingZeroCount(value.ExtractMostSignificantBits()); - } + return bufferLength; - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static nuint ComputeIndex(Vector256 value) - { - if (!Vector256.IsHardwareAccelerated) + FoundV0123: + nuint index = IndexOfWhereAllBitsSet2( + T.GreaterThanOrEqual(v0, T.Create(0x80)), + T.GreaterThanOrEqual(v1, T.Create(0x80))); + + if (index < width * 2) + { + return index + offset; + } + offset += width * 2; + + FoundV23: + return offset + IndexOfWhereAllBitsSet2( + T.GreaterThanOrEqual(v2, T.Create(0x80)), + T.GreaterThanOrEqual(v3, T.Create(0x80))); + + FoundV0: + return offset + IndexOfWhereAllBitsSet( + T.GreaterThanOrEqual(v0, T.Create(0x80))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static (T, T, T, T) Load4(ref byte pBuffer, nuint offset) { - Vector128 lower = value.GetLower(); - Vector128 upper = value.GetUpper(); - return lower != Vector128.Zero - ? ComputeIndex(lower) - : ComputeIndex(upper) + 16; + // Note: AdvSimd.Arm64.Load4xVector128 does not appear to work + // for the pointer-based implementation, likely due to the interaction + // with generics breaking consecutive register allocation. + ref byte pCurrent = ref Unsafe.Add(ref pBuffer, offset); + return ( + T.LoadUnsafe(ref pCurrent), + T.LoadUnsafe(ref pCurrent, (nuint)T.Count), + T.LoadUnsafe(ref pCurrent, (nuint)T.Count * 2), + T.LoadUnsafe(ref pCurrent, (nuint)T.Count * 3)); } - return nuint.TrailingZeroCount(value.ExtractMostSignificantBits()); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static bool Test4(T v0, T v1, T v2, T v3) + { + return ((v0 | v1 | v2 | v3) & T.Create(0x80)) != T.Zero; + } } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static nuint ComputeIndex(Vector512 value) + private static nuint IndexOfWhereAllBitsSet(T value) + where T : ISimdVector { - if (!Vector512.IsHardwareAccelerated) + return value switch + { + Vector128 v128 => AdvSimd.IsSupported + ? IndexOfWhereAllBitsSet_Arm64(v128) + : nuint.TrailingZeroCount(v128.ExtractMostSignificantBits()), + Vector256 v256 => nuint.TrailingZeroCount(v256.ExtractMostSignificantBits()), + Vector512 v512 => (nuint)ulong.TrailingZeroCount(v512.ExtractMostSignificantBits()), + _ => throw new NotSupportedException(), + }; + + [CompExactlyDependsOn(typeof(AdvSimd))] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static nuint IndexOfWhereAllBitsSet_Arm64(Vector128 value) { - Vector256 lower = value.GetLower(); - Vector256 upper = value.GetUpper(); - return lower != Vector256.Zero - ? ComputeIndex(lower) - : ComputeIndex(upper) + 32; + ulong shrn = AdvSimd + .ShiftRightLogicalNarrowingLower(value.AsUInt16(), 4) + .AsUInt64() + .ToScalar(); + return (nuint)(ulong.TrailingZeroCount(shrn) / 4); } + } - return (nuint)ulong.TrailingZeroCount(value.ExtractMostSignificantBits()); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static nuint IndexOfWhereAllBitsSet2(T v0, T v1) + where T : ISimdVector + { + nuint width = (nuint)T.Count; + nuint offset0 = IndexOfWhereAllBitsSet(v0); + nuint offset1 = IndexOfWhereAllBitsSet(v1); + return offset0 < width ? offset0 : offset1 + width; } /// From dbdb10861cb2742c5002a969b2ffcb956a15aa35 Mon Sep 17 00:00:00 2001 From: neon-sunset Date: Mon, 15 Jul 2024 16:34:06 +0300 Subject: [PATCH 05/11] Fix alignment calculation, improve codegen for x86_64 --- .../src/System/Text/Ascii.Utility.cs | 62 +++++++++---------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 13c875f2c00de9..c783e683d5a806 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -181,13 +181,13 @@ static nuint Search16To32(ref byte pBuffer, nuint bufferLength) Vector128 first = Vector128.LoadUnsafe(ref pBuffer); Vector128 last = Vector128.LoadUnsafe(ref pBuffer, bufferLength - 16); - bool foundFirst = Vector128.GreaterThanOrEqualAny(first, mask); - bool foundLast = Vector128.GreaterThanOrEqualAny(last, mask); + bool foundFirst = (first & mask) != Vector128.Zero; + bool foundLast = (last & mask) != Vector128.Zero; return (foundFirst, foundLast) switch { - (true, _) => IndexOfWhereAllBitsSet(Vector128.GreaterThanOrEqual(first, mask)), - (_, true) => IndexOfWhereAllBitsSet(Vector128.GreaterThanOrEqual(last, mask)) + bufferLength - 16, + (true, _) => IndexOfNonAscii(first), + (_, true) => IndexOfNonAscii(last) + bufferLength - 16, _ => bufferLength, }; } @@ -201,13 +201,13 @@ static nuint Search32To64(ref byte pBuffer, nuint bufferLength) Vector256 first = Vector256.LoadUnsafe(ref pBuffer); Vector256 last = Vector256.LoadUnsafe(ref pBuffer, bufferLength - 32); - bool foundFirst = Vector256.GreaterThanOrEqualAny(first, mask); - bool foundLast = Vector256.GreaterThanOrEqualAny(last, mask); + bool foundFirst = (first & mask) != Vector256.Zero; + bool foundLast = (last & mask) != Vector256.Zero; return (foundFirst, foundLast) switch { - (true, _) => IndexOfWhereAllBitsSet(Vector256.GreaterThanOrEqual(first, mask)), - (_, true) => IndexOfWhereAllBitsSet(Vector256.GreaterThanOrEqual(last, mask)) + bufferLength - 32, + (true, _) => IndexOfNonAscii(first), + (_, true) => IndexOfNonAscii(last) + bufferLength - 32, _ => bufferLength, }; } @@ -280,8 +280,8 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled2(ref byte pB v0 = T.LoadUnsafe(ref pBuffer); v1 = T.LoadUnsafe(ref pBuffer, bufferLength - width); - bool foundFirst = T.GreaterThanOrEqualAny(v0, T.Create(0x80)); - bool foundLast = T.GreaterThanOrEqualAny(v1, T.Create(0x80)); + bool foundFirst = (v0 & T.Create(0x80)) != T.Zero; + bool foundLast = (v1 & T.Create(0x80)) != T.Zero; if (foundFirst) { @@ -305,11 +305,11 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled2(ref byte pB if (align != 0) { v1 = T.LoadUnsafe(ref pBuffer); - if (T.GreaterThanOrEqualAny(v1, T.Create(0x80))) + if ((v1 & T.Create(0x80)) != T.Zero) { goto FoundV1; } - offset += align; + offset += width - align; } nuint last = bufferLength - width * 2; @@ -337,8 +337,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled2(ref byte pB return bufferLength; FoundV01: - nuint index = IndexOfWhereAllBitsSet( - T.GreaterThanOrEqual(v0, T.Create(0x80))); + nuint index = IndexOfNonAscii(v0); if (index < width) { @@ -347,8 +346,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled2(ref byte pB offset += width; FoundV1: - return offset + IndexOfWhereAllBitsSet( - T.GreaterThanOrEqual(v1, T.Create(0x80))); + return offset + IndexOfNonAscii(v1); [MethodImpl(MethodImplOptions.AggressiveInlining)] static bool Test2(T v0, T v1) @@ -375,8 +373,8 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled4(ref byte pB v0 = T.LoadUnsafe(ref pBuffer); v1 = T.LoadUnsafe(ref pBuffer, bufferLength - width); - bool foundFirst = T.GreaterThanOrEqualAny(v0, T.Create(0x80)); - bool foundLast = T.GreaterThanOrEqualAny(v1, T.Create(0x80)); + bool foundFirst = (v0 & T.Create(0x80)) != T.Zero; + bool foundLast = (v1 & T.Create(0x80)) != T.Zero; if (foundFirst) { @@ -428,11 +426,11 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled4(ref byte pB if (align != 0) { v0 = T.LoadUnsafe(ref pBuffer); - if (T.GreaterThanOrEqualAny(v0, T.Create(0x80))) + if ((v0 & T.Create(0x80)) != T.Zero) { goto FoundV0; } - offset += align; + offset += width - align; } nuint last = bufferLength - width * 4; @@ -458,9 +456,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled4(ref byte pB return bufferLength; FoundV0123: - nuint index = IndexOfWhereAllBitsSet2( - T.GreaterThanOrEqual(v0, T.Create(0x80)), - T.GreaterThanOrEqual(v1, T.Create(0x80))); + nuint index = IndexOfNonAscii2(v0, v1); if (index < width * 2) { @@ -469,13 +465,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled4(ref byte pB offset += width * 2; FoundV23: - return offset + IndexOfWhereAllBitsSet2( - T.GreaterThanOrEqual(v2, T.Create(0x80)), - T.GreaterThanOrEqual(v3, T.Create(0x80))); + return offset + IndexOfNonAscii2(v2, v3); FoundV0: - return offset + IndexOfWhereAllBitsSet( - T.GreaterThanOrEqual(v0, T.Create(0x80))); + return offset + IndexOfNonAscii(v0); [MethodImpl(MethodImplOptions.AggressiveInlining)] static (T, T, T, T) Load4(ref byte pBuffer, nuint offset) @@ -499,13 +492,13 @@ static bool Test4(T v0, T v1, T v2, T v3) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static nuint IndexOfWhereAllBitsSet(T value) + private static nuint IndexOfNonAscii(T value) where T : ISimdVector { return value switch { Vector128 v128 => AdvSimd.IsSupported - ? IndexOfWhereAllBitsSet_Arm64(v128) + ? IndexOfNonAsciiArm64(v128) : nuint.TrailingZeroCount(v128.ExtractMostSignificantBits()), Vector256 v256 => nuint.TrailingZeroCount(v256.ExtractMostSignificantBits()), Vector512 v512 => (nuint)ulong.TrailingZeroCount(v512.ExtractMostSignificantBits()), @@ -514,8 +507,9 @@ private static nuint IndexOfWhereAllBitsSet(T value) [CompExactlyDependsOn(typeof(AdvSimd))] [MethodImpl(MethodImplOptions.AggressiveInlining)] - static nuint IndexOfWhereAllBitsSet_Arm64(Vector128 value) + static nuint IndexOfNonAsciiArm64(Vector128 value) { + value = Vector128.GreaterThanOrEqual(value, Vector128.Create((byte)0x80)); ulong shrn = AdvSimd .ShiftRightLogicalNarrowingLower(value.AsUInt16(), 4) .AsUInt64() @@ -525,12 +519,12 @@ static nuint IndexOfWhereAllBitsSet_Arm64(Vector128 value) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static nuint IndexOfWhereAllBitsSet2(T v0, T v1) + private static nuint IndexOfNonAscii2(T v0, T v1) where T : ISimdVector { nuint width = (nuint)T.Count; - nuint offset0 = IndexOfWhereAllBitsSet(v0); - nuint offset1 = IndexOfWhereAllBitsSet(v1); + nuint offset0 = IndexOfNonAscii(v0); + nuint offset1 = IndexOfNonAscii(v1); return offset0 < width ? offset0 : offset1 + width; } From 9decfefde5af809af50c886ef426c88ddaf18185 Mon Sep 17 00:00:00 2001 From: neon-sunset Date: Mon, 15 Jul 2024 18:47:19 +0300 Subject: [PATCH 06/11] Deduplicate helpers --- .../src/System/Text/Ascii.Utility.cs | 43 ++++++------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index c783e683d5a806..44c0538afc1338 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -146,13 +146,13 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer { if (bufferLength <= (nuint)Vector256.Count) { - return Search16To32(ref pBuffer, bufferLength); + return SearchTwo>(ref pBuffer, bufferLength); } if (Vector512.IsHardwareAccelerated) { if (bufferLength <= (nuint)Vector512.Count) { - return Search32To64(ref pBuffer, bufferLength); + return SearchTwo>(ref pBuffer, bufferLength); } // This is as wide as newest CPU cores can go at the time of writing. @@ -172,42 +172,23 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer Done: return index; - static nuint Search16To32(ref byte pBuffer, nuint bufferLength) + static nuint SearchTwo(ref byte pBuffer, nuint bufferLength) + where T : ISimdVector { - Debug.Assert(bufferLength is > 16 and <= 32); - Debug.Assert(Vector128.IsHardwareAccelerated); + Debug.Assert(T.IsHardwareAccelerated); + Debug.Assert(bufferLength > (nuint)T.Count && bufferLength <= (nuint)T.Count * 2); - Vector128 mask = Vector128.Create((byte)0x80); - Vector128 first = Vector128.LoadUnsafe(ref pBuffer); - Vector128 last = Vector128.LoadUnsafe(ref pBuffer, bufferLength - 16); + T mask = T.Create(0x80); + T first = T.LoadUnsafe(ref pBuffer); + T last = T.LoadUnsafe(ref pBuffer, bufferLength - (nuint)T.Count); - bool foundFirst = (first & mask) != Vector128.Zero; - bool foundLast = (last & mask) != Vector128.Zero; + bool foundFirst = (first & mask) != T.Zero; + bool foundLast = (last & mask) != T.Zero; return (foundFirst, foundLast) switch { (true, _) => IndexOfNonAscii(first), - (_, true) => IndexOfNonAscii(last) + bufferLength - 16, - _ => bufferLength, - }; - } - - static nuint Search32To64(ref byte pBuffer, nuint bufferLength) - { - Debug.Assert(bufferLength is > 32 and <= 64); - Debug.Assert(Vector256.IsHardwareAccelerated); - - Vector256 mask = Vector256.Create((byte)0x80); - Vector256 first = Vector256.LoadUnsafe(ref pBuffer); - Vector256 last = Vector256.LoadUnsafe(ref pBuffer, bufferLength - 32); - - bool foundFirst = (first & mask) != Vector256.Zero; - bool foundLast = (last & mask) != Vector256.Zero; - - return (foundFirst, foundLast) switch - { - (true, _) => IndexOfNonAscii(first), - (_, true) => IndexOfNonAscii(last) + bufferLength - 32, + (_, true) => IndexOfNonAscii(last) + bufferLength - (nuint)T.Count, _ => bufferLength, }; } From e9bd242e5eb7c3c30be2ba7e4ee76047870bde08 Mon Sep 17 00:00:00 2001 From: neon-sunset Date: Tue, 16 Jul 2024 04:23:40 +0300 Subject: [PATCH 07/11] Temporarily extend asserts --- .../src/System/Text/Ascii.Utility.cs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 44c0538afc1338..5c7c2364dd1496 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -141,15 +141,16 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian) { - // TODO: Handle length < 32 here. if (Vector256.IsHardwareAccelerated) { + // Handle lengths 17-32. if (bufferLength <= (nuint)Vector256.Count) { return SearchTwo>(ref pBuffer, bufferLength); } if (Vector512.IsHardwareAccelerated) { + // Handle lengths 33-64. if (bufferLength <= (nuint)Vector512.Count) { return SearchTwo>(ref pBuffer, bufferLength); @@ -175,8 +176,12 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer static nuint SearchTwo(ref byte pBuffer, nuint bufferLength) where T : ISimdVector { - Debug.Assert(T.IsHardwareAccelerated); - Debug.Assert(bufferLength > (nuint)T.Count && bufferLength <= (nuint)T.Count * 2); + Debug.Assert( + T.IsHardwareAccelerated, + "Should only be called for hardware-accelerated types."); + Debug.Assert( + bufferLength > (nuint)T.Count && bufferLength <= (nuint)T.Count * 2, + $"Should only be called for lengths {T.Count + 1}-{T.Count * 2}."); T mask = T.Create(0x80); T first = T.LoadUnsafe(ref pBuffer); @@ -476,6 +481,10 @@ static bool Test4(T v0, T v1, T v2, T v3) private static nuint IndexOfNonAscii(T value) where T : ISimdVector { + Debug.Assert( + T.IsHardwareAccelerated, + "Should only be called for hardware-accelerated types."); + return value switch { Vector128 v128 => AdvSimd.IsSupported From d29c1f300b02a81b0d82aa814917e39daf827d95 Mon Sep 17 00:00:00 2001 From: neon-sunset Date: Tue, 16 Jul 2024 14:49:29 +0300 Subject: [PATCH 08/11] Temporarily comment out the assert --- .../src/System/Text/Ascii.Utility.cs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 5c7c2364dd1496..5c3d57fe9be5ea 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -176,9 +176,10 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer static nuint SearchTwo(ref byte pBuffer, nuint bufferLength) where T : ISimdVector { - Debug.Assert( - T.IsHardwareAccelerated, - "Should only be called for hardware-accelerated types."); + // Debug.Assert( + // T.IsHardwareAccelerated, + // "Should only be called for hardware-accelerated types."); + // ^ this appears to be incorrectly asserting *exactly* under Vector256.IsHardwareAccelerated guard. Debug.Assert( bufferLength > (nuint)T.Count && bufferLength <= (nuint)T.Count * 2, $"Should only be called for lengths {T.Count + 1}-{T.Count * 2}."); From 826d205342e07d79099d5b1fffdb6d81c240f26b Mon Sep 17 00:00:00 2001 From: neon-sunset Date: Tue, 16 Jul 2024 19:54:02 +0300 Subject: [PATCH 09/11] Cleanup --- .../src/System/Text/Ascii.Utility.cs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 5c3d57fe9be5ea..cd083ebf80623e 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -176,13 +176,9 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer static nuint SearchTwo(ref byte pBuffer, nuint bufferLength) where T : ISimdVector { - // Debug.Assert( - // T.IsHardwareAccelerated, - // "Should only be called for hardware-accelerated types."); - // ^ this appears to be incorrectly asserting *exactly* under Vector256.IsHardwareAccelerated guard. - Debug.Assert( - bufferLength > (nuint)T.Count && bufferLength <= (nuint)T.Count * 2, - $"Should only be called for lengths {T.Count + 1}-{T.Count * 2}."); + // This is currently disabled due to incorrectly asserting on R2R builds. + // Debug.Assert(T.IsHardwareAccelerated); + Debug.Assert(bufferLength > (nuint)T.Count && bufferLength <= (nuint)T.Count * 2); T mask = T.Create(0x80); T first = T.LoadUnsafe(ref pBuffer); @@ -482,9 +478,7 @@ static bool Test4(T v0, T v1, T v2, T v3) private static nuint IndexOfNonAscii(T value) where T : ISimdVector { - Debug.Assert( - T.IsHardwareAccelerated, - "Should only be called for hardware-accelerated types."); + Debug.Assert(T.IsHardwareAccelerated); return value switch { From 423eb39120194ef927d271c2ccd439e8fa5e6ca9 Mon Sep 17 00:00:00 2001 From: neon-sunset Date: Wed, 17 Jul 2024 21:10:25 +0300 Subject: [PATCH 10/11] Handle big endian correctly in the scalar path and remove generic SIMD asserts (for now) --- .../src/System/Text/Ascii.Utility.cs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index cd083ebf80623e..1f612d48e78615 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -77,9 +77,10 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer if (bufferLength < 8) { uint found4; + const uint mask = 0x80808080; if ((bufferLength & 4) != 0) { - uint test4 = Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, index)) & 0x80808080; + uint test4 = Unsafe.ReadUnaligned(ref pBuffer) & mask; if (test4 != 0) { found4 = test4; @@ -89,7 +90,7 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer } if ((bufferLength & 2) != 0) { - uint test2 = (uint)Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, index)) & 0x8080; + uint test2 = Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, index)) & mask; if (test2 != 0) { found4 = test2; @@ -108,7 +109,9 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer goto Done; Found1to7: - index += uint.TrailingZeroCount(found4) / 8; + index += BitConverter.IsLittleEndian + ? uint.TrailingZeroCount(found4) / 8 + : uint.LeadingZeroCount(found4) / 8; goto Done; } @@ -176,8 +179,6 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer static nuint SearchTwo(ref byte pBuffer, nuint bufferLength) where T : ISimdVector { - // This is currently disabled due to incorrectly asserting on R2R builds. - // Debug.Assert(T.IsHardwareAccelerated); Debug.Assert(bufferLength > (nuint)T.Count && bufferLength <= (nuint)T.Count * 2); T mask = T.Create(0x80); @@ -249,7 +250,6 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Scalar(ref byte pBuffer, private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled2(ref byte pBuffer, nuint bufferLength) where T : ISimdVector { - Debug.Assert(T.IsHardwareAccelerated); Debug.Assert(BitConverter.IsLittleEndian); Debug.Assert(bufferLength > (nuint)T.Count); @@ -342,7 +342,6 @@ static bool Test2(T v0, T v1) private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled4(ref byte pBuffer, nuint bufferLength) where T : ISimdVector { - Debug.Assert(T.IsHardwareAccelerated); Debug.Assert(BitConverter.IsLittleEndian); Debug.Assert(bufferLength > (nuint)T.Count); @@ -478,8 +477,6 @@ static bool Test4(T v0, T v1, T v2, T v3) private static nuint IndexOfNonAscii(T value) where T : ISimdVector { - Debug.Assert(T.IsHardwareAccelerated); - return value switch { Vector128 v128 => AdvSimd.IsSupported From 07d9b36c05cd01a2cced961b5f41d252925256cf Mon Sep 17 00:00:00 2001 From: neon-sunset Date: Thu, 18 Jul 2024 21:04:14 +0300 Subject: [PATCH 11/11] Last one fix for Big Endian --- .../src/System/Text/Ascii.Utility.cs | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 1f612d48e78615..78979420aae326 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -76,24 +76,25 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer nuint index = 0; if (bufferLength < 8) { - uint found4; const uint mask = 0x80808080; + + uint found; if ((bufferLength & 4) != 0) { - uint test4 = Unsafe.ReadUnaligned(ref pBuffer) & mask; - if (test4 != 0) + uint test = Unsafe.ReadUnaligned(ref pBuffer) & mask; + if (test != 0) { - found4 = test4; + found = test; goto Found1to7; } index = 4; } if ((bufferLength & 2) != 0) { - uint test2 = Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, index)) & mask; - if (test2 != 0) + uint test = Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, index)) & mask; + if (test != 0) { - found4 = test2; + found = test; goto Found1to7; } index += 2; @@ -110,20 +111,22 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer Found1to7: index += BitConverter.IsLittleEndian - ? uint.TrailingZeroCount(found4) / 8 - : uint.LeadingZeroCount(found4) / 8; + ? uint.TrailingZeroCount(found) / 8 + : uint.LeadingZeroCount(found) / 8; goto Done; } - ulong found8; if (bufferLength <= 16) { + const ulong mask = 0x8080808080808080; + ulong first = Unsafe.ReadUnaligned(ref pBuffer); ulong last = Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, bufferLength - 8)); - first &= 0x8080808080808080; - last &= 0x8080808080808080; + first &= mask; + last &= mask; + ulong found8; if (first != 0) { found8 = first; @@ -139,7 +142,9 @@ internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint buffer goto Done; } - return index + (nuint)(ulong.TrailingZeroCount(found8) / 8); + return index + (nuint)(BitConverter.IsLittleEndian + ? ulong.TrailingZeroCount(found8) / 8 + : ulong.LeadingZeroCount(found8) / 8); } if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian)