diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs index 0e652fbcee59ac..526bcb43045a1a 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs @@ -532,7 +532,8 @@ private protected sealed override unsafe int GetCharCountFast(byte* pBytes, int { // Unrecognized fallback mechanism - count bytes manually. - charCount = (int)Ascii.GetIndexOfFirstNonAsciiByte(pBytes, (uint)bytesLength); + // TODO: Migrate to byref + charCount = (int)Ascii.GetIndexOfFirstNonAsciiByte(ref Unsafe.AsRef(pBytes), (uint)bytesLength); } bytesConsumed = charCount; diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 510c06d0311c5a..0f06684bcecb48 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -53,29 +53,6 @@ private static bool AllCharsInUInt64AreAscii(ulong value) : AllCharsInUInt64AreAscii(value); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] - private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128 value, Vector128 bitmask) - { - if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian) - { - throw new PlatformNotSupportedException(); - } - - // extractedBits[i] = (value[i] >> 7) & (1 << (12 * (i % 2))); - Vector128 mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte(); - Vector128 extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitmask); - - // collapse mask to lower bits - extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); - ulong mask = extractedBits.AsUInt64().ToScalar(); - - // calculate the index - int index = BitOperations.TrailingZeroCount(mask) >> 2; - Debug.Assert((mask != 0) ? index < 16 : index >= 16); - return index; - } - /// /// Given a DWORD which represents two packed chars in machine-endian order, /// iff the first char (in machine-endian order) is ASCII. @@ -94,613 +71,448 @@ private static bool FirstCharInUInt32IsAscii(uint value) /// Returns if the buffer is empty or all-ASCII. /// /// An ASCII byte is defined as 0x00 - 0x7F, inclusive. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength) + internal static nuint GetIndexOfFirstNonAsciiByte(ref byte pBuffer, nuint bufferLength) { - // If 256/512-bit aren't supported but SSE2 is supported, use those specific intrinsics instead of - // the generic vectorized code. This has two benefits: (a) we can take advantage of specific instructions - // like pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while - // this method is running. - - if (!Vector512.IsHardwareAccelerated && - !Vector256.IsHardwareAccelerated && - (Sse2.IsSupported || AdvSimd.IsSupported)) + nuint index = 0; + if (bufferLength < 8) { - return GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength); - } - else - { - // Handles Vector512, Vector256, Vector128, and scalar. - return GetIndexOfFirstNonAsciiByte_Vector(pBuffer, bufferLength); - } - } - - private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nuint bufferLength) - { - // Squirrel away the original buffer reference. This method works by determining the exact - // byte reference where non-ASCII data begins, so we need this base value to perform the - // final subtraction at the end of the method to get the index into the original buffer. - - byte* pOriginalBuffer = pBuffer; + const uint mask = 0x80808080; - // Before we drain off byte-by-byte, try a generic vectorized loop. - // Only run the loop if we have at least two vectors we can pull out. - // Note use of SBYTE instead of BYTE below; we're using the two's-complement - // representation of negative integers to act as a surrogate for "is ASCII?". - - if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) - { - if (Vector512.Load(pBuffer).ExtractMostSignificantBits() == 0) + uint found; + if ((bufferLength & 4) != 0) { - // The first several elements of the input buffer were ASCII. Bump up the pointer to the - // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII - // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. - - byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector512.Size; - pBuffer = (byte*)(((nuint)pBuffer + Vector512.Size) & ~(nuint)(Vector512.Size - 1)); - -#if DEBUG - long numBytesRead = pBuffer - pOriginalBuffer; - Debug.Assert(0 < numBytesRead && numBytesRead <= Vector512.Size, "We should've made forward progress of at least one byte."); - Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); -#endif - - Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); - - do + uint test = Unsafe.ReadUnaligned(ref pBuffer) & mask; + if (test != 0) { - Debug.Assert((nuint)pBuffer % Vector512.Size == 0, "Vector read should be aligned."); - if (Vector512.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0) - { - break; // found non-ASCII data - } - - pBuffer += Vector512.Size; - } while (pBuffer <= pFinalVectorReadPos); - - // Adjust the remaining buffer length for the number of elements we just consumed. - - bufferLength -= (nuint)pBuffer; - bufferLength += (nuint)pOriginalBuffer; + found = test; + goto Found1to7; + } + index = 4; } - } - else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256.Count) - { - if (Vector256.Load(pBuffer).ExtractMostSignificantBits() == 0) + if ((bufferLength & 2) != 0) { - // The first several elements of the input buffer were ASCII. Bump up the pointer to the - // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII - // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. - - byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector256.Size; - pBuffer = (byte*)(((nuint)pBuffer + Vector256.Size) & ~(nuint)(Vector256.Size - 1)); - -#if DEBUG - long numBytesRead = pBuffer - pOriginalBuffer; - Debug.Assert(0 < numBytesRead && numBytesRead <= Vector256.Size, "We should've made forward progress of at least one byte."); - Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); -#endif - - Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); - - do + uint test = Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, index)) & mask; + if (test != 0) { - Debug.Assert((nuint)pBuffer % Vector256.Size == 0, "Vector read should be aligned."); - if (Vector256.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0) - { - break; // found non-ASCII data - } - - pBuffer += Vector256.Size; - } while (pBuffer <= pFinalVectorReadPos); - - // Adjust the remaining buffer length for the number of elements we just consumed. - - bufferLength -= (nuint)pBuffer; - bufferLength += (nuint)pOriginalBuffer; + found = test; + goto Found1to7; + } + index += 2; } - } - else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128.Count) - { - if (!VectorContainsNonAsciiChar(Vector128.Load(pBuffer))) + if ((bufferLength & 1) != 0) { - // The first several elements of the input buffer were ASCII. Bump up the pointer to the - // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII - // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. - - byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector128.Size; - pBuffer = (byte*)(((nuint)pBuffer + Vector128.Size) & ~(nuint)(Vector128.Size - 1)); - -#if DEBUG - long numBytesRead = pBuffer - pOriginalBuffer; - Debug.Assert(0 < numBytesRead && numBytesRead <= Vector128.Size, "We should've made forward progress of at least one byte."); - Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); -#endif - - Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); - - do + if (Unsafe.Add(ref pBuffer, index) < 0x80) { - Debug.Assert((nuint)pBuffer % Vector128.Size == 0, "Vector read should be aligned."); - if (VectorContainsNonAsciiChar(Vector128.LoadAligned(pBuffer))) - { - break; // found non-ASCII data - } - - pBuffer += Vector128.Size; - } while (pBuffer <= pFinalVectorReadPos); - - // Adjust the remaining buffer length for the number of elements we just consumed. - - bufferLength -= (nuint)pBuffer; - bufferLength += (nuint)pOriginalBuffer; + index++; + } } - } - // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform - // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code - // path to drain any remaining ASCII bytes. - // - // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads. - // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII bytes. - - uint currentUInt32; + goto Done; - // Try reading 64 bits at a time in a loop. + Found1to7: + index += BitConverter.IsLittleEndian + ? uint.TrailingZeroCount(found) / 8 + : uint.LeadingZeroCount(found) / 8; + goto Done; + } - for (; bufferLength >= 8; bufferLength -= 8) + if (bufferLength <= 16) { - currentUInt32 = Unsafe.ReadUnaligned(pBuffer); - uint nextUInt32 = Unsafe.ReadUnaligned(pBuffer + 4); + const ulong mask = 0x8080808080808080; - if (!AllBytesInUInt32AreAscii(currentUInt32 | nextUInt32)) - { - // One of these two values contains non-ASCII bytes. - // Figure out which one it is, then put it in 'current' so that we can drain the ASCII bytes. + ulong first = Unsafe.ReadUnaligned(ref pBuffer); + ulong last = Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, bufferLength - 8)); - if (AllBytesInUInt32AreAscii(currentUInt32)) - { - currentUInt32 = nextUInt32; - pBuffer += 4; - } + first &= mask; + last &= mask; - goto FoundNonAsciiData; + ulong found8; + if (first != 0) + { + found8 = first; } - - pBuffer += 8; // consumed 8 ASCII bytes - } - - // From this point forward we don't need to update bufferLength. - // Try reading 32 bits. - - if ((bufferLength & 4) != 0) - { - currentUInt32 = Unsafe.ReadUnaligned(pBuffer); - if (!AllBytesInUInt32AreAscii(currentUInt32)) + else if (last != 0) { - goto FoundNonAsciiData; + found8 = last; + index = bufferLength - 8; + } + else + { + index = bufferLength; + goto Done; } - pBuffer += 4; + return index + (nuint)(BitConverter.IsLittleEndian + ? ulong.TrailingZeroCount(found8) / 8 + : ulong.LeadingZeroCount(found8) / 8); } - // Try reading 16 bits. - - if ((bufferLength & 2) != 0) + if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian) { - currentUInt32 = Unsafe.ReadUnaligned(pBuffer); - if (!AllBytesInUInt32AreAscii(currentUInt32)) + if (Vector256.IsHardwareAccelerated) { - if (!BitConverter.IsLittleEndian) + // Handle lengths 17-32. + if (bufferLength <= (nuint)Vector256.Count) { - currentUInt32 <<= 16; + return SearchTwo>(ref pBuffer, bufferLength); } - goto FoundNonAsciiData; - } + if (Vector512.IsHardwareAccelerated) + { + // Handle lengths 33-64. + if (bufferLength <= (nuint)Vector512.Count) + { + return SearchTwo>(ref pBuffer, bufferLength); + } - pBuffer += 2; - } + // This is as wide as newest CPU cores can go at the time of writing. + return GetIndexOfFirstNonAsciiByte_Unrolled2>(ref pBuffer, bufferLength); + } - // Try reading 8 bits + return GetIndexOfFirstNonAsciiByte_Unrolled4>(ref pBuffer, bufferLength); + } - if ((bufferLength & 1) != 0) + return GetIndexOfFirstNonAsciiByte_Unrolled4>(ref pBuffer, bufferLength); + } + else { - // If the buffer contains non-ASCII data, the comparison below will fail, and - // we'll end up not incrementing the buffer reference. - - if (*(sbyte*)pBuffer >= 0) - { - pBuffer++; - } + return GetIndexOfFirstNonAsciiByte_Scalar(ref pBuffer, bufferLength); } - Finish: - - nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer; - return totalNumBytesRead; - - FoundNonAsciiData: - - Debug.Assert(!AllBytesInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input."); + Done: + return index; - // The method being called doesn't bother looking at whether the high byte is ASCII. There are only - // two scenarios: (a) either one of the earlier bytes is not ASCII and the search terminates before - // we get to the high byte; or (b) all of the earlier bytes are ASCII, so the high byte must be - // non-ASCII. In both cases we only care about the low 24 bits. + static nuint SearchTwo(ref byte pBuffer, nuint bufferLength) + where T : ISimdVector + { + Debug.Assert(bufferLength > (nuint)T.Count && bufferLength <= (nuint)T.Count * 2); - pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentUInt32); - goto Finish; - } + T mask = T.Create(0x80); + T first = T.LoadUnsafe(ref pBuffer); + T last = T.LoadUnsafe(ref pBuffer, bufferLength - (nuint)T.Count); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool ContainsNonAsciiByte_Sse2(uint sseMask) - { - Debug.Assert(sseMask != uint.MaxValue); - Debug.Assert(Sse2.IsSupported); - return sseMask != 0; - } + bool foundFirst = (first & mask) != T.Zero; + bool foundLast = (last & mask) != T.Zero; - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool ContainsNonAsciiByte_AdvSimd(uint advSimdIndex) - { - Debug.Assert(advSimdIndex != uint.MaxValue); - Debug.Assert(AdvSimd.IsSupported); - return advSimdIndex < 16; + return (foundFirst, foundLast) switch + { + (true, _) => IndexOfNonAscii(first), + (_, true) => IndexOfNonAscii(last) + bufferLength - (nuint)T.Count, + _ => bufferLength, + }; + } } - private static unsafe nuint GetIndexOfFirstNonAsciiByte_Intrinsified(byte* pBuffer, nuint bufferLength) + private static unsafe nuint GetIndexOfFirstNonAsciiByte_Scalar(ref byte pBuffer, nuint bufferLength) { - // JIT turns the below into constants - - uint SizeOfVector128 = (uint)sizeof(Vector128); - nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1); - - Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported, "Sse2 or AdvSimd64 required."); - Debug.Assert(BitConverter.IsLittleEndian, "This SSE2/Arm64 implementation assumes little-endian."); - - Vector128 bitmask = BitConverter.IsLittleEndian ? - Vector128.Create((ushort)0x1001).AsByte() : - Vector128.Create((ushort)0x0110).AsByte(); + Debug.Assert(bufferLength > 16); + Debug.Assert(!Vector128.IsHardwareAccelerated || !BitConverter.IsLittleEndian); - uint currentSseMask = uint.MaxValue, secondSseMask = uint.MaxValue; - uint currentAdvSimdIndex = uint.MaxValue, secondAdvSimdIndex = uint.MaxValue; - byte* pOriginalBuffer = pBuffer; + ulong value; + nuint offset = 0; + const ulong mask = 0x8080808080808080; - // This method is written such that control generally flows top-to-bottom, avoiding - // jumps as much as possible in the optimistic case of a large enough buffer and - // "all ASCII". If we see non-ASCII data, we jump out of the hot paths to targets - // after all the main logic. - - if (bufferLength < SizeOfVector128) + nuint align = (nuint)Unsafe.AsPointer(ref pBuffer) % 8; + if (align != 0) { - goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead - } - - // Read the first vector unaligned. - - if (Sse2.IsSupported) - { - currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load - if (ContainsNonAsciiByte_Sse2(currentSseMask)) + value = Unsafe.ReadUnaligned(ref pBuffer) & mask; + if (value != 0) { - goto FoundNonAsciiDataInCurrentChunk; + goto Found; } + offset += 8 - align; } - else if (AdvSimd.Arm64.IsSupported) + + nuint last = bufferLength - 8; + while (offset <= last) { - currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); // unaligned load - if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex)) + value = Unsafe.As(ref Unsafe.Add(ref pBuffer, offset)) & mask; + if (value != 0) { - goto FoundNonAsciiDataInCurrentChunk; + goto Found; } + offset += 8; } - else - { - throw new PlatformNotSupportedException(); - } - - // If we have less than 32 bytes to process, just go straight to the final unaligned - // read. There's no need to mess with the loop logic in the middle of this method. - if (bufferLength < 2 * SizeOfVector128) + value = Unsafe.ReadUnaligned(ref Unsafe.Add(ref pBuffer, last)) & mask; + if (value != 0) { - goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead; + offset = last; + goto Found; } - // Now adjust the read pointer so that future reads are aligned. + return bufferLength; - pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector128) & ~(nuint)MaskOfAllBitsInVector128); + Found: + return offset + (BitConverter.IsLittleEndian + ? (nuint)BitOperations.TrailingZeroCount(value) / 8 + : (nuint)BitOperations.LeadingZeroCount(value) / 8); + } -#if DEBUG - long numBytesRead = pBuffer - pOriginalBuffer; - Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector128, "We should've made forward progress of at least one byte."); - Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); -#endif + // Another case where we want to bypass Dynamic PGO - it is very easy to hit + // a condition where a profile with longer path blocks marked as cold leads + // to TestXXX helpers prevented from being inlined, which is disastrous for performance. + [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)] + private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled2(ref byte pBuffer, nuint bufferLength) + where T : ISimdVector + { + Debug.Assert(BitConverter.IsLittleEndian); + Debug.Assert(bufferLength > (nuint)T.Count); - // Adjust the remaining length to account for what we just read. + nuint offset = 0; + nuint width = (nuint)T.Count; - bufferLength += (nuint)pOriginalBuffer; - bufferLength -= (nuint)pBuffer; + T v0, v1; - // The buffer is now properly aligned. - // Read 2 vectors at a time if possible. - - if (bufferLength >= 2 * SizeOfVector128) + if (bufferLength <= width * 2) { - byte* pFinalVectorReadPos = (byte*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128); + v0 = T.LoadUnsafe(ref pBuffer); + v1 = T.LoadUnsafe(ref pBuffer, bufferLength - width); - // After this point, we no longer need to update the bufferLength value. + bool foundFirst = (v0 & T.Create(0x80)) != T.Zero; + bool foundLast = (v1 & T.Create(0x80)) != T.Zero; - do + if (foundFirst) { - if (Sse2.IsSupported) - { - Vector128 firstVector = Sse2.LoadAlignedVector128(pBuffer); - Vector128 secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128); - - currentSseMask = (uint)Sse2.MoveMask(firstVector); - secondSseMask = (uint)Sse2.MoveMask(secondVector); - if (ContainsNonAsciiByte_Sse2(currentSseMask | secondSseMask)) - { - goto FoundNonAsciiDataInInnerLoop; - } - } - else if (AdvSimd.Arm64.IsSupported) - { - Vector128 firstVector = AdvSimd.LoadVector128(pBuffer); - Vector128 secondVector = AdvSimd.LoadVector128(pBuffer + SizeOfVector128); - - currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(firstVector, bitmask); - secondAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(secondVector, bitmask); - if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex) || ContainsNonAsciiByte_AdvSimd(secondAdvSimdIndex)) - { - goto FoundNonAsciiDataInInnerLoop; - } - } - else - { - throw new PlatformNotSupportedException(); - } - - pBuffer += 2 * SizeOfVector128; - } while (pBuffer <= pFinalVectorReadPos); - } - - // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from. - // Since the above loop doesn't update bufferLength, we can't rely on its absolute value. - // But we _can_ rely on it to tell us how much remaining data must be drained by looking - // at what bits of it are set. This works because had we updated it within the loop above, - // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about - // bits which are less significant than those that the addition would've acted on. - - // If there is fewer than one vector length remaining, skip the next aligned read. + v1 = v0; + goto FoundV1; + } + if (foundLast) + { + offset = bufferLength - width; + goto FoundV1; + } - if ((bufferLength & SizeOfVector128) == 0) - { - goto DoFinalUnalignedVectorRead; + return bufferLength; } - // At least one full vector's worth of data remains, so we can safely read it. - // Remember, at this point pBuffer is still aligned. - - if (Sse2.IsSupported) + // SAFETY: We only use the value of .AsPointer to calculate the byte offset + // to the next vector boundary. Should GC relocate the buffer, we will lose the + // alignment which is an acceptable trade-off over pinning the buffer, once + // the callers of this method are updated to use byrefs. + nuint align = (nuint)Unsafe.AsPointer(ref pBuffer) % width; + if (align != 0) { - currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer)); - if (ContainsNonAsciiByte_Sse2(currentSseMask)) + v1 = T.LoadUnsafe(ref pBuffer); + if ((v1 & T.Create(0x80)) != T.Zero) { - goto FoundNonAsciiDataInCurrentChunk; + goto FoundV1; } + offset += width - align; } - else if (AdvSimd.Arm64.IsSupported) + + nuint last = bufferLength - width * 2; + while (offset <= last) { - currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); - if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex)) + v0 = T.LoadUnsafe(ref pBuffer, offset); + v1 = T.LoadUnsafe(ref pBuffer, offset + width); + if (!Test2(v0, v1)) { - goto FoundNonAsciiDataInCurrentChunk; + offset += width * 2; + continue; } + + goto FoundV01; } - else + + v0 = T.LoadUnsafe(ref pBuffer, last); + v1 = T.LoadUnsafe(ref pBuffer, last + width); + if (Test2(v0, v1)) { - throw new PlatformNotSupportedException(); + offset = last; + goto FoundV01; } - IncrementCurrentOffsetBeforeFinalUnalignedVectorRead: - - pBuffer += SizeOfVector128; + return bufferLength; - DoFinalUnalignedVectorRead: + FoundV01: + nuint index = IndexOfNonAscii(v0); - if (((byte)bufferLength & MaskOfAllBitsInVector128) != 0) + if (index < width) { - // Perform an unaligned read of the last vector. - // We need to adjust the pointer because we're re-reading data. - - pBuffer += (bufferLength & MaskOfAllBitsInVector128) - SizeOfVector128; - - if (Sse2.IsSupported) - { - currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load - if (ContainsNonAsciiByte_Sse2(currentSseMask)) - { - goto FoundNonAsciiDataInCurrentChunk; - } - - } - else if (AdvSimd.Arm64.IsSupported) - { - currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); // unaligned load - if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex)) - { - goto FoundNonAsciiDataInCurrentChunk; - } + return index + offset; + } + offset += width; - } - else - { - throw new PlatformNotSupportedException(); - } + FoundV1: + return offset + IndexOfNonAscii(v1); - pBuffer += SizeOfVector128; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static bool Test2(T v0, T v1) + { + return ((v0 | v1) & T.Create(0x80)) != T.Zero; } + } - Finish: - return (nuint)pBuffer - (nuint)pOriginalBuffer; // and we're done! + [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)] + private static unsafe nuint GetIndexOfFirstNonAsciiByte_Unrolled4(ref byte pBuffer, nuint bufferLength) + where T : ISimdVector + { + Debug.Assert(BitConverter.IsLittleEndian); + Debug.Assert(bufferLength > (nuint)T.Count); - FoundNonAsciiDataInInnerLoop: + nuint offset = 0; + nuint width = (nuint)T.Count; - // If the current (first) mask isn't the mask that contains non-ASCII data, then it must - // instead be the second mask. If so, skip the entire first mask and drain ASCII bytes - // from the second mask. + T v0, v1, v2, v3; - if (Sse2.IsSupported) + if (bufferLength <= width * 2) { - if (!ContainsNonAsciiByte_Sse2(currentSseMask)) + v0 = T.LoadUnsafe(ref pBuffer); + v1 = T.LoadUnsafe(ref pBuffer, bufferLength - width); + + bool foundFirst = (v0 & T.Create(0x80)) != T.Zero; + bool foundLast = (v1 & T.Create(0x80)) != T.Zero; + + if (foundFirst) { - pBuffer += SizeOfVector128; - currentSseMask = secondSseMask; + goto FoundV0; } - } - else if (AdvSimd.IsSupported) - { - if (!ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex)) + if (foundLast) { - pBuffer += SizeOfVector128; - currentAdvSimdIndex = secondAdvSimdIndex; + offset = bufferLength - width; + v0 = v1; + goto FoundV0; } - } - else - { - throw new PlatformNotSupportedException(); - } - FoundNonAsciiDataInCurrentChunk: - - if (Sse2.IsSupported) - { - // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte. - // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't - // available, we'll fall back to a normal loop. - Debug.Assert(ContainsNonAsciiByte_Sse2(currentSseMask), "Shouldn't be here unless we see non-ASCII data."); - pBuffer += (uint)BitOperations.TrailingZeroCount(currentSseMask); - } - else if (AdvSimd.Arm64.IsSupported) - { - Debug.Assert(ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex), "Shouldn't be here unless we see non-ASCII data."); - pBuffer += currentAdvSimdIndex; - } - else - { - throw new PlatformNotSupportedException(); + return bufferLength; } - goto Finish; - - FoundNonAsciiDataInCurrentDWord: - - uint currentDWord; - Debug.Assert(!AllBytesInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data."); - pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentDWord); - - goto Finish; - - InputBufferLessThanOneVectorInLength: - - // These code paths get hit if the original input length was less than one vector in size. - // We can't perform vectorized reads at this point, so we'll fall back to reading primitives - // directly. Note that all of these reads are unaligned. - - Debug.Assert(bufferLength < SizeOfVector128); - - // QWORD drain - - if ((bufferLength & 8) != 0) + if (bufferLength <= width * 4) { - if (UIntPtr.Size == sizeof(ulong)) - { - // If we can use 64-bit tzcnt to count the number of leading ASCII bytes, prefer it. + // SAFETY: Use byref arithmetic over offset-based load to ensure + // that the last two vectors are loaded with a single LDP on ARM64. + ref byte pLastTwo = ref Unsafe.Add(ref pBuffer, bufferLength - width * 2); + v0 = T.LoadUnsafe(ref pBuffer); + v1 = T.LoadUnsafe(ref pBuffer, width); + v2 = T.LoadUnsafe(ref pLastTwo); + v3 = T.LoadUnsafe(ref pLastTwo, width); - ulong candidateUInt64 = Unsafe.ReadUnaligned(pBuffer); - if (!AllBytesInUInt64AreAscii(candidateUInt64)) - { - // Clear everything but the high bit of each byte, then tzcnt. - // Remember to divide by 8 at the end to convert bit count to byte count. + T pair0 = v0 | v1; + T pair1 = v2 | v3; - candidateUInt64 &= UInt64HighBitsOnlyMask; - pBuffer += (nuint)(BitOperations.TrailingZeroCount(candidateUInt64) >> 3); - goto Finish; - } + if ((pair0 & T.Create(0x80)) != T.Zero) + { + v2 = v0; + v3 = v1; + goto FoundV23; } - else + if ((pair1 & T.Create(0x80)) != T.Zero) { - // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead. - - currentDWord = Unsafe.ReadUnaligned(pBuffer); - uint nextDWord = Unsafe.ReadUnaligned(pBuffer + 4); - - if (!AllBytesInUInt32AreAscii(currentDWord | nextDWord)) - { - // At least one of the values wasn't all-ASCII. - // We need to figure out which one it was and stick it in the currentMask local. - - if (AllBytesInUInt32AreAscii(currentDWord)) - { - currentDWord = nextDWord; // this one is the culprit - pBuffer += 4; - } - - goto FoundNonAsciiDataInCurrentDWord; - } + offset = bufferLength - (width * 2); + goto FoundV23; } - pBuffer += 8; // successfully consumed 8 ASCII bytes + return bufferLength; } - // DWORD drain - - if ((bufferLength & 4) != 0) + // SAFETY: We only use the value of .AsPointer to calculate the byte offset + // to the next vector boundary. Should GC relocate the buffer, we will lose the + // alignment which is an acceptable trade-off over pinning the buffer, once + // the callers of this method are updated to use byrefs. + nuint align = (nuint)Unsafe.AsPointer(ref pBuffer) % width; + if (align != 0) { - currentDWord = Unsafe.ReadUnaligned(pBuffer); + v0 = T.LoadUnsafe(ref pBuffer); + if ((v0 & T.Create(0x80)) != T.Zero) + { + goto FoundV0; + } + offset += width - align; + } - if (!AllBytesInUInt32AreAscii(currentDWord)) + nuint last = bufferLength - width * 4; + while (offset <= last) + { + (v0, v1, v2, v3) = Load4(ref pBuffer, offset); + if (!Test4(v0, v1, v2, v3)) { - goto FoundNonAsciiDataInCurrentDWord; + offset += width * 4; + continue; } - pBuffer += 4; // successfully consumed 4 ASCII bytes + goto FoundV0123; } - // WORD drain - // (We movzx to a DWORD for ease of manipulation.) - - if ((bufferLength & 2) != 0) + (v0, v1, v2, v3) = Load4(ref pBuffer, last); + if (Test4(v0, v1, v2, v3)) { - currentDWord = Unsafe.ReadUnaligned(pBuffer); + offset = last; + goto FoundV0123; + } - if (!AllBytesInUInt32AreAscii(currentDWord)) - { - // We only care about the 0x0080 bit of the value. If it's not set, then we - // increment currentOffset by 1. If it's set, we don't increment it at all. + return bufferLength; - pBuffer += (nuint)((nint)(sbyte)currentDWord >> 7) + 1; - goto Finish; - } + FoundV0123: + nuint index = IndexOfNonAscii2(v0, v1); - pBuffer += 2; // successfully consumed 2 ASCII bytes + if (index < width * 2) + { + return index + offset; } + offset += width * 2; - // BYTE drain + FoundV23: + return offset + IndexOfNonAscii2(v2, v3); - if ((bufferLength & 1) != 0) + FoundV0: + return offset + IndexOfNonAscii(v0); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static (T, T, T, T) Load4(ref byte pBuffer, nuint offset) { - // sbyte has non-negative value if byte is ASCII. + // Note: AdvSimd.Arm64.Load4xVector128 does not appear to work + // for the pointer-based implementation, likely due to the interaction + // with generics breaking consecutive register allocation. + ref byte pCurrent = ref Unsafe.Add(ref pBuffer, offset); + return ( + T.LoadUnsafe(ref pCurrent), + T.LoadUnsafe(ref pCurrent, (nuint)T.Count), + T.LoadUnsafe(ref pCurrent, (nuint)T.Count * 2), + T.LoadUnsafe(ref pCurrent, (nuint)T.Count * 3)); + } - if (*(sbyte*)(pBuffer) >= 0) - { - pBuffer++; // successfully consumed a single byte - } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static bool Test4(T v0, T v1, T v2, T v3) + { + return ((v0 | v1 | v2 | v3) & T.Create(0x80)) != T.Zero; } + } - goto Finish; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static nuint IndexOfNonAscii(T value) + where T : ISimdVector + { + return value switch + { + Vector128 v128 => AdvSimd.IsSupported + ? IndexOfNonAsciiArm64(v128) + : nuint.TrailingZeroCount(v128.ExtractMostSignificantBits()), + Vector256 v256 => nuint.TrailingZeroCount(v256.ExtractMostSignificantBits()), + Vector512 v512 => (nuint)ulong.TrailingZeroCount(v512.ExtractMostSignificantBits()), + _ => throw new NotSupportedException(), + }; + + [CompExactlyDependsOn(typeof(AdvSimd))] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static nuint IndexOfNonAsciiArm64(Vector128 value) + { + value = Vector128.GreaterThanOrEqual(value, Vector128.Create((byte)0x80)); + ulong shrn = AdvSimd + .ShiftRightLogicalNarrowingLower(value.AsUInt16(), 4) + .AsUInt64() + .ToScalar(); + return (nuint)(ulong.TrailingZeroCount(shrn) / 4); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static nuint IndexOfNonAscii2(T v0, T v1) + where T : ISimdVector + { + nuint width = (nuint)T.Count; + nuint offset0 = IndexOfNonAscii(v0); + nuint offset1 = IndexOfNonAscii(v1); + return offset0 < width ? offset0 : offset1 + width; } /// diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index a542dad72b5c33..b4558be5d3116d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -26,8 +26,9 @@ internal static unsafe partial class Utf8Utility Debug.Assert(inputLength >= 0, "Input length must not be negative."); Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null."); + // TODO: Migrate to byref // First, try to drain off as many ASCII bytes as we can from the beginning. - nuint numAsciiBytesCounted = Ascii.GetIndexOfFirstNonAsciiByte(pInputBuffer, (uint)inputLength); + nuint numAsciiBytesCounted = Ascii.GetIndexOfFirstNonAsciiByte(ref Unsafe.AsRef(pInputBuffer), (uint)inputLength); pInputBuffer += numAsciiBytesCounted; // Quick check - did we just end up consuming the entire input buffer?