From 191485c18706dc29a14402c44298993c7ab7219c Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 21 Jul 2020 20:38:16 -0700 Subject: [PATCH] [release/5.0-preview8] ARM64 HWIntrinsic usage in System.Text.Unicode (#39738) * AdvSimd support for System.Text.Unicode.Utf16Utility.GetPointerToFirstInvalidChar (#39050) * AdvSimd support for System.Text.Unicode.Utf16Utility.GetPointerToFirstInvalidChar * Move using directive outside #if. Improve Arm64MoveMask. * Change overloads * UIn64 in Arm64MoveMask * Build error implicit conversion fix * Rename method and use simpler version * Use ShiftRightArithmetic instead of CompareEqual + And. * Remove unnecessary comment * Add missing shims causing Linux build to fail * AdvSimd support for System.Text.Unicode.Utf8Utility.TranscodeToUtf8 (#39041) * AdvSimd support for System.Text.Unicode.Utf8Utility.TranscodeToUtf8 * Readd using to prevent build failure. Add AdvSimd equivalent operation to TestZ. * Inverted condition * Address IsSupported order, improve use ExtractNarrowingSaturated usage * Rename source to result, second argument utf16Data * Improve CompareTest * Add shims causing failures in Linux * Use unsigned version of ExtractNarrowingSaturate, avoid using MinAcross and use MaxPairwise instead * Missing support check for Sse2.X64 * Add missing case for AdvSimd * Use MinPairwise for short * AdvSimd support for System.Text.Unicode.Utf8Utility.GetPointerToFirstInvalidByte (#38653) * AdvSimd support for System.Text.Unicode.Utf8Utility.GetPointerToFirstInvalidByte * Move comment to the top, add shims. * Little endian checks * Use custom MoveMask method for AdvSimd * Address suggestions to improve the AdvSimdMoveMask method * Define initialMask outside MoveMask method * UInt64 in Arm64MoveMask * Add unit test case to verify intrinsics improvement * Avoid casting to smaller integer type * Typo and comment * Use ShiftRightArithmetic instead of CompareEqual + And. Remove test case causing other unit tests to fail. * Use AddPairwise version of GetNotAsciiBytes * Add missing shims causing Linux build to fail * Simplify GetNonAsciiBytes to only one AddPairwise call, shorter bitmask * Respect data type returned by masking method * Address suggestions - assert trailingzerocount and bring back uint mask * Trailing zeroes in AdvSimd need to be divided by 4, and total number should not be larger than 16 * Avoid declaring static field which causes PNSE in Utf8String.Experimental (S.P.Corelib code is used for being NetStandard) * Prefer using nuint for BitConverter.TrailingZeroCount * Fix build failure in net472 debug AdvSimd Utf16Utility (#39652) Co-authored-by: Carlos Sanchez Lopez <1175054+carlossanlop@users.noreply.github.com> --- .../Text/Unicode/Utf16Utility.Validation.cs | 90 ++++++++++++++++--- .../Text/Unicode/Utf8Utility.Transcoding.cs | 64 ++++++++++--- .../Text/Unicode/Utf8Utility.Validation.cs | 64 +++++++++---- .../Runtime/Intrinsics/Intrinsics.Shims.cs | 4 + 4 files changed, 181 insertions(+), 41 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.Validation.cs index 395a887f9b4230..f2df0ccdf53c42 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.Validation.cs @@ -4,6 +4,7 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; using System.Numerics; @@ -78,7 +79,7 @@ static Utf16Utility() long tempUtf8CodeUnitCountAdjustment = 0; int tempScalarCountAdjustment = 0; - if (Sse2.IsSupported) + if ((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported) { if (inputLength >= Vector128.Count) { @@ -87,17 +88,34 @@ static Utf16Utility() Vector128 vector8800 = Vector128.Create(unchecked((short)0x8800)); Vector128 vectorZero = Vector128.Zero; + Vector128 bitMask128 = BitConverter.IsLittleEndian ? + Vector128.Create(0x80402010_08040201).AsByte() : + Vector128.Create(0x01020408_10204080).AsByte(); + do { - Vector128 utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned - uint mask; + Vector128 utf16Data; + if (AdvSimd.Arm64.IsSupported) + { + utf16Data = AdvSimd.LoadVector128((ushort*)pInputBuffer); // unaligned + } + else + { + utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned + } Vector128 charIsNonAscii; - if (Sse41.IsSupported) + + if (AdvSimd.Arm64.IsSupported) + { + // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding + // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.) + charIsNonAscii = AdvSimd.Min(utf16Data, vector0080); + } + else if (Sse41.IsSupported) { // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.) - charIsNonAscii = Sse41.Min(utf16Data, vector0080); } else @@ -111,16 +129,34 @@ static Utf16Utility() #if DEBUG // Quick check to ensure we didn't accidentally set the 0x8000 bit of any element. - uint debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte()); + uint debugMask; + if (AdvSimd.Arm64.IsSupported) + { + debugMask = GetNonAsciiBytes(charIsNonAscii.AsByte(), bitMask128); + } + else + { + debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte()); + } Debug.Assert((debugMask & 0b_1010_1010_1010_1010) == 0, "Shouldn't have set the 0x8000 bit of any element in 'charIsNonAscii'."); #endif // DEBUG // Sets the 0x8080 bits of each element in 'charIsNonAscii' if the corresponding // input was 0x0800 <= [value]. This also handles the missing range a few lines above. - Vector128 charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11)); + Vector128 charIsThreeByteUtf8Encoded; + uint mask; - mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte()); + if (AdvSimd.IsSupported) + { + charIsThreeByteUtf8Encoded = AdvSimd.Subtract(vectorZero, AdvSimd.ShiftRightLogical(utf16Data, 11)); + mask = GetNonAsciiBytes(AdvSimd.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte(), bitMask128); + } + else + { + charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11)); + mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte()); + } // Each even bit of mask will be 1 only if the char was >= 0x0080, // and each odd bit of mask will be 1 only if the char was >= 0x0800. @@ -151,9 +187,16 @@ static Utf16Utility() // Surrogates need to be special-cased for two reasons: (a) we need // to account for the fact that we over-counted in the addition above; // and (b) they require separate validation. - - utf16Data = Sse2.Add(utf16Data, vectorA800); - mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte()); + if (AdvSimd.Arm64.IsSupported) + { + utf16Data = AdvSimd.Add(utf16Data, vectorA800); + mask = GetNonAsciiBytes(AdvSimd.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte(), bitMask128); + } + else + { + utf16Data = Sse2.Add(utf16Data, vectorA800); + mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte()); + } if (mask != 0) { @@ -178,7 +221,15 @@ static Utf16Utility() // Since 'mask' already has 00 in these positions (since the corresponding char // wasn't a surrogate), "mask AND mask2 == 00" holds for these positions. - uint mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte()); + uint mask2; + if (AdvSimd.Arm64.IsSupported) + { + mask2 = GetNonAsciiBytes(AdvSimd.ShiftRightLogical(utf16Data, 3).AsByte(), bitMask128); + } + else + { + mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte()); + } // 'lowSurrogatesMask' has its bits occur in pairs: // - 01 if the corresponding char was a low surrogate char, @@ -433,5 +484,20 @@ static Utf16Utility() scalarCountAdjustment = tempScalarCountAdjustment; return pInputBuffer; } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint GetNonAsciiBytes(Vector128 value, Vector128 bitMask128) + { + Debug.Assert(AdvSimd.Arm64.IsSupported); + + Vector128 mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte(); + Vector128 extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitMask128); + + // self-pairwise add until all flags have moved to the first two bytes of the vector + extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); + extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); + extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); + return extractedBits.AsUInt16().ToScalar(); + } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs index ee9f789a40fab0..b0c1376611b9e7 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs @@ -6,6 +6,7 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; #if SYSTEM_PRIVATE_CORELIB @@ -882,7 +883,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt // is not enabled. Unsafe.SkipInit(out Vector128 nonAsciiUtf16DataMask); - if (Sse41.X64.IsSupported) + if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) { nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char } @@ -940,10 +941,8 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2; uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining); - if (Sse41.X64.IsSupported) + if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) { - Debug.Assert(BitConverter.IsLittleEndian, "SSE41 requires little-endian."); - // Try reading and writing 8 elements per iteration. uint maxIters = minElementsRemaining / 8; ulong possibleNonAsciiQWord; @@ -952,14 +951,30 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt for (i = 0; (uint)i < maxIters; i++) { utf16Data = Unsafe.ReadUnaligned>(pInputBuffer); - if (!Sse41.TestZ(utf16Data, nonAsciiUtf16DataMask)) + + if (AdvSimd.IsSupported) { - goto LoopTerminatedDueToNonAsciiDataInVectorLocal; - } + Vector128 isUtf16DataNonAscii = AdvSimd.CompareTest(utf16Data, nonAsciiUtf16DataMask); + bool hasNonAsciiDataInVector = AdvSimd.Arm64.MinPairwise(isUtf16DataNonAscii, isUtf16DataNonAscii).AsUInt64().ToScalar() != 0; - // narrow and write + if (hasNonAsciiDataInVector) + { + goto LoopTerminatedDueToNonAsciiDataInVectorLocal; + } - Sse2.StoreScalar((ulong*)pOutputBuffer /* unaligned */, Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt64()); + Vector64 lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(utf16Data); + AdvSimd.Store(pOutputBuffer, lower); + } + else + { + if (!Sse41.TestZ(utf16Data, nonAsciiUtf16DataMask)) + { + goto LoopTerminatedDueToNonAsciiDataInVectorLocal; + } + + // narrow and write + Sse2.StoreScalar((ulong*)pOutputBuffer /* unaligned */, Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt64()); + } pInputBuffer += 8; pOutputBuffer += 8; @@ -978,7 +993,16 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt } utf16Data = Vector128.CreateScalarUnsafe(possibleNonAsciiQWord).AsInt16(); - Unsafe.WriteUnaligned(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32())); + + if (AdvSimd.IsSupported) + { + Vector64 lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(utf16Data); + AdvSimd.StoreSelectedScalar((uint*)pOutputBuffer, lower.AsUInt32(), 0); + } + else + { + Unsafe.WriteUnaligned(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32())); + } pInputBuffer += 4; pOutputBuffer += 4; @@ -990,7 +1014,15 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt LoopTerminatedDueToNonAsciiDataInVectorLocal: outputBytesRemaining -= 8 * i; - possibleNonAsciiQWord = Sse2.X64.ConvertToUInt64(utf16Data.AsUInt64()); + + if (Sse2.X64.IsSupported) + { + possibleNonAsciiQWord = Sse2.X64.ConvertToUInt64(utf16Data.AsUInt64()); + } + else + { + possibleNonAsciiQWord = utf16Data.AsUInt64().ToScalar(); + } // Temporarily set 'possibleNonAsciiQWord' to be the low 64 bits of the vector, // then check whether it's all-ASCII. If so, narrow and write to the destination @@ -1000,7 +1032,15 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt if (Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)) // all chars in first QWORD are ASCII { - Unsafe.WriteUnaligned(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32())); + if (AdvSimd.IsSupported) + { + Vector64 lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(utf16Data); + AdvSimd.StoreSelectedScalar((uint*)pOutputBuffer, lower.AsUInt32(), 0); + } + else + { + Unsafe.WriteUnaligned(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32())); + } pInputBuffer += 4; pOutputBuffer += 4; outputBytesRemaining -= 4; diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index 33e5181363920b..7730708030ebaf 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -4,6 +4,8 @@ using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; #if SYSTEM_PRIVATE_CORELIB @@ -117,22 +119,35 @@ internal static unsafe partial class Utf8Utility // the alignment check consumes at most a single DWORD.) byte* pInputBufferFinalPosAtWhichCanSafelyLoop = pFinalPosWhereCanReadDWordFromInputBuffer - 3 * sizeof(uint); // can safely read 4 DWORDs here - uint mask; + nuint trailingZeroCount; + + Vector128 bitMask128 = BitConverter.IsLittleEndian ? + Vector128.Create((ushort)0x1001).AsByte() : + Vector128.Create((ushort)0x0110).AsByte(); do { - if (Sse2.IsSupported) + // pInputBuffer is 32-bit aligned but not necessary 128-bit aligned, so we're + // going to perform an unaligned load. We don't necessarily care about aligning + // this because we pessimistically assume we'll encounter non-ASCII data at some + // point in the not-too-distant future (otherwise we would've stayed entirely + // within the all-ASCII vectorized code at the entry to this method). + if (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) { - // pInputBuffer is 32-bit aligned but not necessary 128-bit aligned, so we're - // going to perform an unaligned load. We don't necessarily care about aligning - // this because we pessimistically assume we'll encounter non-ASCII data at some - // point in the not-too-distant future (otherwise we would've stayed entirely - // within the all-ASCII vectorized code at the entry to this method). - - mask = (uint)Sse2.MoveMask(Sse2.LoadVector128((byte*)pInputBuffer)); + ulong mask = GetNonAsciiBytes(AdvSimd.LoadVector128(pInputBuffer), bitMask128); + if (mask != 0) + { + trailingZeroCount = (nuint)BitOperations.TrailingZeroCount(mask) >> 2; + goto LoopTerminatedEarlyDueToNonAsciiData; + } + } + else if (Sse2.IsSupported) + { + uint mask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pInputBuffer)); if (mask != 0) { - goto Sse2LoopTerminatedEarlyDueToNonAsciiData; + trailingZeroCount = (nuint)BitOperations.TrailingZeroCount(mask); + goto LoopTerminatedEarlyDueToNonAsciiData; } } else @@ -153,19 +168,20 @@ internal static unsafe partial class Utf8Utility continue; // need to perform a bounds check because we might be running out of data - Sse2LoopTerminatedEarlyDueToNonAsciiData: + LoopTerminatedEarlyDueToNonAsciiData: + // x86 can only be little endian, while ARM can be big or little endian + // so if we reached this label we need to check both combinations are supported + Debug.Assert((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported); - Debug.Assert(BitConverter.IsLittleEndian); - Debug.Assert(Sse2.IsSupported); // The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit - // for each non-ASCII byte we saw. We can count the number of ASCII bytes, + // for each non-ASCII byte we saw. trailingZeroCount will count the number of ASCII bytes, // bump our input counter by that amount, and resume processing from the // "the first byte is no longer ASCII" portion of the main loop. + // We should not expect a total number of zeroes equal or larger than 16. + Debug.Assert(trailingZeroCount < 16); - Debug.Assert(mask != 0); - - pInputBuffer += BitOperations.TrailingZeroCount(mask); + pInputBuffer += trailingZeroCount; if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer) { goto ProcessRemainingBytesSlow; @@ -719,5 +735,19 @@ internal static unsafe partial class Utf8Utility scalarCountAdjustment = tempScalarCountAdjustment; return pInputBuffer; } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bitMask128) + { + if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian) + { + throw new PlatformNotSupportedException(); + } + + Vector128 mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte(); + Vector128 extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitMask128); + extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); + return extractedBits.AsUInt64().ToScalar(); + } } } diff --git a/src/libraries/System.Utf8String.Experimental/src/System/Runtime/Intrinsics/Intrinsics.Shims.cs b/src/libraries/System.Utf8String.Experimental/src/System/Runtime/Intrinsics/Intrinsics.Shims.cs index e5f9284769d6b5..fa4602b33f429f 100644 --- a/src/libraries/System.Utf8String.Experimental/src/System/Runtime/Intrinsics/Intrinsics.Shims.cs +++ b/src/libraries/System.Utf8String.Experimental/src/System/Runtime/Intrinsics/Intrinsics.Shims.cs @@ -8,6 +8,7 @@ internal static class Vector64 public static Vector64 Create(ulong value) => throw new PlatformNotSupportedException(); public static Vector64 CreateScalar(uint value) => throw new PlatformNotSupportedException(); public static Vector64 AsByte(this Vector64 vector) where T : struct => throw new PlatformNotSupportedException(); + public static Vector64 AsUInt32(this Vector64 vector) where T : struct => throw new PlatformNotSupportedException(); } internal readonly struct Vector64 where T : struct @@ -16,11 +17,14 @@ internal readonly struct Vector64 internal static class Vector128 { + public static Vector128 Create(long value) => throw new PlatformNotSupportedException(); public static Vector128 Create(short value) => throw new PlatformNotSupportedException(); + public static Vector128 Create(ulong value) => throw new PlatformNotSupportedException(); public static Vector128 Create(ushort value) => throw new PlatformNotSupportedException(); public static Vector128 CreateScalarUnsafe(ulong value) => throw new PlatformNotSupportedException(); public static Vector128 AsByte(this Vector128 vector) where T : struct => throw new PlatformNotSupportedException(); public static Vector128 AsInt16(this Vector128 vector) where T : struct => throw new PlatformNotSupportedException(); + public static Vector128 AsSByte(this Vector128 vector) where T : struct => throw new PlatformNotSupportedException(); public static Vector128 AsUInt16(this Vector128 vector) where T : struct => throw new PlatformNotSupportedException(); public static Vector128 AsUInt32(this Vector128 vector) where T : struct => throw new PlatformNotSupportedException(); public static Vector128 AsUInt64(this Vector128 vector) where T : struct => throw new PlatformNotSupportedException();