Skip to content

Commit

Permalink
[release/5.0-preview8] ARM64 HWIntrinsic usage in System.Text.Unicode (
Browse files Browse the repository at this point in the history
…#39738)

* AdvSimd support for System.Text.Unicode.Utf16Utility.GetPointerToFirstInvalidChar (#39050)

* AdvSimd support for System.Text.Unicode.Utf16Utility.GetPointerToFirstInvalidChar

* Move using directive outside #if.
Improve Arm64MoveMask.

* Change overloads

* UInt64 in Arm64MoveMask

* Build error implicit conversion fix

* Rename method and use simpler version

* Use ShiftRightArithmetic instead of CompareEqual + And.

* Remove unnecessary comment

* Add missing shims causing Linux build to fail

* AdvSimd support for System.Text.Unicode.Utf8Utility.TranscodeToUtf8 (#39041)

* AdvSimd support for System.Text.Unicode.Utf8Utility.TranscodeToUtf8

* Re-add using directive to prevent build failure.
Add AdvSimd equivalent operation to TestZ.

* Inverted condition

* Address IsSupported order, improve use ExtractNarrowingSaturated usage

* Rename source to result, second argument utf16Data

* Improve CompareTest

* Add shims causing failures in Linux

* Use unsigned version of ExtractNarrowingSaturate, avoid using MinAcross and use MaxPairwise instead

* Missing support check for Sse2.X64

* Add missing case for AdvSimd

* Use MinPairwise for short

* AdvSimd support for System.Text.Unicode.Utf8Utility.GetPointerToFirstInvalidByte (#38653)

* AdvSimd support for System.Text.Unicode.Utf8Utility.GetPointerToFirstInvalidByte

* Move comment to the top, add shims.

* Little endian checks

* Use custom MoveMask method for AdvSimd

* Address suggestions to improve the AdvSimdMoveMask method

* Define initialMask outside MoveMask method

* UInt64 in Arm64MoveMask

* Add unit test case to verify intrinsics improvement

* Avoid casting to smaller integer type

* Typo and comment

* Use ShiftRightArithmetic instead of CompareEqual + And.
Remove test case causing other unit tests to fail.

* Use AddPairwise version of GetNotAsciiBytes

* Add missing shims causing Linux build to fail

* Simplify GetNonAsciiBytes to only one AddPairwise call, shorter bitmask

* Respect data type returned by masking method

* Address suggestions - assert trailingzerocount and bring back uint mask

* Trailing zeroes in AdvSimd need to be divided by 4, and total number should not be larger than 16

* Avoid declaring static field which causes PNSE in Utf8String.Experimental (S.P.Corelib code is used for being NetStandard)

* Prefer using nuint for BitOperations.TrailingZeroCount

* Fix build failure in net472 debug AdvSimd Utf16Utility (#39652)

Co-authored-by: Carlos Sanchez Lopez <[email protected]>
  • Loading branch information
tannergooding and carlossanlop authored Jul 22, 2020
1 parent 498c798 commit 191485c
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using System.Numerics;

Expand Down Expand Up @@ -78,7 +79,7 @@ static Utf16Utility()
long tempUtf8CodeUnitCountAdjustment = 0;
int tempScalarCountAdjustment = 0;

if (Sse2.IsSupported)
if ((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported)
{
if (inputLength >= Vector128<ushort>.Count)
{
Expand All @@ -87,17 +88,34 @@ static Utf16Utility()
Vector128<short> vector8800 = Vector128.Create(unchecked((short)0x8800));
Vector128<ushort> vectorZero = Vector128<ushort>.Zero;

Vector128<byte> bitMask128 = BitConverter.IsLittleEndian ?
Vector128.Create(0x80402010_08040201).AsByte() :
Vector128.Create(0x01020408_10204080).AsByte();

do
{
Vector128<ushort> utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned
uint mask;
Vector128<ushort> utf16Data;
if (AdvSimd.Arm64.IsSupported)
{
utf16Data = AdvSimd.LoadVector128((ushort*)pInputBuffer); // unaligned
}
else
{
utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned
}

Vector128<ushort> charIsNonAscii;
if (Sse41.IsSupported)

if (AdvSimd.Arm64.IsSupported)
{
// Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
// input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
charIsNonAscii = AdvSimd.Min(utf16Data, vector0080);
}
else if (Sse41.IsSupported)
{
// Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
// input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)

charIsNonAscii = Sse41.Min(utf16Data, vector0080);
}
else
Expand All @@ -111,16 +129,34 @@ static Utf16Utility()

#if DEBUG
// Quick check to ensure we didn't accidentally set the 0x8000 bit of any element.
uint debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
uint debugMask;
if (AdvSimd.Arm64.IsSupported)
{
debugMask = GetNonAsciiBytes(charIsNonAscii.AsByte(), bitMask128);
}
else
{
debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
}
Debug.Assert((debugMask & 0b_1010_1010_1010_1010) == 0, "Shouldn't have set the 0x8000 bit of any element in 'charIsNonAscii'.");
#endif // DEBUG

// Sets the 0x8080 bits of each element in 'charIsNonAscii' if the corresponding
// input was 0x0800 <= [value]. This also handles the missing range a few lines above.

Vector128<ushort> charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
Vector128<ushort> charIsThreeByteUtf8Encoded;
uint mask;

mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
if (AdvSimd.IsSupported)
{
charIsThreeByteUtf8Encoded = AdvSimd.Subtract(vectorZero, AdvSimd.ShiftRightLogical(utf16Data, 11));
mask = GetNonAsciiBytes(AdvSimd.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte(), bitMask128);
}
else
{
charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
}

// Each even bit of mask will be 1 only if the char was >= 0x0080,
// and each odd bit of mask will be 1 only if the char was >= 0x0800.
Expand Down Expand Up @@ -151,9 +187,16 @@ static Utf16Utility()
// Surrogates need to be special-cased for two reasons: (a) we need
// to account for the fact that we over-counted in the addition above;
// and (b) they require separate validation.

utf16Data = Sse2.Add(utf16Data, vectorA800);
mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
if (AdvSimd.Arm64.IsSupported)
{
utf16Data = AdvSimd.Add(utf16Data, vectorA800);
mask = GetNonAsciiBytes(AdvSimd.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte(), bitMask128);
}
else
{
utf16Data = Sse2.Add(utf16Data, vectorA800);
mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
}

if (mask != 0)
{
Expand All @@ -178,7 +221,15 @@ static Utf16Utility()
// Since 'mask' already has 00 in these positions (since the corresponding char
// wasn't a surrogate), "mask AND mask2 == 00" holds for these positions.

uint mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
uint mask2;
if (AdvSimd.Arm64.IsSupported)
{
mask2 = GetNonAsciiBytes(AdvSimd.ShiftRightLogical(utf16Data, 3).AsByte(), bitMask128);
}
else
{
mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
}

// 'lowSurrogatesMask' has its bits occur in pairs:
// - 01 if the corresponding char was a low surrogate char,
Expand Down Expand Up @@ -433,5 +484,20 @@ static Utf16Utility()
scalarCountAdjustment = tempScalarCountAdjustment;
return pInputBuffer;
}

/// <summary>
/// Collapses the most-significant bit of each of the 16 bytes in <paramref name="value"/>
/// into a 16-bit mask. Callers in this file use it on ARM64 in the places where the x86
/// path calls <c>Sse2.MoveMask</c> on a byte vector.
/// </summary>
/// <param name="value">The vector whose per-byte high bits are to be gathered.</param>
/// <param name="bitMask128">
/// Per-byte flag values that are ANDed with the "high bit is set" lanes before folding.
/// The caller in this file builds it from the constant 0x80402010_08040201 (or its
/// byte-reversed form on big-endian), i.e. one distinct bit per byte within each 8-byte half.
/// </param>
/// <returns>
/// The low 16 bits of the folded vector, widened to <see cref="uint"/>. The first result
/// byte is the sum of the flags for input bytes 0-7, the second for input bytes 8-15.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static uint GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bitMask128)
{
    Debug.Assert(AdvSimd.Arm64.IsSupported);

    // An arithmetic right shift by 7 replicates each byte's sign bit across the whole byte:
    // lanes become 0xFF when the high bit was set and 0x00 otherwise.
    Vector128<sbyte> signSpread = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7);

    // Keep only the flag bit that the caller assigned to each lane.
    Vector128<byte> laneFlags = AdvSimd.And(signSpread.AsByte(), bitMask128);

    // Fold the vector onto itself three times (16 -> 8 -> 4 -> 2 partial sums) so that
    // the combined flags end up in the first two bytes of the vector.
    Vector128<byte> foldedTo8 = AdvSimd.Arm64.AddPairwise(laneFlags, laneFlags);
    Vector128<byte> foldedTo4 = AdvSimd.Arm64.AddPairwise(foldedTo8, foldedTo8);
    Vector128<byte> foldedTo2 = AdvSimd.Arm64.AddPairwise(foldedTo4, foldedTo4);

    // The first 16-bit element now holds the complete mask.
    ushort packedMask = foldedTo2.AsUInt16().ToScalar();
    return packedMask;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;

#if SYSTEM_PRIVATE_CORELIB
Expand Down Expand Up @@ -882,7 +883,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
// is not enabled.

Unsafe.SkipInit(out Vector128<short> nonAsciiUtf16DataMask);
if (Sse41.X64.IsSupported)
if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian))
{
nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char
}
Expand Down Expand Up @@ -940,10 +941,8 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2;
uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining);

if (Sse41.X64.IsSupported)
if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian))
{
Debug.Assert(BitConverter.IsLittleEndian, "SSE41 requires little-endian.");

// Try reading and writing 8 elements per iteration.
uint maxIters = minElementsRemaining / 8;
ulong possibleNonAsciiQWord;
Expand All @@ -952,14 +951,30 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
for (i = 0; (uint)i < maxIters; i++)
{
utf16Data = Unsafe.ReadUnaligned<Vector128<short>>(pInputBuffer);
if (!Sse41.TestZ(utf16Data, nonAsciiUtf16DataMask))

if (AdvSimd.IsSupported)
{
goto LoopTerminatedDueToNonAsciiDataInVectorLocal;
}
Vector128<short> isUtf16DataNonAscii = AdvSimd.CompareTest(utf16Data, nonAsciiUtf16DataMask);
bool hasNonAsciiDataInVector = AdvSimd.Arm64.MinPairwise(isUtf16DataNonAscii, isUtf16DataNonAscii).AsUInt64().ToScalar() != 0;

// narrow and write
if (hasNonAsciiDataInVector)
{
goto LoopTerminatedDueToNonAsciiDataInVectorLocal;
}

Sse2.StoreScalar((ulong*)pOutputBuffer /* unaligned */, Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt64());
Vector64<byte> lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(utf16Data);
AdvSimd.Store(pOutputBuffer, lower);
}
else
{
if (!Sse41.TestZ(utf16Data, nonAsciiUtf16DataMask))
{
goto LoopTerminatedDueToNonAsciiDataInVectorLocal;
}

// narrow and write
Sse2.StoreScalar((ulong*)pOutputBuffer /* unaligned */, Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt64());
}

pInputBuffer += 8;
pOutputBuffer += 8;
Expand All @@ -978,7 +993,16 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
}

utf16Data = Vector128.CreateScalarUnsafe(possibleNonAsciiQWord).AsInt16();
Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));

if (AdvSimd.IsSupported)
{
Vector64<byte> lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(utf16Data);
AdvSimd.StoreSelectedScalar((uint*)pOutputBuffer, lower.AsUInt32(), 0);
}
else
{
Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));
}

pInputBuffer += 4;
pOutputBuffer += 4;
Expand All @@ -990,7 +1014,15 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
LoopTerminatedDueToNonAsciiDataInVectorLocal:

outputBytesRemaining -= 8 * i;
possibleNonAsciiQWord = Sse2.X64.ConvertToUInt64(utf16Data.AsUInt64());

if (Sse2.X64.IsSupported)
{
possibleNonAsciiQWord = Sse2.X64.ConvertToUInt64(utf16Data.AsUInt64());
}
else
{
possibleNonAsciiQWord = utf16Data.AsUInt64().ToScalar();
}

// Temporarily set 'possibleNonAsciiQWord' to be the low 64 bits of the vector,
// then check whether it's all-ASCII. If so, narrow and write to the destination
Expand All @@ -1000,7 +1032,15 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt

if (Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)) // all chars in first QWORD are ASCII
{
Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));
if (AdvSimd.IsSupported)
{
Vector64<byte> lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(utf16Data);
AdvSimd.StoreSelectedScalar((uint*)pOutputBuffer, lower.AsUInt32(), 0);
}
else
{
Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));
}
pInputBuffer += 4;
pOutputBuffer += 4;
outputBytesRemaining -= 4;
Expand Down
Loading

0 comments on commit 191485c

Please sign in to comment.