-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize BitOperations uses in CoreLib #35650
Changes from all commits
4fc5b57
7691b61
e0a0cfa
56cfa0b
eaa947b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1742,33 +1742,11 @@ private static int LocateLastFoundByte(Vector<byte> match) | |
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static int LocateFirstFoundByte(ulong match) | ||
{ | ||
if (Bmi1.X64.IsSupported) | ||
{ | ||
return (int)(Bmi1.X64.TrailingZeroCount(match) >> 3); | ||
} | ||
else | ||
{ | ||
// Flag least significant power of two bit | ||
ulong powerOfTwoFlag = match ^ (match - 1); | ||
// Shift all powers of two into the high byte and extract | ||
return (int)((powerOfTwoFlag * XorPowerOfTwoToHighByte) >> 57); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
} | ||
} | ||
=> BitOperations.TrailingZeroCount(match) >> 3; | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static int LocateLastFoundByte(ulong match) | ||
{ | ||
return 7 - (BitOperations.LeadingZeroCount(match) >> 3); | ||
} | ||
|
||
private const ulong XorPowerOfTwoToHighByte = (0x07ul | | ||
0x06ul << 8 | | ||
0x05ul << 16 | | ||
0x04ul << 24 | | ||
0x03ul << 32 | | ||
0x02ul << 40 | | ||
0x01ul << 48) + 1; | ||
=> BitOperations.Log2(match) >> 3; | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static ushort LoadUShort(ref byte start) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -984,27 +984,7 @@ private static int LocateFirstFoundChar(Vector<ushort> match) | |
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static int LocateFirstFoundChar(ulong match) | ||
{ | ||
// TODO: Arm variants | ||
if (Bmi1.X64.IsSupported) | ||
{ | ||
return (int)(Bmi1.X64.TrailingZeroCount(match) >> 4); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This one is still reachable, but the fallback note from above applies. |
||
} | ||
else | ||
{ | ||
unchecked | ||
{ | ||
// Flag least significant power of two bit | ||
ulong powerOfTwoFlag = match ^ (match - 1); | ||
// Shift all powers of two into the high byte and extract | ||
return (int)((powerOfTwoFlag * XorPowerOfTwoToHighChar) >> 49); | ||
} | ||
} | ||
} | ||
|
||
private const ulong XorPowerOfTwoToHighChar = (0x03ul | | ||
0x02ul << 16 | | ||
0x01ul << 32) + 1; | ||
=> BitOperations.TrailingZeroCount(match) >> 4; | ||
|
||
// Vector sub-search adapted from https://github.com/aspnet/KestrelHttpServer/pull/1138 | ||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
|
@@ -1029,9 +1009,7 @@ private static int LocateLastFoundChar(Vector<ushort> match) | |
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static int LocateLastFoundChar(ulong match) | ||
{ | ||
return 3 - (BitOperations.LeadingZeroCount(match) >> 4); | ||
} | ||
=> BitOperations.Log2(match) >> 4; | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static Vector<ushort> LoadVector(ref char start, nint offset) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,44 +44,19 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat | |
{ | ||
Debug.Assert(!AllBytesInUInt32AreAscii(value), "Caller shouldn't provide an all-ASCII value."); | ||
|
||
// Use BMI1 directly rather than going through BitOperations. We only see a perf gain here | ||
// if we're able to emit a real tzcnt instruction; the software fallback used by BitOperations | ||
// is too slow for our purposes since we can provide our own faster, specialized software fallback. | ||
|
||
if (Bmi1.IsSupported) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried to preserve the intent of this one, which was that hardware accelerated There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the heads-up! :) |
||
{ | ||
Debug.Assert(BitConverter.IsLittleEndian); | ||
return Bmi1.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3; | ||
} | ||
|
||
// Couldn't emit tzcnt, use specialized software fallback. | ||
// The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending | ||
// on whether all processed bytes were ASCII. Then we accumulate all of the | ||
// results to calculate how many consecutive ASCII bytes are present. | ||
|
||
value = ~value; | ||
|
||
if (BitConverter.IsLittleEndian) | ||
{ | ||
// Read first byte | ||
value >>= 7; | ||
uint allBytesUpToNowAreAscii = value & 1; | ||
uint numAsciiBytes = allBytesUpToNowAreAscii; | ||
|
||
// Read second byte | ||
value >>= 8; | ||
allBytesUpToNowAreAscii &= value; | ||
numAsciiBytes += allBytesUpToNowAreAscii; | ||
|
||
// Read third byte | ||
value >>= 8; | ||
allBytesUpToNowAreAscii &= value; | ||
numAsciiBytes += allBytesUpToNowAreAscii; | ||
|
||
return numAsciiBytes; | ||
return (uint)BitOperations.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3; | ||
} | ||
else | ||
{ | ||
// Couldn't use tzcnt, use specialized software fallback. | ||
// The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending | ||
// on whether all processed bytes were ASCII. Then we accumulate all of the | ||
// results to calculate how many consecutive ASCII bytes are present. | ||
|
||
value = ~value; | ||
|
||
// BinaryPrimitives.ReverseEndianness is only implemented as an intrinsic on | ||
// little-endian platforms, so using it in this big-endian path would be too | ||
// expensive. Instead we'll just change how we perform the shifts. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -410,18 +410,18 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin | |
|
||
if ((bufferLength & 8) != 0) | ||
{ | ||
if (Bmi1.X64.IsSupported) | ||
if (UIntPtr.Size == sizeof(ulong)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, this path should be faster on all supported 64-bit platforms, but I wasn't sure how that's best expressed here. Should this use conditional compilation, or an explicit |
||
{ | ||
// If we can use 64-bit tzcnt to count the number of leading ASCII bytes, prefer it. | ||
|
||
ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer); | ||
if (!AllBytesInUInt64AreAscii(candidateUInt64)) | ||
{ | ||
// Clear everything but the high bit of each byte, then tzcnt. | ||
// Remember the / 8 at the end to convert bit count to byte count. | ||
// Remember to divide by 8 at the end to convert bit count to byte count. | ||
|
||
candidateUInt64 &= UInt64HighBitsOnlyMask; | ||
pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8); | ||
pBuffer += (nuint)(BitOperations.TrailingZeroCount(candidateUInt64) >> 3); | ||
goto Finish; | ||
} | ||
} | ||
|
@@ -932,20 +932,20 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin | |
|
||
if ((bufferLength & 4) != 0) | ||
{ | ||
if (Bmi1.X64.IsSupported) | ||
if (UIntPtr.Size == sizeof(ulong)) | ||
{ | ||
// If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it. | ||
|
||
ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer); | ||
if (!AllCharsInUInt64AreAscii(candidateUInt64)) | ||
{ | ||
// Clear the low 7 bits (the ASCII bits) of each char, then tzcnt. | ||
// Remember the / 8 at the end to convert bit count to byte count, | ||
// Remember to divide by 8 at the end to convert bit count to byte count, | ||
// then the & ~1 at the end to treat a match in the high byte of | ||
// any char the same as a match in the low byte of that same char. | ||
|
||
candidateUInt64 &= 0xFF80FF80_FF80FF80ul; | ||
pBuffer = (char*)((byte*)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8) & ~(nuint)1)); | ||
pBuffer = (char*)((byte*)pBuffer + ((nuint)(BitOperations.TrailingZeroCount(candidateUInt64) >> 3) & ~(nuint)1)); | ||
goto Finish; | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This code was dead as of #32371, since it was used only on the `Vector<byte>` fallback for when SSE2 is not available. Since SSE2 is required for BMI1, this was unreachable.