Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize BitOperations uses in CoreLib #35650

Merged
merged 5 commits into from
May 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ public static int CountDigits(uint value)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int CountHexDigits(ulong value)
{
return (64 - BitOperations.LeadingZeroCount(value | 1) + 3) >> 2;
// The number of hex digits is log16(value) + 1, or log2(value) / 4 + 1
return (BitOperations.Log2(value) >> 2) + 1;
}

// Counts the number of trailing '0' digits in a decimal number.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,13 @@ internal static class Utilities
internal static int SelectBucketIndex(int bufferSize)
{
Debug.Assert(bufferSize >= 0);
uint bits = ((uint)bufferSize - 1) >> 4;
return 32 - BitOperations.LeadingZeroCount(bits);

// Buffers are bucketed so that a request between 2^(n-1) + 1 and 2^n is given a buffer of 2^n
// Bucket index is log2(bufferSize - 1) with the exception that buffers between 1 and 16 bytes
// are combined, and the index is slid down by 3 to compensate.
// Zero is a valid bufferSize, and it is assigned the highest bucket index so that zero-length
// buffers are not retained by the pool. The pool will return the Array.Empty singleton for these.
return BitOperations.Log2((uint)bufferSize - 1 | 15) - 3;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,21 +123,18 @@ public static int LeadingZeroCount(ulong value)

/// <summary>
/// Returns the integer (floor) log of the specified value, base 2.
/// Note that by convention, input value 0 returns 0 since Log(0) is undefined.
/// Note that by convention, input value 0 returns 0 since log(0) is undefined.
/// </summary>
/// <param name="value">The value.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CLSCompliant(false)]
public static int Log2(uint value)
{
// Enforce conventional contract 0->0 (Log(0) is undefined)
if (value == 0)
{
return 0;
}
// The 0->0 contract is fulfilled by setting the LSB to 1.
// Log(1) is 0, and setting the LSB for values > 1 does not change the log2 result.
value |= 1;

// value lzcnt actual expected
// ..0000 32 0 0 (by convention, guard clause)
// ..0001 31 31-31 0
// ..0010 30 31-30 1
// 0010.. 2 31-2 29
Expand All @@ -153,8 +150,8 @@ public static int Log2(uint value)
return 31 ^ ArmBase.LeadingZeroCount(value);
}

// BSR returns the answer we're looking for directly.
// However BSR is much slower than LZCNT on AMD processors, so we leave it as a fallback only.
// BSR returns the log2 result directly. However BSR is slower than LZCNT
// on AMD processors, so we leave it as a fallback only.
if (X86Base.IsSupported)
{
return (int)X86Base.BitScanReverse(value);
Expand All @@ -166,18 +163,14 @@ public static int Log2(uint value)

/// <summary>
/// Returns the integer (floor) log of the specified value, base 2.
/// Note that by convention, input value 0 returns 0 since Log(0) is undefined.
/// Note that by convention, input value 0 returns 0 since log(0) is undefined.
/// </summary>
/// <param name="value">The value.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CLSCompliant(false)]
public static int Log2(ulong value)
{
// Enforce conventional contract 0->0 (Log(0) is undefined)
if (value == 0)
{
return 0;
}
value |= 1;

if (Lzcnt.X64.IsSupported)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1742,33 +1742,11 @@ private static int LocateLastFoundByte(Vector<byte> match)

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateFirstFoundByte(ulong match)
{
if (Bmi1.X64.IsSupported)
{
return (int)(Bmi1.X64.TrailingZeroCount(match) >> 3);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code was dead as of #32371, since it was used only on the Vector<byte> fallback for when SSE2 is not available. Since SSE2 is required for BMI1, this was unreachable.

}
else
{
// Flag least significant power of two bit
ulong powerOfTwoFlag = match ^ (match - 1);
// Shift all powers of two into the high byte and extract
return (int)((powerOfTwoFlag * XorPowerOfTwoToHighByte) >> 57);
Copy link
Member Author

@saucecontrol saucecontrol Apr 30, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BitOperations.TrailingZeroCount(ulong) is now hardware accelerated on all x64 and ARM64. On x86, the value is decomposed and accelerated as a 32-bit call, leaving ARM32 as the only candidate for this software fallback. Since it's doing 64-bit math, the decomposed LUT fallback in BitOperations is probably cheaper.

}
}
=> BitOperations.TrailingZeroCount(match) >> 3;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateLastFoundByte(ulong match)
{
return 7 - (BitOperations.LeadingZeroCount(match) >> 3);
}

private const ulong XorPowerOfTwoToHighByte = (0x07ul |
0x06ul << 8 |
0x05ul << 16 |
0x04ul << 24 |
0x03ul << 32 |
0x02ul << 40 |
0x01ul << 48) + 1;
=> BitOperations.Log2(match) >> 3;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ushort LoadUShort(ref byte start)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -984,27 +984,7 @@ private static int LocateFirstFoundChar(Vector<ushort> match)

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateFirstFoundChar(ulong match)
{
// TODO: Arm variants
if (Bmi1.X64.IsSupported)
{
return (int)(Bmi1.X64.TrailingZeroCount(match) >> 4);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one is still reachable, but the fallback note from above applies.

}
else
{
unchecked
{
// Flag least significant power of two bit
ulong powerOfTwoFlag = match ^ (match - 1);
// Shift all powers of two into the high byte and extract
return (int)((powerOfTwoFlag * XorPowerOfTwoToHighChar) >> 49);
}
}
}

private const ulong XorPowerOfTwoToHighChar = (0x03ul |
0x02ul << 16 |
0x01ul << 32) + 1;
=> BitOperations.TrailingZeroCount(match) >> 4;

// Vector sub-search adapted from https://github.com/aspnet/KestrelHttpServer/pull/1138
[MethodImpl(MethodImplOptions.AggressiveInlining)]
Expand All @@ -1029,9 +1009,7 @@ private static int LocateLastFoundChar(Vector<ushort> match)

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateLastFoundChar(ulong match)
{
return 3 - (BitOperations.LeadingZeroCount(match) >> 4);
}
=> BitOperations.Log2(match) >> 4;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<ushort> LoadVector(ref char start, nint offset)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,44 +44,19 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat
{
Debug.Assert(!AllBytesInUInt32AreAscii(value), "Caller shouldn't provide an all-ASCII value.");

// Use BMI1 directly rather than going through BitOperations. We only see a perf gain here
// if we're able to emit a real tzcnt instruction; the software fallback used by BitOperations
// is too slow for our purposes since we can provide our own faster, specialized software fallback.

if (Bmi1.IsSupported)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to preserve the intent of this one, which was that hardware accelerated tzcnt should be used on little-endian. I can't think of any LE case that wouldn't be better off using BitOperations now, but I might have missed something.
cc @GrabYourPitchforks

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the heads-up! :)
I'm not familiar enough with the larger change but this part LGTM.

{
Debug.Assert(BitConverter.IsLittleEndian);
return Bmi1.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3;
}

// Couldn't emit tzcnt, use specialized software fallback.
// The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending
// on whether all processed bytes were ASCII. Then we accumulate all of the
// results to calculate how many consecutive ASCII bytes are present.

value = ~value;

if (BitConverter.IsLittleEndian)
{
// Read first byte
value >>= 7;
uint allBytesUpToNowAreAscii = value & 1;
uint numAsciiBytes = allBytesUpToNowAreAscii;

// Read second byte
value >>= 8;
allBytesUpToNowAreAscii &= value;
numAsciiBytes += allBytesUpToNowAreAscii;

// Read third byte
value >>= 8;
allBytesUpToNowAreAscii &= value;
numAsciiBytes += allBytesUpToNowAreAscii;

return numAsciiBytes;
return (uint)BitOperations.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3;
}
else
{
// Couldn't use tzcnt, use specialized software fallback.
// The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending
// on whether all processed bytes were ASCII. Then we accumulate all of the
// results to calculate how many consecutive ASCII bytes are present.

value = ~value;

// BinaryPrimitives.ReverseEndianness is only implemented as an intrinsic on
// little-endian platforms, so using it in this big-endian path would be too
// expensive. Instead we'll just change how we perform the shifts.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -410,18 +410,18 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin

if ((bufferLength & 8) != 0)
{
if (Bmi1.X64.IsSupported)
if (UIntPtr.Size == sizeof(ulong))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, this path should be faster on all supported 64-bit platforms, but I wasn't sure how that's best expressed here. Should this use conditional compilation, or an explicit if (X86Base.X64.IsSupported || ArmBase.Arm64.IsSupported)?

{
// If we can use 64-bit tzcnt to count the number of leading ASCII bytes, prefer it.

ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
if (!AllBytesInUInt64AreAscii(candidateUInt64))
{
// Clear everything but the high bit of each byte, then tzcnt.
// Remember the / 8 at the end to convert bit count to byte count.
// Remember to divide by 8 at the end to convert bit count to byte count.

candidateUInt64 &= UInt64HighBitsOnlyMask;
pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8);
pBuffer += (nuint)(BitOperations.TrailingZeroCount(candidateUInt64) >> 3);
goto Finish;
}
}
Expand Down Expand Up @@ -932,20 +932,20 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin

if ((bufferLength & 4) != 0)
{
if (Bmi1.X64.IsSupported)
if (UIntPtr.Size == sizeof(ulong))
{
// If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it.

ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
if (!AllCharsInUInt64AreAscii(candidateUInt64))
{
// Clear the low 7 bits (the ASCII bits) of each char, then tzcnt.
// Remember the / 8 at the end to convert bit count to byte count,
// Remember to divide by 8 at the end to convert bit count to byte count,
// then the & ~1 at the end to treat a match in the high byte of
// any char the same as a match in the low byte of that same char.

candidateUInt64 &= 0xFF80FF80_FF80FF80ul;
pBuffer = (char*)((byte*)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8) & ~(nuint)1));
pBuffer = (char*)((byte*)pBuffer + ((nuint)(BitOperations.TrailingZeroCount(candidateUInt64) >> 3) & ~(nuint)1));
goto Finish;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ private static void _ValidateAdditionalNIntDefinitions()

do
{
if (Sse2.IsSupported && Bmi1.IsSupported)
if (Sse2.IsSupported)
{
// pInputBuffer is 32-bit aligned but not necessary 128-bit aligned, so we're
// going to perform an unaligned load. We don't necessarily care about aligning
Expand Down Expand Up @@ -180,7 +180,6 @@ private static void _ValidateAdditionalNIntDefinitions()

Debug.Assert(BitConverter.IsLittleEndian);
Debug.Assert(Sse2.IsSupported);
Debug.Assert(Bmi1.IsSupported);

// The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit
// for each non-ASCII byte we saw. We can count the number of ASCII bytes,
Expand All @@ -189,7 +188,7 @@ private static void _ValidateAdditionalNIntDefinitions()

Debug.Assert(mask != 0);

pInputBuffer += Bmi1.TrailingZeroCount(mask);
pInputBuffer += BitOperations.TrailingZeroCount(mask);
if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
{
goto ProcessRemainingBytesSlow;
Expand Down