Skip to content

Commit

Permalink
Optimize BitOperations uses in CoreLib (#35650)
Browse files Browse the repository at this point in the history
* make Log2 branch-free

* convert LeadingZeroCount uses to Log2

* switch Bmi1.TrailingZeroCount calls to BitOperations
  • Loading branch information
saucecontrol authored May 23, 2020
1 parent 70ac9ae commit ba728d9
Show file tree
Hide file tree
Showing 8 changed files with 37 additions and 108 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ public static int CountDigits(uint value)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int CountHexDigits(ulong value)
{
return (64 - BitOperations.LeadingZeroCount(value | 1) + 3) >> 2;
// The number of hex digits is log16(value) + 1, or log2(value) / 4 + 1
return (BitOperations.Log2(value) >> 2) + 1;
}

// Counts the number of trailing '0' digits in a decimal number.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,13 @@ internal static class Utilities
internal static int SelectBucketIndex(int bufferSize)
{
Debug.Assert(bufferSize >= 0);
uint bits = ((uint)bufferSize - 1) >> 4;
return 32 - BitOperations.LeadingZeroCount(bits);

// Buffers are bucketed so that a request between 2^(n-1) + 1 and 2^n is given a buffer of 2^n
// Bucket index is log2(bufferSize - 1) with the exception that buffers between 1 and 16 bytes
// are combined, and the index is slid down by 3 to compensate.
// Zero is a valid bufferSize, and it is assigned the highest bucket index so that zero-length
// buffers are not retained by the pool. The pool will return the Array.Empty singleton for these.
return BitOperations.Log2((uint)bufferSize - 1 | 15) - 3;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,21 +123,18 @@ public static int LeadingZeroCount(ulong value)

/// <summary>
/// Returns the integer (floor) log of the specified value, base 2.
/// Note that by convention, input value 0 returns 0 since Log(0) is undefined.
/// Note that by convention, input value 0 returns 0 since log(0) is undefined.
/// </summary>
/// <param name="value">The value.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CLSCompliant(false)]
public static int Log2(uint value)
{
// Enforce conventional contract 0->0 (Log(0) is undefined)
if (value == 0)
{
return 0;
}
// The 0->0 contract is fulfilled by setting the LSB to 1.
// Log(1) is 0, and setting the LSB for values > 1 does not change the log2 result.
value |= 1;

// value lzcnt actual expected
// ..0000 32 0 0 (by convention, guard clause)
// ..0001 31 31-31 0
// ..0010 30 31-30 1
// 0010.. 2 31-2 29
Expand All @@ -153,8 +150,8 @@ public static int Log2(uint value)
return 31 ^ ArmBase.LeadingZeroCount(value);
}

// BSR returns the answer we're looking for directly.
// However BSR is much slower than LZCNT on AMD processors, so we leave it as a fallback only.
// BSR returns the log2 result directly. However BSR is slower than LZCNT
// on AMD processors, so we leave it as a fallback only.
if (X86Base.IsSupported)
{
return (int)X86Base.BitScanReverse(value);
Expand All @@ -166,18 +163,14 @@ public static int Log2(uint value)

/// <summary>
/// Returns the integer (floor) log of the specified value, base 2.
/// Note that by convention, input value 0 returns 0 since Log(0) is undefined.
/// Note that by convention, input value 0 returns 0 since log(0) is undefined.
/// </summary>
/// <param name="value">The value.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CLSCompliant(false)]
public static int Log2(ulong value)
{
// Enforce conventional contract 0->0 (Log(0) is undefined)
if (value == 0)
{
return 0;
}
value |= 1;

if (Lzcnt.X64.IsSupported)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1733,33 +1733,11 @@ private static int LocateLastFoundByte(Vector<byte> match)

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateFirstFoundByte(ulong match)
{
if (Bmi1.X64.IsSupported)
{
return (int)(Bmi1.X64.TrailingZeroCount(match) >> 3);
}
else
{
// Flag least significant power of two bit
ulong powerOfTwoFlag = match ^ (match - 1);
// Shift all powers of two into the high byte and extract
return (int)((powerOfTwoFlag * XorPowerOfTwoToHighByte) >> 57);
}
}
=> BitOperations.TrailingZeroCount(match) >> 3;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateLastFoundByte(ulong match)
{
return 7 - (BitOperations.LeadingZeroCount(match) >> 3);
}

private const ulong XorPowerOfTwoToHighByte = (0x07ul |
0x06ul << 8 |
0x05ul << 16 |
0x04ul << 24 |
0x03ul << 32 |
0x02ul << 40 |
0x01ul << 48) + 1;
=> BitOperations.Log2(match) >> 3;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ushort LoadUShort(ref byte start)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -975,27 +975,7 @@ private static int LocateFirstFoundChar(Vector<ushort> match)

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateFirstFoundChar(ulong match)
{
// TODO: Arm variants
if (Bmi1.X64.IsSupported)
{
return (int)(Bmi1.X64.TrailingZeroCount(match) >> 4);
}
else
{
unchecked
{
// Flag least significant power of two bit
ulong powerOfTwoFlag = match ^ (match - 1);
// Shift all powers of two into the high byte and extract
return (int)((powerOfTwoFlag * XorPowerOfTwoToHighChar) >> 49);
}
}
}

private const ulong XorPowerOfTwoToHighChar = (0x03ul |
0x02ul << 16 |
0x01ul << 32) + 1;
=> BitOperations.TrailingZeroCount(match) >> 4;

// Vector sub-search adapted from https://github.com/aspnet/KestrelHttpServer/pull/1138
[MethodImpl(MethodImplOptions.AggressiveInlining)]
Expand All @@ -1020,9 +1000,7 @@ private static int LocateLastFoundChar(Vector<ushort> match)

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateLastFoundChar(ulong match)
{
return 3 - (BitOperations.LeadingZeroCount(match) >> 4);
}
=> BitOperations.Log2(match) >> 4;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<ushort> LoadVector(ref char start, nint offset)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,44 +44,19 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat
{
Debug.Assert(!AllBytesInUInt32AreAscii(value), "Caller shouldn't provide an all-ASCII value.");

// Use BMI1 directly rather than going through BitOperations. We only see a perf gain here
// if we're able to emit a real tzcnt instruction; the software fallback used by BitOperations
// is too slow for our purposes since we can provide our own faster, specialized software fallback.

if (Bmi1.IsSupported)
{
Debug.Assert(BitConverter.IsLittleEndian);
return Bmi1.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3;
}

// Couldn't emit tzcnt, use specialized software fallback.
// The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending
// on whether all processed bytes were ASCII. Then we accumulate all of the
// results to calculate how many consecutive ASCII bytes are present.

value = ~value;

if (BitConverter.IsLittleEndian)
{
// Read first byte
value >>= 7;
uint allBytesUpToNowAreAscii = value & 1;
uint numAsciiBytes = allBytesUpToNowAreAscii;

// Read second byte
value >>= 8;
allBytesUpToNowAreAscii &= value;
numAsciiBytes += allBytesUpToNowAreAscii;

// Read third byte
value >>= 8;
allBytesUpToNowAreAscii &= value;
numAsciiBytes += allBytesUpToNowAreAscii;

return numAsciiBytes;
return (uint)BitOperations.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3;
}
else
{
// Couldn't use tzcnt, use specialized software fallback.
// The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending
// on whether all processed bytes were ASCII. Then we accumulate all of the
// results to calculate how many consecutive ASCII bytes are present.

value = ~value;

// BinaryPrimitives.ReverseEndianness is only implemented as an intrinsic on
// little-endian platforms, so using it in this big-endian path would be too
// expensive. Instead we'll just change how we perform the shifts.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -388,18 +388,18 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin

if ((bufferLength & 8) != 0)
{
if (Bmi1.X64.IsSupported)
if (UIntPtr.Size == sizeof(ulong))
{
// If we can use 64-bit tzcnt to count the number of leading ASCII bytes, prefer it.

ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
if (!AllBytesInUInt64AreAscii(candidateUInt64))
{
// Clear everything but the high bit of each byte, then tzcnt.
// Remember the / 8 at the end to convert bit count to byte count.
// Remember to divide by 8 at the end to convert bit count to byte count.

candidateUInt64 &= UInt64HighBitsOnlyMask;
pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8);
pBuffer += (nuint)(BitOperations.TrailingZeroCount(candidateUInt64) >> 3);
goto Finish;
}
}
Expand Down Expand Up @@ -914,20 +914,20 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin

if ((bufferLength & 4) != 0)
{
if (Bmi1.X64.IsSupported)
if (UIntPtr.Size == sizeof(ulong))
{
// If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it.

ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
if (!AllCharsInUInt64AreAscii(candidateUInt64))
{
// Clear the low 7 bits (the ASCII bits) of each char, then tzcnt.
// Remember the / 8 at the end to convert bit count to byte count,
// Remember to divide by 8 at the end to convert bit count to byte count,
// then the & ~1 at the end to treat a match in the high byte of
// any char the same as a match in the low byte of that same char.

candidateUInt64 &= 0xFF80FF80_FF80FF80ul;
pBuffer = (char*)((byte*)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8) & ~(nuint)1));
pBuffer = (char*)((byte*)pBuffer + ((nuint)(BitOperations.TrailingZeroCount(candidateUInt64) >> 3) & ~(nuint)1));
goto Finish;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ internal static unsafe partial class Utf8Utility

do
{
if (Sse2.IsSupported && Bmi1.IsSupported)
if (Sse2.IsSupported)
{
// pInputBuffer is 32-bit aligned but not necessary 128-bit aligned, so we're
// going to perform an unaligned load. We don't necessarily care about aligning
Expand Down Expand Up @@ -158,7 +158,6 @@ internal static unsafe partial class Utf8Utility

Debug.Assert(BitConverter.IsLittleEndian);
Debug.Assert(Sse2.IsSupported);
Debug.Assert(Bmi1.IsSupported);

// The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit
// for each non-ASCII byte we saw. We can count the number of ASCII bytes,
Expand All @@ -167,7 +166,7 @@ internal static unsafe partial class Utf8Utility

Debug.Assert(mask != 0);

pInputBuffer += Bmi1.TrailingZeroCount(mask);
pInputBuffer += BitOperations.TrailingZeroCount(mask);
if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
{
goto ProcessRemainingBytesSlow;
Expand Down

0 comments on commit ba728d9

Please sign in to comment.