Port SpanHelpers.SequenceCompareTo(ref byte, int, ref byte, int) to Vector128/256 #73475

Merged · 7 commits · Oct 10, 2022
46 changes: 10 additions & 36 deletions src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
@@ -1918,15 +1918,15 @@ public static unsafe int SequenceCompareTo(ref byte first, int firstLength, ref
nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations
nuint lengthToExamine = minLength;

if (Avx2.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
if (lengthToExamine >= (nuint)Vector256<byte>.Count)
{
lengthToExamine -= (nuint)Vector256<byte>.Count;
uint matches;
while (lengthToExamine > offset)
{
matches = (uint)Avx2.MoveMask(Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset)));
matches = Vector256.Equals(Vector256.LoadUnsafe(ref first, offset), Vector256.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits();
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.

@@ -1943,7 +1943,7 @@ public static unsafe int SequenceCompareTo(ref byte first, int firstLength, ref
// Move to Vector length from end for final compare
offset = lengthToExamine;
// Same as method as above
matches = (uint)Avx2.MoveMask(Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset)));
matches = Vector256.Equals(Vector256.LoadUnsafe(ref first, offset), Vector256.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits();
if (matches == uint.MaxValue)
{
// All matched
@@ -1967,7 +1967,7 @@ public static unsafe int SequenceCompareTo(ref byte first, int firstLength, ref
uint matches;
if (lengthToExamine > offset)
{
matches = (uint)Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset)));
matches = Vector128.Equals(Vector128.LoadUnsafe(ref first, offset), Vector128.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits();
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.

@@ -1980,7 +1980,7 @@ public static unsafe int SequenceCompareTo(ref byte first, int firstLength, ref
// Move to Vector length from end for final compare
offset = lengthToExamine;
// Same as method as above
matches = (uint)Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset)));
matches = Vector128.Equals(Vector128.LoadUnsafe(ref first, offset), Vector128.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits();
if (matches == ushort.MaxValue)
{
// All matched
@@ -1998,58 +1998,32 @@ public static unsafe int SequenceCompareTo(ref byte first, int firstLength, ref
return result;
}
}
else if (Sse2.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
if (lengthToExamine >= (nuint)Vector128<byte>.Count)
{
lengthToExamine -= (nuint)Vector128<byte>.Count;
uint matches;
while (lengthToExamine > offset)
{
matches = (uint)Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset)));
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.

// 16 elements in Vector128<byte> so we compare to ushort.MaxValue to check if everything matched
if (matches == ushort.MaxValue)
if (Vector128.LoadUnsafe(ref first, offset) == Vector128.LoadUnsafe(ref second, offset))
{
// All matched
offset += (nuint)Vector128<byte>.Count;
continue;
}

goto Difference;
goto BytewiseCheck;
Member:
Isn't this replacing the vectorized detection of which element in the vector differed with a linear walk through all bytes in the vector? If so, did you validate the perf impact of this on inputs smaller than 512 elements? I'm surprised this wouldn't result in regressions.

Member Author:
@stephentoub please excuse me for the delay. That is true; the Vector128 code path is also executed on arm64, where ExtractMostSignificantBits is expensive. I've used BytewiseCheck, which was so far used only by the Vector<T> code path, and its perf is OK.
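
For illustration, here is a minimal standalone sketch (not the actual SpanHelpers code) of the two approaches being discussed: the mask-based lookup via ExtractMostSignificantBits + TrailingZeroCount that the Vector256 path keeps, and a plain bytewise scan in the spirit of BytewiseCheck that the Vector128 path now falls back to. The helper names and parameters below are illustrative only.

```csharp
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;

internal static class FirstMismatchSketch
{
    // Mask-based detection (kept in the Vector256 path): compare two 32-byte chunks,
    // turn the per-lane equality results into a 32-bit mask, invert it, and the
    // trailing-zero count of the inverted mask is the index of the first differing byte.
    // Assumes the caller already knows the chunks are not fully equal.
    internal static int FirstDiffViaMask(ref byte first, ref byte second, nuint offset)
    {
        uint matches = Vector256.Equals(
            Vector256.LoadUnsafe(ref first, offset),
            Vector256.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits();
        uint differences = ~matches; // set bits now mark the differing lanes
        return BitOperations.TrailingZeroCount(differences);
    }

    // Bytewise scan (a BytewiseCheck-style fallback, as used by the Vector128 path):
    // once a whole-vector comparison says "not equal", walk the remaining bytes one
    // by one until the first mismatch is found.
    internal static int FirstDiffBytewise(ref byte first, ref byte second, nuint offset, nuint count)
    {
        for (nuint i = 0; i < count; i++)
        {
            byte a = Unsafe.AddByteOffset(ref first, offset + i);
            byte b = Unsafe.AddByteOffset(ref second, offset + i);
            if (a != b)
            {
                return (int)i;
            }
        }
        return -1; // chunks were equal after all
    }
}
```

On x64 the mask approach maps well to hardware (movemask plus a trailing-zero count), while arm64 has no single movemask instruction, so ExtractMostSignificantBits takes extra work there and the bytewise fallback ends up competitive.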

I've added benchmarks for smaller collection sizes, synced the fork, and re-ran them.

x64 AVX2 (Vector256)

It's more or less on par.

BenchmarkDotNet=v0.13.2.1937-nightly, OS=Windows 11 (10.0.22000.978/21H2)
AMD Ryzen Threadripper PRO 3945WX 12-Cores, 1 CPU, 24 logical and 12 physical cores
.NET SDK=7.0.100-rtm.22506.1
  [Host]     : .NET 7.0.0 (7.0.22.48010), X64 RyuJIT AVX2
        main : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
          pr : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
| Method | Job | Toolchain | Size | Mean | Ratio |
|---|---|---|---|---|---|
| SequenceCompareTo | main | \main\corerun.exe | 8 | 7.522 ns | 1.00 |
| SequenceCompareTo | pr | \prSync\corerun.exe | 8 | 7.461 ns | 0.99 |
| SequenceCompareToDifferent | main | \main\corerun.exe | 8 | 4.344 ns | 1.00 |
| SequenceCompareToDifferent | pr | \prSync\corerun.exe | 8 | 4.230 ns | 0.97 |
| SequenceCompareTo | main | \main\corerun.exe | 32 | 3.889 ns | 1.00 |
| SequenceCompareTo | pr | \prSync\corerun.exe | 32 | 3.869 ns | 0.99 |
| SequenceCompareToDifferent | main | \main\corerun.exe | 32 | 4.651 ns | 1.00 |
| SequenceCompareToDifferent | pr | \prSync\corerun.exe | 32 | 4.603 ns | 0.99 |
| SequenceCompareTo | main | \main\corerun.exe | 64 | 4.426 ns | 1.00 |
| SequenceCompareTo | pr | \prSync\corerun.exe | 64 | 4.446 ns | 1.00 |
| SequenceCompareToDifferent | main | \main\corerun.exe | 64 | 4.314 ns | 1.00 |
| SequenceCompareToDifferent | pr | \prSync\corerun.exe | 64 | 4.332 ns | 1.00 |
| SequenceCompareTo | main | \main\corerun.exe | 128 | 5.506 ns | 1.00 |
| SequenceCompareTo | pr | \prSync\corerun.exe | 128 | 5.558 ns | 1.01 |
| SequenceCompareToDifferent | main | \main\corerun.exe | 128 | 4.337 ns | 1.00 |
| SequenceCompareToDifferent | pr | \prSync\corerun.exe | 128 | 4.321 ns | 1.00 |
| SequenceCompareTo | main | \main\corerun.exe | 512 | 11.758 ns | 1.00 |
| SequenceCompareTo | pr | \prSync\corerun.exe | 512 | 11.694 ns | 0.99 |
| SequenceCompareToDifferent | main | \main\corerun.exe | 512 | 4.324 ns | 1.00 |
| SequenceCompareToDifferent | pr | \prSync\corerun.exe | 512 | 4.321 ns | 1.00 |

x64 AVX (Vector128)

It's 2-3% slower, but that translates to just ±0.2 ns.

BenchmarkDotNet=v0.13.2.1937-nightly, OS=Windows 11 (10.0.22000.978/21H2)
AMD Ryzen Threadripper PRO 3945WX 12-Cores, 1 CPU, 24 logical and 12 physical cores
.NET SDK=7.0.100-rtm.22506.1
  [Host]     : .NET 7.0.0 (7.0.22.48010), X64 RyuJIT AVX2
        main : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX
          pr : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX

EnvironmentVariables=COMPlus_EnableAVX2=0
| Method | Job | Size | Mean | Ratio |
|---|---|---|---|---|
| SequenceCompareTo | main | 8 | 7.058 ns | 1.00 |
| SequenceCompareTo | pr | 8 | 6.696 ns | 0.95 |
| SequenceCompareToDifferent | main | 8 | 4.088 ns | 1.00 |
| SequenceCompareToDifferent | pr | 8 | 4.099 ns | 1.00 |
| SequenceCompareTo | main | 32 | 4.343 ns | 1.00 |
| SequenceCompareTo | pr | 32 | 4.273 ns | 0.98 |
| SequenceCompareToDifferent | main | 32 | 4.323 ns | 1.00 |
| SequenceCompareToDifferent | pr | 32 | 4.414 ns | 1.02 |
| SequenceCompareTo | main | 64 | 5.322 ns | 1.00 |
| SequenceCompareTo | pr | 64 | 5.383 ns | 1.01 |
| SequenceCompareToDifferent | main | 64 | 4.316 ns | 1.00 |
| SequenceCompareToDifferent | pr | 64 | 4.444 ns | 1.03 |
| SequenceCompareTo | main | 128 | 6.987 ns | 1.00 |
| SequenceCompareTo | pr | 128 | 7.229 ns | 1.03 |
| SequenceCompareToDifferent | main | 128 | 4.307 ns | 1.00 |
| SequenceCompareToDifferent | pr | 128 | 4.493 ns | 1.04 |
| SequenceCompareTo | main | 512 | 18.724 ns | 1.00 |
| SequenceCompareTo | pr | 512 | 19.032 ns | 1.02 |
| SequenceCompareToDifferent | main | 512 | 4.302 ns | 1.00 |
| SequenceCompareToDifferent | pr | 512 | 4.451 ns | 1.03 |

Arm64 AdvSimd (Vector128)

For cases where the inputs are different, the perf remains the same, but we can observe a nice boost for equal inputs >= 8 elements (from 20% up to 3x).

BenchmarkDotNet=v0.13.2.1937-nightly, OS=ubuntu 20.04
Unknown processor
.NET SDK=7.0.100-rtm.22506.1
  [Host]     : .NET 7.0.0 (7.0.22.48010), Arm64 RyuJIT AdvSIMD
        main : .NET 8.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD
          pr : .NET 8.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD

LaunchCount=9 MemoryRandomization=True
| Method | Job | Size | Median |
|---|---|---|---|
| SequenceCompareTo | main | 8 | 6.551 ns |
| SequenceCompareTo | pr | 8 | 6.547 ns |
| SequenceCompareToDifferent | main | 8 | 2.696 ns |
| SequenceCompareToDifferent | pr | 8 | 2.696 ns |
| SequenceCompareTo | main | 32 | 12.399 ns |
| SequenceCompareTo | pr | 32 | 3.851 ns |
| SequenceCompareToDifferent | main | 32 | 2.697 ns |
| SequenceCompareToDifferent | pr | 32 | 2.696 ns |
| SequenceCompareTo | main | 64 | 15.863 ns |
| SequenceCompareTo | pr | 64 | 5.776 ns |
| SequenceCompareToDifferent | main | 64 | 4.090 ns |
| SequenceCompareToDifferent | pr | 64 | 3.274 ns |
| SequenceCompareTo | main | 128 | 23.466 ns |
| SequenceCompareTo | pr | 128 | 12.219 ns |
| SequenceCompareToDifferent | main | 128 | 2.696 ns |
| SequenceCompareToDifferent | pr | 128 | 2.696 ns |
| SequenceCompareTo | main | 512 | 59.817 ns |
| SequenceCompareTo | pr | 512 | 48.528 ns |
| SequenceCompareToDifferent | main | 512 | 2.698 ns |
| SequenceCompareToDifferent | pr | 512 | 2.696 ns |

}
// Move to Vector length from end for final compare
offset = lengthToExamine;
// Same as method as above
matches = (uint)Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset)));
if (matches == ushort.MaxValue)
if (Vector128.LoadUnsafe(ref first, offset) == Vector128.LoadUnsafe(ref second, offset))
{
// All matched
goto Equal;
}
Difference:
// Invert matches to find differences
uint differences = ~matches;
// Find bitflag offset of first difference and add to current offset
offset += (uint)BitOperations.TrailingZeroCount(differences);

int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
Debug.Assert(result != 0);

return result;
goto BytewiseCheck;
}
}
//else if (AdvSimd.Arm64.IsSupported)
//{
// // This API is not optimized with ARM64 intrinsics because there is not much performance win seen
// // when compared to the vectorized implementation below. There were some wins if the mismatch happen
// // after 8th index of the chunk because with ARM64 intrinsic, using fewer instructions the first mismatched
// // index can be retrieved. In case of vectorization, sequential scan has to be done instead. However, at the
// // same time, there are losses if the mismatch index is less than 7~8. So the overall benefit doesn't justify
// // to optimize this method with ARM64 hardware intrinsics.
//}
else if (Vector.IsHardwareAccelerated)
{
if (lengthToExamine > (nuint)Vector<byte>.Count)