From 1033297a37519b56729b7a5ba54259ba1fcb4de4 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 19 Jan 2021 18:19:31 +0100 Subject: [PATCH 1/8] Add initial FMA resize kernel convolve implementation --- .../Transforms/Resize/ResizeKernel.cs | 58 +++++++++++++++---- 1 file changed, 48 insertions(+), 10 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index d94aeffe69..bff2c574a6 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -4,6 +4,10 @@ using System; using System.Numerics; using System.Runtime.CompilerServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp.Processing.Processors.Transforms { @@ -66,21 +70,55 @@ public Vector4 Convolve(Span rowSpan) [MethodImpl(InliningOptions.ShortMethod)] public Vector4 ConvolveCore(ref Vector4 rowStartRef) { - ref float horizontalValues = ref Unsafe.AsRef(this.bufferPtr); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Fma.IsSupported) + { + float* bufferStart = this.bufferPtr; + float* bufferEnd = bufferStart + (this.Length & ~1); + Vector256 result256 = Vector256.Zero; - // Destination color components - Vector4 result = Vector4.Zero; + while (bufferStart < bufferEnd) + { + Vector256 rowItem256 = Unsafe.As>(ref rowStartRef); + var bufferItem256 = Vector256.Create(Vector128.Create(bufferStart[0]), Vector128.Create(bufferStart[1])); - for (int i = 0; i < this.Length; i++) - { - float weight = Unsafe.Add(ref horizontalValues, i); + result256 = Fma.MultiplyAdd(rowItem256, bufferItem256, result256); + + bufferStart += 2; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); + } + + Vector128 result128 = Sse.Add(result256.GetLower(), result256.GetUpper()); + + if ((this.Length & 1) != 0) + { + Vector128 rowItem128 = Unsafe.As>(ref rowStartRef); + var bufferItem128 = Vector128.Create(*bufferStart); - // Vector4 v = offsetedRowSpan[i]; - Vector4 v = Unsafe.Add(ref rowStartRef, i); - result += v * weight; + result128 = Fma.MultiplyAdd(rowItem128, bufferItem128, result128); + } + + return *(Vector4*)&result128; } + else +#endif + { + // Destination color components + Vector4 result = Vector4.Zero; + float* bufferStart = this.bufferPtr; + float* bufferEnd = this.bufferPtr + this.Length; + + while (bufferStart < bufferEnd) + { + // Vector4 v = offsetedRowSpan[i]; + result += rowStartRef * *bufferStart; - return result; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 1); + bufferStart++; + } + + return result; + } } /// From c825eccd10f14eb733cdbe4c75656005afae5aed Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 19 Jan 2021 19:14:51 +0100 Subject: [PATCH 2/8] Improved loading of factors using permutation Assembly for loading in the loop went from: ```asm vmovss xmm2, [rax] vbroadcastss xmm2, xmm2 vmovss xmm3, [rax+4] vbroadcastss xmm3, xmm3 vinsertf128 ymm2, ymm2, xmm3, 1 ``` To: ```asm vmovsd xmm3, [rax] vbroadcastsd ymm3, xmm3 vpermps ymm3, ymm1, ymm3 ``` --- .../Processing/Processors/Transforms/Resize/ResizeKernel.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index bff2c574a6..02027f42d8 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -76,11 +76,12 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) float* bufferStart = this.bufferPtr; float* bufferEnd = bufferStart + (this.Length & ~1); Vector256 result256 = Vector256.Zero; + var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1); while (bufferStart < bufferEnd) { Vector256 rowItem256 = Unsafe.As>(ref rowStartRef); - var bufferItem256 = Vector256.Create(Vector128.Create(bufferStart[0]), Vector128.Create(bufferStart[1])); + Vector256 bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask); result256 = Fma.MultiplyAdd(rowItem256, bufferItem256, result256); From 1169e73915d98590e82d64f72fa3c2197e00aea9 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 19 Jan 2021 19:33:15 +0100 Subject: [PATCH 3/8] Switch from FMA to AVX2 instructions --- .../Processors/Transforms/Resize/ResizeKernel.cs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 02027f42d8..5a87d045ea 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -71,7 +71,7 @@ public Vector4 Convolve(Span rowSpan) public Vector4 ConvolveCore(ref Vector4 rowStartRef) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Fma.IsSupported) + if (Avx2.IsSupported) { float* bufferStart = this.bufferPtr; float* bufferEnd = bufferStart + (this.Length & ~1); @@ -82,8 +82,9 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) { Vector256 rowItem256 = Unsafe.As>(ref rowStartRef); Vector256 bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask); + Vector256 multiply256 = Avx.Multiply(rowItem256, bufferItem256); - result256 = Fma.MultiplyAdd(rowItem256, bufferItem256, result256); + result256 = Avx.Add(multiply256, result256); bufferStart += 2; rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); @@ -95,8 +96,9 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) { Vector128 rowItem128 = Unsafe.As>(ref rowStartRef); var bufferItem128 = Vector128.Create(*bufferStart); + Vector128 multiply128 = Sse.Multiply(rowItem128, bufferItem128); - result128 = Fma.MultiplyAdd(rowItem128, bufferItem128, result128); + result128 = Sse.Add(multiply128, result128); } return *(Vector4*)&result128; @@ -114,8 +116,8 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) // Vector4 v = offsetedRowSpan[i]; result += rowStartRef * *bufferStart; - rowStartRef = ref Unsafe.Add(ref rowStartRef, 1); bufferStart++; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 1); } return result; From 0e465cd8c30713b1c3c91966ebef855d4eda314d Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 19 Jan 2021 22:58:43 +0100 Subject: [PATCH 4/8] Revert to FMA, codegen improvements --- .../Transforms/Resize/ResizeKernel.cs | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 5a87d045ea..bd22864bb2 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -71,7 +71,7 @@ public Vector4 Convolve(Span rowSpan) public Vector4 ConvolveCore(ref Vector4 rowStartRef) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported) + if (Fma.IsSupported) { float* bufferStart = this.bufferPtr; float* bufferEnd = bufferStart + (this.Length & ~1); @@ -80,11 +80,20 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) while (bufferStart < bufferEnd) { - Vector256 rowItem256 = Unsafe.As>(ref rowStartRef); - Vector256 bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask); - Vector256 multiply256 = Avx.Multiply(rowItem256, bufferItem256); - - result256 = Avx.Add(multiply256, result256); + // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps + // for the FMA operation, and execute it directly on the target register and reading directly from + // memory for the first parameter. This skips initializing a SIMD register, and an extra copy. + // The code below should compile in the following assembly on .NET 5 x64: + // + // vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _] + // vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b] + // vfmadd231ps ymm0, ymm2, [r8] ; result256 = FMA(pixels, factors) + result256 + // + // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212. + result256 = Fma.MultiplyAdd( + Unsafe.As>(ref rowStartRef), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), + result256); bufferStart += 2; rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); @@ -94,11 +103,10 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) if ((this.Length & 1) != 0) { - Vector128 rowItem128 = Unsafe.As>(ref rowStartRef); - var bufferItem128 = Vector128.Create(*bufferStart); - Vector128 multiply128 = Sse.Multiply(rowItem128, bufferItem128); - - result128 = Sse.Add(multiply128, result128); + result128 = Fma.MultiplyAdd( + Unsafe.As>(ref rowStartRef), + Vector128.Create(*bufferStart), + result128); } return *(Vector4*)&result128; From e0b2defde22343414ee70babe21d1209fb760cbe Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 19 Jan 2021 23:25:47 +0100 Subject: [PATCH 5/8] Add unrolled FMA loop --- .../Transforms/Resize/ResizeKernel.cs | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index bd22864bb2..b537cdfdf9 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -74,8 +74,9 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) if (Fma.IsSupported) { float* bufferStart = this.bufferPtr; - float* bufferEnd = bufferStart + (this.Length & ~1); - Vector256 result256 = Vector256.Zero; + float* bufferEnd = bufferStart + (this.Length & ~3); + Vector256 result256_0 = Vector256.Zero; + Vector256 result256_1 = Vector256.Zero; var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1); while (bufferStart < bufferEnd) @@ -87,19 +88,36 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) // // vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _] // vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b] - // vfmadd231ps ymm0, ymm2, [r8] ; result256 = FMA(pixels, factors) + result256 + // vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0 // // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212. - result256 = Fma.MultiplyAdd( + // Additionally, we're also unrolling two computations per each loop iterations to leverage the + // fact that most CPUs have two ports to schedule multiply operations for FMA instructions. + result256_0 = Fma.MultiplyAdd( Unsafe.As>(ref rowStartRef), Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), - result256); + result256_0); - bufferStart += 2; - rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); + result256_1 = Fma.MultiplyAdd( + Unsafe.As>(ref Unsafe.Add(ref rowStartRef, 2)), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask), + result256_1); + + bufferStart += 4; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 4); + } + + result256_0 = Avx.Add(result256_0, result256_1); + + if ((this.Length & 3) >= 2) + { + result256_0 = Fma.MultiplyAdd( + Unsafe.As>(ref rowStartRef), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), + result256_0); } - Vector128 result128 = Sse.Add(result256.GetLower(), result256.GetUpper()); + Vector128 result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper()); if ((this.Length & 1) != 0) { From e68a21de52d1de7c9eaeb234ab50ec4cf470c2fc Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 19 Jan 2021 23:31:32 +0100 Subject: [PATCH 6/8] Add missing indexing update --- .../Processing/Processors/Transforms/Resize/ResizeKernel.cs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index b537cdfdf9..c79f938d73 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -115,6 +115,9 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) Unsafe.As>(ref rowStartRef), Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), result256_0); + + bufferStart += 2; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); } Vector128 result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper()); From ed4cfaa0ae4165357db4778da198189b8bc7d003 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Wed, 20 Jan 2021 17:39:12 +0100 Subject: [PATCH 7/8] Workaround for incorrect codegen on .NET 5 See Vector256.Create issue: https://github.com/dotnet/runtime/issues/47236 --- .../Processors/Transforms/Resize/ResizeKernel.cs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index c79f938d73..979206ad5c 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -5,6 +5,7 @@ using System.Numerics; using System.Runtime.CompilerServices; #if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; #endif @@ -77,7 +78,14 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) float* bufferEnd = bufferStart + (this.Length & ~3); Vector256 result256_0 = Vector256.Zero; Vector256 result256_1 = Vector256.Zero; - var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1); + ReadOnlySpan maskBytes = new byte[] + { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, + }; + Vector256 mask = Unsafe.ReadUnaligned>(ref MemoryMarshal.GetReference(maskBytes)); while (bufferStart < bufferEnd) { From 8c7019e41e9a9dfbba63af19859194471a08be3a Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Wed, 20 Jan 2021 21:50:35 +0100 Subject: [PATCH 8/8] Update image threshold for resize tests --- .../Processing/Processors/Transforms/ResizeTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs index f4a94782fd..58b7fd12e8 100644 --- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs +++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs @@ -139,7 +139,7 @@ public void WorkingBufferSizeHintInBytes_IsAppliedCorrectly( testOutputDetails: workingBufferLimitInRows, appendPixelTypeToFileName: false); image.CompareToReferenceOutput( - ImageComparer.TolerantPercentage(0.001f), + ImageComparer.TolerantPercentage(0.004f), provider, testOutputDetails: workingBufferLimitInRows, appendPixelTypeToFileName: false);