From 1033297a37519b56729b7a5ba54259ba1fcb4de4 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Tue, 19 Jan 2021 18:19:31 +0100
Subject: [PATCH 1/8] Add initial FMA resize kernel convolve implementation

---
 .../Transforms/Resize/ResizeKernel.cs         | 58 +++++++++++++++----
 1 file changed, 48 insertions(+), 10 deletions(-)
diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index d94aeffe69..bff2c574a6 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -4,6 +4,10 @@
 using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 namespace SixLabors.ImageSharp.Processing.Processors.Transforms
 {
@@ -66,21 +70,55 @@ public Vector4 Convolve(Span<Vector4> rowSpan)
         [MethodImpl(InliningOptions.ShortMethod)]
         public Vector4 ConvolveCore(ref Vector4 rowStartRef)
         {
-            ref float horizontalValues = ref Unsafe.AsRef<float>(this.bufferPtr);
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Fma.IsSupported)
+            {
+                float* bufferStart = this.bufferPtr;
+                float* bufferEnd = bufferStart + (this.Length & ~1);
+                Vector256<float> result256 = Vector256<float>.Zero;
 
-            // Destination color components
-            Vector4 result = Vector4.Zero;
+                while (bufferStart < bufferEnd)
+                {
+                    Vector256<float> rowItem256 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
+                    var bufferItem256 = Vector256.Create(Vector128.Create(bufferStart[0]), Vector128.Create(bufferStart[1]));
 
-            for (int i = 0; i < this.Length; i++)
-            {
-                float weight = Unsafe.Add(ref horizontalValues, i);
+                    result256 = Fma.MultiplyAdd(rowItem256, bufferItem256, result256);
+
+                    bufferStart += 2;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
+                }
+
+                Vector128<float> result128 = Sse.Add(result256.GetLower(), result256.GetUpper());
+
+                if ((this.Length & 1) != 0)
+                {
+                    Vector128<float> rowItem128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
+                    var bufferItem128 = Vector128.Create(*bufferStart);
 
-                // Vector4 v = offsetedRowSpan[i];
-                Vector4 v = Unsafe.Add(ref rowStartRef, i);
-                result += v * weight;
+                    result128 = Fma.MultiplyAdd(rowItem128, bufferItem128, result128);
+                }
+
+                return *(Vector4*)&result128;
             }
+            else
+#endif
+            {
+                // Destination color components
+                Vector4 result = Vector4.Zero;
+                float* bufferStart = this.bufferPtr;
+                float* bufferEnd = this.bufferPtr + this.Length;
+
+                while (bufferStart < bufferEnd)
+                {
+                    // Vector4 v = offsetedRowSpan[i];
+                    result += rowStartRef * *bufferStart;
 
-            return result;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
+                    bufferStart++;
+                }
+
+                return result;
+            }
         }
 
         /// <summary>

From c825eccd10f14eb733cdbe4c75656005afae5aed Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Tue, 19 Jan 2021 19:14:51 +0100
Subject: [PATCH 2/8] Improved loading of factors using permutation

Assembly for loading in the loop went from:
```asm
vmovss xmm2, [rax]
vbroadcastss xmm2, xmm2
vmovss xmm3, [rax+4]
vbroadcastss xmm3, xmm3
vinsertf128 ymm2, ymm2, xmm3, 1
```
To:
```asm
vmovsd xmm3, [rax]
vbroadcastsd ymm3, xmm3
vpermps ymm3, ymm1, ymm3
```
---
 .../Processing/Processors/Transforms/Resize/ResizeKernel.cs    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index bff2c574a6..02027f42d8 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -76,11 +76,12 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
                 float* bufferStart = this.bufferPtr;
                 float* bufferEnd = bufferStart + (this.Length & ~1);
                 Vector256<float> result256 = Vector256<float>.Zero;
+                var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1);
 
                 while (bufferStart < bufferEnd)
                 {
                     Vector256<float> rowItem256 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
-                    var bufferItem256 = Vector256.Create(Vector128.Create(bufferStart[0]), Vector128.Create(bufferStart[1]));
+                    Vector256<float> bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask);
 
                     result256 = Fma.MultiplyAdd(rowItem256, bufferItem256, result256);
 

From 1169e73915d98590e82d64f72fa3c2197e00aea9 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Tue, 19 Jan 2021 19:33:15 +0100
Subject: [PATCH 3/8] Switch from FMA to AVX2 instructions

---
 .../Processors/Transforms/Resize/ResizeKernel.cs       | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index 02027f42d8..5a87d045ea 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -71,7 +71,7 @@ public Vector4 Convolve(Span<Vector4> rowSpan)
         public Vector4 ConvolveCore(ref Vector4 rowStartRef)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
-            if (Fma.IsSupported)
+            if (Avx2.IsSupported)
             {
                 float* bufferStart = this.bufferPtr;
                 float* bufferEnd = bufferStart + (this.Length & ~1);
@@ -82,8 +82,9 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
                 {
                     Vector256<float> rowItem256 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
                     Vector256<float> bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask);
+                    Vector256<float> multiply256 = Avx.Multiply(rowItem256, bufferItem256);
 
-                    result256 = Fma.MultiplyAdd(rowItem256, bufferItem256, result256);
+                    result256 = Avx.Add(multiply256, result256);
 
                     bufferStart += 2;
                     rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
@@ -95,8 +96,9 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
                 {
                     Vector128<float> rowItem128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
                     var bufferItem128 = Vector128.Create(*bufferStart);
+                    Vector128<float> multiply128 = Sse.Multiply(rowItem128, bufferItem128);
 
-                    result128 = Fma.MultiplyAdd(rowItem128, bufferItem128, result128);
+                    result128 = Sse.Add(multiply128, result128);
                 }
 
                 return *(Vector4*)&result128;
@@ -114,8 +116,8 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
                     // Vector4 v = offsetedRowSpan[i];
                     result += rowStartRef * *bufferStart;
 
-                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
                     bufferStart++;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
                 }
 
                 return result;

From 0e465cd8c30713b1c3c91966ebef855d4eda314d Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Tue, 19 Jan 2021 22:58:43 +0100
Subject: [PATCH 4/8] Revert to FMA, codegen improvements

---
 .../Transforms/Resize/ResizeKernel.cs         | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index 5a87d045ea..bd22864bb2 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -71,7 +71,7 @@ public Vector4 Convolve(Span<Vector4> rowSpan)
         public Vector4 ConvolveCore(ref Vector4 rowStartRef)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx2.IsSupported)
+            if (Fma.IsSupported)
             {
                 float* bufferStart = this.bufferPtr;
                 float* bufferEnd = bufferStart + (this.Length & ~1);
@@ -80,11 +80,20 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
 
                 while (bufferStart < bufferEnd)
                 {
-                    Vector256<float> rowItem256 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
-                    Vector256<float> bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask);
-                    Vector256<float> multiply256 = Avx.Multiply(rowItem256, bufferItem256);
-
-                    result256 = Avx.Add(multiply256, result256);
+                    // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps
+                    // for the FMA operation, and execute it directly on the target register and reading directly from
+                    // memory for the first parameter. This skips initializing a SIMD register, and an extra copy.
+                    // The code below should compile in the following assembly on .NET 5 x64:
+                    //
+                    // vmovsd xmm2, [rax]               ; load *(double*)bufferStart into xmm2 as [ab, _]
+                    // vpermps ymm2, ymm1, ymm2         ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
+                    // vfmadd231ps ymm0, ymm2, [r8]     ; result256 = FMA(pixels, factors) + result256
+                    //
+                    // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
+                    result256 = Fma.MultiplyAdd(
+                        Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
+                        Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
+                        result256);
 
                     bufferStart += 2;
                     rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
@@ -94,11 +103,10 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
 
                 if ((this.Length & 1) != 0)
                 {
-                    Vector128<float> rowItem128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
-                    var bufferItem128 = Vector128.Create(*bufferStart);
-                    Vector128<float> multiply128 = Sse.Multiply(rowItem128, bufferItem128);
-
-                    result128 = Sse.Add(multiply128, result128);
+                    result128 = Fma.MultiplyAdd(
+                        Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef),
+                        Vector128.Create(*bufferStart),
+                        result128);
                 }
 
                 return *(Vector4*)&result128;

From e0b2defde22343414ee70babe21d1209fb760cbe Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Tue, 19 Jan 2021 23:25:47 +0100
Subject: [PATCH 5/8] Add unrolled FMA loop

---
 .../Transforms/Resize/ResizeKernel.cs         | 34 ++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index bd22864bb2..b537cdfdf9 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -74,8 +74,9 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
             if (Fma.IsSupported)
             {
                 float* bufferStart = this.bufferPtr;
-                float* bufferEnd = bufferStart + (this.Length & ~1);
-                Vector256<float> result256 = Vector256<float>.Zero;
+                float* bufferEnd = bufferStart + (this.Length & ~3);
+                Vector256<float> result256_0 = Vector256<float>.Zero;
+                Vector256<float> result256_1 = Vector256<float>.Zero;
                 var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1);
 
                 while (bufferStart < bufferEnd)
@@ -87,19 +88,36 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
                     //
                     // vmovsd xmm2, [rax]               ; load *(double*)bufferStart into xmm2 as [ab, _]
                     // vpermps ymm2, ymm1, ymm2         ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
-                    // vfmadd231ps ymm0, ymm2, [r8]     ; result256 = FMA(pixels, factors) + result256
+                    // vfmadd231ps ymm0, ymm2, [r8]     ; result256_0 = FMA(pixels, factors) + result256_0
                     //
                     // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
-                    result256 = Fma.MultiplyAdd(
+                    // Additionally, we're also unrolling two computations per each loop iterations to leverage the
+                    // fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
+                    result256_0 = Fma.MultiplyAdd(
                         Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
                         Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
-                        result256);
+                        result256_0);
 
-                    bufferStart += 2;
-                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
+                    result256_1 = Fma.MultiplyAdd(
+                        Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, 2)),
+                        Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask),
+                        result256_1);
+
+                    bufferStart += 4;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 4);
+                }
+
+                result256_0 = Avx.Add(result256_0, result256_1);
+
+                if ((this.Length & 3) >= 2)
+                {
+                    result256_0 = Fma.MultiplyAdd(
+                        Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
+                        Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
+                        result256_0);
                 }
 
-                Vector128<float> result128 = Sse.Add(result256.GetLower(), result256.GetUpper());
+                Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());
 
                 if ((this.Length & 1) != 0)
                 {

From e68a21de52d1de7c9eaeb234ab50ec4cf470c2fc Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Tue, 19 Jan 2021 23:31:32 +0100
Subject: [PATCH 6/8] Add missing indexing update

---
 .../Processing/Processors/Transforms/Resize/ResizeKernel.cs    | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index b537cdfdf9..c79f938d73 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -115,6 +115,9 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
                         Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
                         Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
                         result256_0);
+
+                    bufferStart += 2;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
                 }
 
                 Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());

From ed4cfaa0ae4165357db4778da198189b8bc7d003 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Wed, 20 Jan 2021 17:39:12 +0100
Subject: [PATCH 7/8] Workaround for incorrect codegen on .NET 5

See Vector256.Create issue: https://github.com/dotnet/runtime/issues/47236
---
 .../Processors/Transforms/Resize/ResizeKernel.cs       | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index c79f938d73..979206ad5c 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -5,6 +5,7 @@
 using System.Numerics;
 using System.Runtime.CompilerServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 #endif
@@ -77,7 +78,14 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
                 float* bufferEnd = bufferStart + (this.Length & ~3);
                 Vector256<float> result256_0 = Vector256<float>.Zero;
                 Vector256<float> result256_1 = Vector256<float>.Zero;
-                var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1);
+                ReadOnlySpan<byte> maskBytes = new byte[]
+                {
+                    0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0,
+                    1, 0, 0, 0, 1, 0, 0, 0,
+                    1, 0, 0, 0, 1, 0, 0, 0,
+                };
+                Vector256<int> mask = Unsafe.ReadUnaligned<Vector256<int>>(ref MemoryMarshal.GetReference(maskBytes));
 
                 while (bufferStart < bufferEnd)
                 {

From 8c7019e41e9a9dfbba63af19859194471a08be3a Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Wed, 20 Jan 2021 21:50:35 +0100
Subject: [PATCH 8/8] Update image threshold for resize tests

---
 .../Processing/Processors/Transforms/ResizeTests.cs             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs
index f4a94782fd..58b7fd12e8 100644
--- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs
+++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs
@@ -139,7 +139,7 @@ public void WorkingBufferSizeHintInBytes_IsAppliedCorrectly<TPixel>(
                         testOutputDetails: workingBufferLimitInRows,
                         appendPixelTypeToFileName: false);
                     image.CompareToReferenceOutput(
-                        ImageComparer.TolerantPercentage(0.001f),
+                        ImageComparer.TolerantPercentage(0.004f),
                         provider,
                         testOutputDetails: workingBufferLimitInRows,
                         appendPixelTypeToFileName: false);