diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index d94aeffe69..979206ad5c 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -4,6 +4,11 @@ using System; using System.Numerics; using System.Runtime.CompilerServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp.Processing.Processors.Transforms { @@ -66,21 +71,94 @@ public Vector4 Convolve(Span rowSpan) [MethodImpl(InliningOptions.ShortMethod)] public Vector4 ConvolveCore(ref Vector4 rowStartRef) { - ref float horizontalValues = ref Unsafe.AsRef(this.bufferPtr); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Fma.IsSupported) + { + float* bufferStart = this.bufferPtr; + float* bufferEnd = bufferStart + (this.Length & ~3); + Vector256 result256_0 = Vector256.Zero; + Vector256 result256_1 = Vector256.Zero; + ReadOnlySpan maskBytes = new byte[] + { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, + }; + Vector256 mask = Unsafe.ReadUnaligned>(ref MemoryMarshal.GetReference(maskBytes)); - // Destination color components - Vector4 result = Vector4.Zero; + while (bufferStart < bufferEnd) + { + // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps + // for the FMA operation, and execute it directly on the target register and reading directly from + // memory for the first parameter. This skips initializing a SIMD register, and an extra copy. + // The code below should compile in the following assembly on .NET 5 x64: + // + // vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _] + // vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b] + // vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0 + // + // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212. + // Additionally, we're also unrolling two computations per each loop iterations to leverage the + // fact that most CPUs have two ports to schedule multiply operations for FMA instructions. + result256_0 = Fma.MultiplyAdd( + Unsafe.As>(ref rowStartRef), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), + result256_0); - for (int i = 0; i < this.Length; i++) - { - float weight = Unsafe.Add(ref horizontalValues, i); + result256_1 = Fma.MultiplyAdd( + Unsafe.As>(ref Unsafe.Add(ref rowStartRef, 2)), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask), + result256_1); + + bufferStart += 4; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 4); + } + + result256_0 = Avx.Add(result256_0, result256_1); + + if ((this.Length & 3) >= 2) + { + result256_0 = Fma.MultiplyAdd( + Unsafe.As>(ref rowStartRef), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), + result256_0); + + bufferStart += 2; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); + } - // Vector4 v = offsetedRowSpan[i]; - Vector4 v = Unsafe.Add(ref rowStartRef, i); - result += v * weight; + Vector128 result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper()); + + if ((this.Length & 1) != 0) + { + result128 = Fma.MultiplyAdd( + Unsafe.As>(ref rowStartRef), + Vector128.Create(*bufferStart), + result128); + } + + return *(Vector4*)&result128; } + else +#endif + { + // Destination color components + Vector4 result = Vector4.Zero; + float* bufferStart = this.bufferPtr; + float* bufferEnd = this.bufferPtr + this.Length; - return result; + while (bufferStart < bufferEnd) + { + // Vector4 v = offsetedRowSpan[i]; + result += rowStartRef * *bufferStart; + + bufferStart++; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 1); + } + + return result; + } } /// diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs index f4a94782fd..58b7fd12e8 100644 --- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs +++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs @@ -139,7 +139,7 @@ public void WorkingBufferSizeHintInBytes_IsAppliedCorrectly( testOutputDetails: workingBufferLimitInRows, appendPixelTypeToFileName: false); image.CompareToReferenceOutput( - ImageComparer.TolerantPercentage(0.001f), + ImageComparer.TolerantPercentage(0.004f), provider, testOutputDetails: workingBufferLimitInRows, appendPixelTypeToFileName: false);