From 8e67153965a336e1734483a9f646e41dd9ae61de Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 15 Dec 2020 18:35:37 +0100 Subject: [PATCH 01/12] Port horizontal convolution processor, remove Y loop --- .../Convolution2PassProcessor{TPixel}.cs | 149 +++++++++++++++++- 1 file changed, 147 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs index 16ce0fdd75..ba4e0a6ad6 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs @@ -1,7 +1,10 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. +using System; using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using SixLabors.ImageSharp.Advanced; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; @@ -69,7 +72,7 @@ protected override void OnFrameApply(ImageFrame source) mapX.BuildSamplingOffsetMap(this.KernelX, interest); // Horizontal convolution - var horizontalOperation = new ConvolutionRowOperation( + var horizontalOperation = new HorizontalConvolutionRowOperation( interest, firstPassPixels, source.PixelBuffer, @@ -78,7 +81,7 @@ protected override void OnFrameApply(ImageFrame source) this.Configuration, this.PreserveAlpha); - ParallelRowIterator.IterateRows, Vector4>( + ParallelRowIterator.IterateRows( this.Configuration, operationBounds, in horizontalOperation); @@ -104,5 +107,147 @@ protected override void OnFrameApply(ImageFrame source) in verticalOperation); } } + + /// + /// A implementing the logic for the horizontal 1D convolution. 
+ /// + internal readonly struct HorizontalConvolutionRowOperation : IRowOperation + { + private readonly Rectangle bounds; + private readonly Buffer2D targetPixels; + private readonly Buffer2D sourcePixels; + private readonly KernelSamplingMap map; + private readonly DenseMatrix kernelMatrix; + private readonly Configuration configuration; + private readonly bool preserveAlpha; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public HorizontalConvolutionRowOperation( + Rectangle bounds, + Buffer2D targetPixels, + Buffer2D sourcePixels, + KernelSamplingMap map, + DenseMatrix kernelMatrix, + Configuration configuration, + bool preserveAlpha) + { + this.bounds = bounds; + this.targetPixels = targetPixels; + this.sourcePixels = sourcePixels; + this.map = map; + this.kernelMatrix = kernelMatrix; + this.configuration = configuration; + this.preserveAlpha = preserveAlpha; + } + + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Invoke(int y, Span span) + { + if (this.preserveAlpha) + { + this.Convolve3(y, span); + } + else + { + this.Convolve4(y, span); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void Convolve3(int y, Span span) + { + // Span is 2x bounds. + int boundsX = this.bounds.X; + int boundsWidth = this.bounds.Width; + Span sourceBuffer = span.Slice(0, this.bounds.Width); + Span targetBuffer = span.Slice(this.bounds.Width); + + var state = new ConvolutionState(in this.kernelMatrix, this.map); + ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); + + // Clear the target buffer for each row run. + targetBuffer.Clear(); + ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); + + // Get the precalculated source sample row for this kernel row and copy to our buffer. 
+ ReadOnlyKernel kernel = state.Kernel; + int sampleY = Unsafe.Add(ref sampleRowBase, 0); + Span sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); + + ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); + + for (int x = 0; x < sourceBuffer.Length; x++) + { + ref int sampleColumnBase = ref state.GetSampleColumn(x); + ref Vector4 target = ref Unsafe.Add(ref targetBase, x); + + for (int kX = 0; kX < kernel.Columns; kX++) + { + int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; + Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); + target += kernel[0, kX] * sample; + } + } + + // Now we need to copy the original alpha values from the source row. + sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth); + PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); + + for (int x = 0; x < sourceRow.Length; x++) + { + ref Vector4 target = ref Unsafe.Add(ref targetBase, x); + target.W = Unsafe.Add(ref MemoryMarshal.GetReference(sourceBuffer), x).W; + } + + Span targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth); + PixelOperations.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRow); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void Convolve4(int y, Span span) + { + // Span is 2x bounds. + int boundsX = this.bounds.X; + int boundsWidth = this.bounds.Width; + Span sourceBuffer = span.Slice(0, this.bounds.Width); + Span targetBuffer = span.Slice(this.bounds.Width); + + var state = new ConvolutionState(in this.kernelMatrix, this.map); + ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); + + // Clear the target buffer for each row run. 
+ targetBuffer.Clear(); + ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); + + // Get the precalculated source sample row for this kernel row and copy to our buffer. + ReadOnlyKernel kernel = state.Kernel; + int sampleY = Unsafe.Add(ref sampleRowBase, 0); + Span sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); + + Numerics.Premultiply(sourceBuffer); + ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); + + for (int x = 0; x < sourceBuffer.Length; x++) + { + ref int sampleColumnBase = ref state.GetSampleColumn(x); + ref Vector4 target = ref Unsafe.Add(ref targetBase, x); + + for (int kX = 0; kX < kernel.Columns; kX++) + { + int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; + Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); + target += kernel[0, kX] * sample; + } + } + + Numerics.UnPremultiply(targetBuffer); + + Span targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth); + PixelOperations.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRow); + } + } } } From a618b760d7e06348397e8c5277a7a6b5cc85a014 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 15 Dec 2020 18:37:56 +0100 Subject: [PATCH 02/12] Port vertical convolution processor, remove X loop --- .../Convolution2PassProcessor{TPixel}.cs | 147 +++++++++++++++++- 1 file changed, 145 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs index ba4e0a6ad6..7a472a207f 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs @@ -92,7 +92,7 @@ protected override void OnFrameApply(ImageFrame source) 
mapY.BuildSamplingOffsetMap(this.KernelY, interest); // Vertical convolution - var verticalOperation = new ConvolutionRowOperation( + var verticalOperation = new VerticalConvolutionRowOperation( interest, source.PixelBuffer, firstPassPixels, @@ -101,7 +101,7 @@ protected override void OnFrameApply(ImageFrame source) this.Configuration, this.PreserveAlpha); - ParallelRowIterator.IterateRows, Vector4>( + ParallelRowIterator.IterateRows( this.Configuration, operationBounds, in verticalOperation); @@ -249,5 +249,148 @@ private void Convolve4(int y, Span span) PixelOperations.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRow); } } + + /// + /// A implementing the logic for the vertical 1D convolution. + /// + internal readonly struct VerticalConvolutionRowOperation : IRowOperation + { + private readonly Rectangle bounds; + private readonly Buffer2D targetPixels; + private readonly Buffer2D sourcePixels; + private readonly KernelSamplingMap map; + private readonly DenseMatrix kernelMatrix; + private readonly Configuration configuration; + private readonly bool preserveAlpha; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public VerticalConvolutionRowOperation( + Rectangle bounds, + Buffer2D targetPixels, + Buffer2D sourcePixels, + KernelSamplingMap map, + DenseMatrix kernelMatrix, + Configuration configuration, + bool preserveAlpha) + { + this.bounds = bounds; + this.targetPixels = targetPixels; + this.sourcePixels = sourcePixels; + this.map = map; + this.kernelMatrix = kernelMatrix; + this.configuration = configuration; + this.preserveAlpha = preserveAlpha; + } + + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Invoke(int y, Span span) + { + if (this.preserveAlpha) + { + this.Convolve3(y, span); + } + else + { + this.Convolve4(y, span); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void Convolve3(int y, Span span) + { + // Span is 2x bounds. 
+ int boundsX = this.bounds.X; + int boundsWidth = this.bounds.Width; + Span sourceBuffer = span.Slice(0, this.bounds.Width); + Span targetBuffer = span.Slice(this.bounds.Width); + + var state = new ConvolutionState(in this.kernelMatrix, this.map); + ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); + + // Clear the target buffer for each row run. + targetBuffer.Clear(); + ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); + + ReadOnlyKernel kernel = state.Kernel; + Span sourceRow; + for (int kY = 0; kY < kernel.Rows; kY++) + { + // Get the precalculated source sample row for this kernel row and copy to our buffer. + int sampleY = Unsafe.Add(ref sampleRowBase, kY); + sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); + + ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); + + for (int x = 0; x < sourceBuffer.Length; x++) + { + ref int sampleColumnBase = ref state.GetSampleColumn(x); + ref Vector4 target = ref Unsafe.Add(ref targetBase, x); + int sampleX = Unsafe.Add(ref sampleColumnBase, 0) - boundsX; + Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); + + target += kernel[kY, 0] * sample; + } + } + + // Now we need to copy the original alpha values from the source row. 
+ sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth); + PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); + + for (int x = 0; x < sourceRow.Length; x++) + { + ref Vector4 target = ref Unsafe.Add(ref targetBase, x); + target.W = Unsafe.Add(ref MemoryMarshal.GetReference(sourceBuffer), x).W; + } + + Span targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth); + PixelOperations.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRow); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void Convolve4(int y, Span span) + { + // Span is 2x bounds. + int boundsX = this.bounds.X; + int boundsWidth = this.bounds.Width; + Span sourceBuffer = span.Slice(0, this.bounds.Width); + Span targetBuffer = span.Slice(this.bounds.Width); + + var state = new ConvolutionState(in this.kernelMatrix, this.map); + ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); + + // Clear the target buffer for each row run. + targetBuffer.Clear(); + ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); + + ReadOnlyKernel kernel = state.Kernel; + for (int kY = 0; kY < kernel.Rows; kY++) + { + // Get the precalculated source sample row for this kernel row and copy to our buffer. 
+ int sampleY = Unsafe.Add(ref sampleRowBase, kY); + Span sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); + + Numerics.Premultiply(sourceBuffer); + ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); + + for (int x = 0; x < sourceBuffer.Length; x++) + { + ref int sampleColumnBase = ref state.GetSampleColumn(x); + ref Vector4 target = ref Unsafe.Add(ref targetBase, x); + int sampleX = Unsafe.Add(ref sampleColumnBase, 0) - boundsX; + Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); + + target += kernel[kY, 0] * sample; + } + } + + Numerics.UnPremultiply(targetBuffer); + + Span targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth); + PixelOperations.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRow); + } + } } } From f52802d996635c699a02cefda1085b92d41409c8 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 15 Dec 2020 18:50:34 +0100 Subject: [PATCH 03/12] Remove unnecessary inner loop coordinate sampling --- .../Convolution2PassProcessor{TPixel}.cs | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs index 7a472a207f..3b9130f3b9 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs @@ -164,7 +164,6 @@ private void Convolve3(int y, Span span) Span targetBuffer = span.Slice(this.bounds.Width); var state = new ConvolutionState(in this.kernelMatrix, this.map); - ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); // Clear the target buffer for each row run. 
targetBuffer.Clear(); @@ -172,8 +171,7 @@ private void Convolve3(int y, Span span) // Get the precalculated source sample row for this kernel row and copy to our buffer. ReadOnlyKernel kernel = state.Kernel; - int sampleY = Unsafe.Add(ref sampleRowBase, 0); - Span sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + Span sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth); PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); @@ -215,7 +213,6 @@ private void Convolve4(int y, Span span) Span targetBuffer = span.Slice(this.bounds.Width); var state = new ConvolutionState(in this.kernelMatrix, this.map); - ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); // Clear the target buffer for each row run. targetBuffer.Clear(); @@ -223,8 +220,7 @@ private void Convolve4(int y, Span span) // Get the precalculated source sample row for this kernel row and copy to our buffer. 
ReadOnlyKernel kernel = state.Kernel; - int sampleY = Unsafe.Add(ref sampleRowBase, 0); - Span sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + Span sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth); PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); Numerics.Premultiply(sourceBuffer); @@ -325,10 +321,8 @@ private void Convolve3(int y, Span span) for (int x = 0; x < sourceBuffer.Length; x++) { - ref int sampleColumnBase = ref state.GetSampleColumn(x); ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - int sampleX = Unsafe.Add(ref sampleColumnBase, 0) - boundsX; - Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); + Vector4 sample = Unsafe.Add(ref sourceBase, x); target += kernel[kY, 0] * sample; } @@ -377,10 +371,8 @@ private void Convolve4(int y, Span span) for (int x = 0; x < sourceBuffer.Length; x++) { - ref int sampleColumnBase = ref state.GetSampleColumn(x); ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - int sampleX = Unsafe.Add(ref sampleColumnBase, 0) - boundsX; - Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); + Vector4 sample = Unsafe.Add(ref sourceBase, x); target += kernel[kY, 0] * sample; } From a9c165294e3424862b299cf12a1eedaef7d138b9 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 15 Dec 2020 18:52:10 +0100 Subject: [PATCH 04/12] Switch to shared sampling map for convolution passes --- .../Convolution2PassProcessor{TPixel}.cs | 74 +++++++++---------- 1 file changed, 35 insertions(+), 39 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs index 3b9130f3b9..c2f3ec59a1 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs @@ -67,45 +67,41 @@ protected 
override void OnFrameApply(ImageFrame source) // for source and target bulk pixel conversion. var operationBounds = new Rectangle(interest.X, interest.Y, interest.Width * 2, interest.Height); - using (var mapX = new KernelSamplingMap(this.Configuration.MemoryAllocator)) - { - mapX.BuildSamplingOffsetMap(this.KernelX, interest); - - // Horizontal convolution - var horizontalOperation = new HorizontalConvolutionRowOperation( - interest, - firstPassPixels, - source.PixelBuffer, - mapX, - this.KernelX, - this.Configuration, - this.PreserveAlpha); - - ParallelRowIterator.IterateRows( - this.Configuration, - operationBounds, - in horizontalOperation); - } - - using (var mapY = new KernelSamplingMap(this.Configuration.MemoryAllocator)) - { - mapY.BuildSamplingOffsetMap(this.KernelY, interest); - - // Vertical convolution - var verticalOperation = new VerticalConvolutionRowOperation( - interest, - source.PixelBuffer, - firstPassPixels, - mapY, - this.KernelY, - this.Configuration, - this.PreserveAlpha); - - ParallelRowIterator.IterateRows( - this.Configuration, - operationBounds, - in verticalOperation); - } + // We can create a single sampling map with the size as if we were using the non separated 2D kernel + // the two 1D kernels represent, and reuse it across both convolution steps, like in the bokeh blur. 
+ using var mapXY = new KernelSamplingMap(this.Configuration.MemoryAllocator); + + mapXY.BuildSamplingOffsetMap(this.KernelY.Rows, this.KernelX.Columns, interest); + + // Horizontal convolution + var horizontalOperation = new HorizontalConvolutionRowOperation( + interest, + firstPassPixels, + source.PixelBuffer, + mapXY, + this.KernelX, + this.Configuration, + this.PreserveAlpha); + + ParallelRowIterator.IterateRows( + this.Configuration, + operationBounds, + in horizontalOperation); + + // Vertical convolution + var verticalOperation = new VerticalConvolutionRowOperation( + interest, + source.PixelBuffer, + firstPassPixels, + mapXY, + this.KernelY, + this.Configuration, + this.PreserveAlpha); + + ParallelRowIterator.IterateRows( + this.Configuration, + operationBounds, + in verticalOperation); } /// From e60827f3d76cd65ee9486390266ef739b457c560 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 15 Dec 2020 19:13:18 +0100 Subject: [PATCH 05/12] Remove convolution state, more optimizations --- .../Convolution/BokehBlurProcessor.cs | 3 +- .../Convolution2PassProcessor{TPixel}.cs | 60 ++++++++++++------- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs index d4fb27a57f..55cef5df54 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs @@ -129,8 +129,7 @@ public void Invoke(int y) int boundsWidth = this.bounds.Width; int kernelSize = this.kernel.Length; - Span rowOffsets = this.map.GetRowOffsetSpan(); - ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), (y - this.bounds.Y) * kernelSize); + ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(this.map.GetRowOffsetSpan()), (y - this.bounds.Y) * kernelSize); // The target buffer is zeroed initially and then it 
accumulates the results // of each partial convolution, so we don't have to clear it here as well diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs index c2f3ec59a1..c7f5c94dd2 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs @@ -156,33 +156,38 @@ private void Convolve3(int y, Span span) // Span is 2x bounds. int boundsX = this.bounds.X; int boundsWidth = this.bounds.Width; + int kernelSize = this.kernelMatrix.Columns; + Span sourceBuffer = span.Slice(0, this.bounds.Width); Span targetBuffer = span.Slice(this.bounds.Width); - var state = new ConvolutionState(in this.kernelMatrix, this.map); - // Clear the target buffer for each row run. targetBuffer.Clear(); ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); // Get the precalculated source sample row for this kernel row and copy to our buffer. 
- ReadOnlyKernel kernel = state.Kernel; Span sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth); PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); + ref float kernelBase = ref this.kernelMatrix[0, 0]; + ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan()); for (int x = 0; x < sourceBuffer.Length; x++) { - ref int sampleColumnBase = ref state.GetSampleColumn(x); ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - for (int kX = 0; kX < kernel.Columns; kX++) + for (int kX = 0; kX < kernelSize; kX++) { int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); - target += kernel[0, kX] * sample; + float factor = Unsafe.Add(ref kernelBase, kX); + + target += factor * sample; } + + // Shift the base column sampling reference by one row + sampleColumnBase = ref Unsafe.Add(ref sampleColumnBase, kernelSize); } // Now we need to copy the original alpha values from the source row. @@ -205,34 +210,39 @@ private void Convolve4(int y, Span span) // Span is 2x bounds. int boundsX = this.bounds.X; int boundsWidth = this.bounds.Width; + int kernelSize = this.kernelMatrix.Columns; + Span sourceBuffer = span.Slice(0, this.bounds.Width); Span targetBuffer = span.Slice(this.bounds.Width); - var state = new ConvolutionState(in this.kernelMatrix, this.map); - // Clear the target buffer for each row run. targetBuffer.Clear(); ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); // Get the precalculated source sample row for this kernel row and copy to our buffer. 
- ReadOnlyKernel kernel = state.Kernel; Span sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth); PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); Numerics.Premultiply(sourceBuffer); + ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); + ref float kernelBase = ref this.kernelMatrix[0, 0]; + ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan()); for (int x = 0; x < sourceBuffer.Length; x++) { - ref int sampleColumnBase = ref state.GetSampleColumn(x); ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - for (int kX = 0; kX < kernel.Columns; kX++) + for (int kX = 0; kX < kernelSize; kX++) { int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); - target += kernel[0, kX] * sample; + float factor = Unsafe.Add(ref kernelBase, kX); + + target += factor * sample; } + + sampleColumnBase = ref Unsafe.Add(ref sampleColumnBase, kernelSize); } Numerics.UnPremultiply(targetBuffer); @@ -294,33 +304,37 @@ private void Convolve3(int y, Span span) // Span is 2x bounds. int boundsX = this.bounds.X; int boundsWidth = this.bounds.Width; + int kernelSize = this.kernelMatrix.Rows; + Span sourceBuffer = span.Slice(0, this.bounds.Width); Span targetBuffer = span.Slice(this.bounds.Width); - var state = new ConvolutionState(in this.kernelMatrix, this.map); - ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); + ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(this.map.GetRowOffsetSpan()), (y - this.bounds.Y) * kernelSize); // Clear the target buffer for each row run. 
targetBuffer.Clear(); + ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); + ref float kernelBase = ref this.kernelMatrix[0, 0]; - ReadOnlyKernel kernel = state.Kernel; Span sourceRow; - for (int kY = 0; kY < kernel.Rows; kY++) + for (int kY = 0; kY < kernelSize; kY++) { // Get the precalculated source sample row for this kernel row and copy to our buffer. int sampleY = Unsafe.Add(ref sampleRowBase, kY); sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); + float factor = Unsafe.Add(ref kernelBase, kY); for (int x = 0; x < sourceBuffer.Length; x++) { ref Vector4 target = ref Unsafe.Add(ref targetBase, x); Vector4 sample = Unsafe.Add(ref sourceBase, x); - target += kernel[kY, 0] * sample; + target += factor * sample; } } @@ -344,6 +358,8 @@ private void Convolve4(int y, Span span) // Span is 2x bounds. int boundsX = this.bounds.X; int boundsWidth = this.bounds.Width; + int kernelSize = this.kernelMatrix.Rows; + Span sourceBuffer = span.Slice(0, this.bounds.Width); Span targetBuffer = span.Slice(this.bounds.Width); @@ -352,25 +368,29 @@ private void Convolve4(int y, Span span) // Clear the target buffer for each row run. targetBuffer.Clear(); + ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); + ref float kernelBase = ref this.kernelMatrix[0, 0]; - ReadOnlyKernel kernel = state.Kernel; - for (int kY = 0; kY < kernel.Rows; kY++) + for (int kY = 0; kY < kernelSize; kY++) { // Get the precalculated source sample row for this kernel row and copy to our buffer. 
int sampleY = Unsafe.Add(ref sampleRowBase, kY); Span sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); Numerics.Premultiply(sourceBuffer); + ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); + float factor = Unsafe.Add(ref kernelBase, kY); for (int x = 0; x < sourceBuffer.Length; x++) { ref Vector4 target = ref Unsafe.Add(ref targetBase, x); Vector4 sample = Unsafe.Add(ref sourceBase, x); - target += kernel[kY, 0] * sample; + target += factor * sample; } } From e57423218628abe26899fbfccaeca5397dcefa32 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 15 Dec 2020 19:20:45 +0100 Subject: [PATCH 06/12] Remove transposed 1D kernels, switch to float[] type --- .../Convolution/BoxBlurProcessor{TPixel}.cs | 23 ++++---- .../Convolution2PassProcessor{TPixel}.cs | 55 ++++++++----------- .../ConvolutionProcessorHelpers.cs | 28 +++++----- .../GaussianBlurProcessor{TPixel}.cs | 14 ++--- .../GaussianSharpenProcessor{TPixel}.cs | 14 ++--- 5 files changed, 54 insertions(+), 80 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/BoxBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BoxBlurProcessor{TPixel}.cs index 8c5358770c..5beadb0cee 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BoxBlurProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BoxBlurProcessor{TPixel}.cs @@ -1,6 +1,7 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. 
+using System; using SixLabors.ImageSharp.PixelFormats; namespace SixLabors.ImageSharp.Processing.Processors.Convolution @@ -23,24 +24,18 @@ public BoxBlurProcessor(Configuration configuration, BoxBlurProcessor definition : base(configuration, source, sourceRectangle) { int kernelSize = (definition.Radius * 2) + 1; - this.KernelX = CreateBoxKernel(kernelSize); - this.KernelY = this.KernelX.Transpose(); + this.Kernel = CreateBoxKernel(kernelSize); } /// - /// Gets the horizontal gradient operator. + /// Gets the 1D convolution kernel. /// - public DenseMatrix KernelX { get; } - - /// - /// Gets the vertical gradient operator. - /// - public DenseMatrix KernelY { get; } + public float[] Kernel { get; } /// protected override void OnFrameApply(ImageFrame source) { - using var processor = new Convolution2PassProcessor(this.Configuration, this.KernelX, this.KernelY, false, this.Source, this.SourceRectangle); + using var processor = new Convolution2PassProcessor(this.Configuration, this.Kernel, false, this.Source, this.SourceRectangle); processor.Apply(source); } @@ -50,10 +45,12 @@ protected override void OnFrameApply(ImageFrame source) /// /// The maximum size of the kernel in either direction. /// The . 
- private static DenseMatrix CreateBoxKernel(int kernelSize) + private static float[] CreateBoxKernel(int kernelSize) { - var kernel = new DenseMatrix(kernelSize, 1); - kernel.Fill(1F / kernelSize); + var kernel = new float[kernelSize]; + + kernel.AsSpan().Fill(1F / kernelSize); + return kernel; } } diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs index c7f5c94dd2..9b7ed75808 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs @@ -22,34 +22,26 @@ internal class Convolution2PassProcessor : ImageProcessor /// Initializes a new instance of the class. /// /// The configuration which allows altering default behaviour or extending the library. - /// The horizontal gradient operator. - /// The vertical gradient operator. + /// The 1D convolution kernel. /// Whether the convolution filter is applied to alpha as well as the color channels. /// The source for the current processor instance. /// The source area to process for the current processor instance. public Convolution2PassProcessor( Configuration configuration, - in DenseMatrix kernelX, - in DenseMatrix kernelY, + float[] kernel, bool preserveAlpha, Image source, Rectangle sourceRectangle) : base(configuration, source, sourceRectangle) { - this.KernelX = kernelX; - this.KernelY = kernelY; + this.Kernel = kernel; this.PreserveAlpha = preserveAlpha; } /// - /// Gets the horizontal convolution kernel. + /// Gets the convolution kernel. /// - public DenseMatrix KernelX { get; } - - /// - /// Gets the vertical convolution kernel. - /// - public DenseMatrix KernelY { get; } + public float[] Kernel { get; } /// /// Gets a value indicating whether the convolution filter is applied to alpha as well as the color channels. 
@@ -71,7 +63,7 @@ protected override void OnFrameApply(ImageFrame source) // the two 1D kernels represent, and reuse it across both convolution steps, like in the bokeh blur. using var mapXY = new KernelSamplingMap(this.Configuration.MemoryAllocator); - mapXY.BuildSamplingOffsetMap(this.KernelY.Rows, this.KernelX.Columns, interest); + mapXY.BuildSamplingOffsetMap(this.Kernel.Length, this.Kernel.Length, interest); // Horizontal convolution var horizontalOperation = new HorizontalConvolutionRowOperation( @@ -79,7 +71,7 @@ protected override void OnFrameApply(ImageFrame source) firstPassPixels, source.PixelBuffer, mapXY, - this.KernelX, + this.Kernel, this.Configuration, this.PreserveAlpha); @@ -94,7 +86,7 @@ protected override void OnFrameApply(ImageFrame source) source.PixelBuffer, firstPassPixels, mapXY, - this.KernelY, + this.Kernel, this.Configuration, this.PreserveAlpha); @@ -113,7 +105,7 @@ protected override void OnFrameApply(ImageFrame source) private readonly Buffer2D targetPixels; private readonly Buffer2D sourcePixels; private readonly KernelSamplingMap map; - private readonly DenseMatrix kernelMatrix; + private readonly float[] kernel; private readonly Configuration configuration; private readonly bool preserveAlpha; @@ -123,7 +115,7 @@ public HorizontalConvolutionRowOperation( Buffer2D targetPixels, Buffer2D sourcePixels, KernelSamplingMap map, - DenseMatrix kernelMatrix, + float[] kernel, Configuration configuration, bool preserveAlpha) { @@ -131,7 +123,7 @@ public HorizontalConvolutionRowOperation( this.targetPixels = targetPixels; this.sourcePixels = sourcePixels; this.map = map; - this.kernelMatrix = kernelMatrix; + this.kernel = kernel; this.configuration = configuration; this.preserveAlpha = preserveAlpha; } @@ -156,7 +148,7 @@ private void Convolve3(int y, Span span) // Span is 2x bounds. 
int boundsX = this.bounds.X; int boundsWidth = this.bounds.Width; - int kernelSize = this.kernelMatrix.Columns; + int kernelSize = this.kernel.Length; Span sourceBuffer = span.Slice(0, this.bounds.Width); Span targetBuffer = span.Slice(this.bounds.Width); @@ -170,7 +162,7 @@ private void Convolve3(int y, Span span) PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); - ref float kernelBase = ref this.kernelMatrix[0, 0]; + ref float kernelBase = ref this.kernel[0]; ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan()); for (int x = 0; x < sourceBuffer.Length; x++) @@ -210,7 +202,7 @@ private void Convolve4(int y, Span span) // Span is 2x bounds. int boundsX = this.bounds.X; int boundsWidth = this.bounds.Width; - int kernelSize = this.kernelMatrix.Columns; + int kernelSize = this.kernel.Length; Span sourceBuffer = span.Slice(0, this.bounds.Width); Span targetBuffer = span.Slice(this.bounds.Width); @@ -226,7 +218,7 @@ private void Convolve4(int y, Span span) Numerics.Premultiply(sourceBuffer); ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); - ref float kernelBase = ref this.kernelMatrix[0, 0]; + ref float kernelBase = ref this.kernel[0]; ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan()); for (int x = 0; x < sourceBuffer.Length; x++) @@ -261,7 +253,7 @@ private void Convolve4(int y, Span span) private readonly Buffer2D targetPixels; private readonly Buffer2D sourcePixels; private readonly KernelSamplingMap map; - private readonly DenseMatrix kernelMatrix; + private readonly float[] kernel; private readonly Configuration configuration; private readonly bool preserveAlpha; @@ -271,7 +263,7 @@ public VerticalConvolutionRowOperation( Buffer2D targetPixels, Buffer2D sourcePixels, KernelSamplingMap map, - DenseMatrix kernelMatrix, + float[] kernel, Configuration configuration, 
bool preserveAlpha) { @@ -279,7 +271,7 @@ public VerticalConvolutionRowOperation( this.targetPixels = targetPixels; this.sourcePixels = sourcePixels; this.map = map; - this.kernelMatrix = kernelMatrix; + this.kernel = kernel; this.configuration = configuration; this.preserveAlpha = preserveAlpha; } @@ -304,7 +296,7 @@ private void Convolve3(int y, Span span) // Span is 2x bounds. int boundsX = this.bounds.X; int boundsWidth = this.bounds.Width; - int kernelSize = this.kernelMatrix.Rows; + int kernelSize = this.kernel.Length; Span sourceBuffer = span.Slice(0, this.bounds.Width); Span targetBuffer = span.Slice(this.bounds.Width); @@ -315,7 +307,7 @@ private void Convolve3(int y, Span span) targetBuffer.Clear(); ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); - ref float kernelBase = ref this.kernelMatrix[0, 0]; + ref float kernelBase = ref this.kernel[0]; Span sourceRow; for (int kY = 0; kY < kernelSize; kY++) @@ -358,19 +350,18 @@ private void Convolve4(int y, Span span) // Span is 2x bounds. int boundsX = this.bounds.X; int boundsWidth = this.bounds.Width; - int kernelSize = this.kernelMatrix.Rows; + int kernelSize = this.kernel.Length; Span sourceBuffer = span.Slice(0, this.bounds.Width); Span targetBuffer = span.Slice(this.bounds.Width); - var state = new ConvolutionState(in this.kernelMatrix, this.map); - ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); + ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(this.map.GetRowOffsetSpan()), (y - this.bounds.Y) * kernelSize); // Clear the target buffer for each row run. 
targetBuffer.Clear(); ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); - ref float kernelBase = ref this.kernelMatrix[0, 0]; + ref float kernelBase = ref this.kernel[0]; for (int kY = 0; kY < kernelSize; kY++) { diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessorHelpers.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessorHelpers.cs index 9844f99563..f93cdabc47 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessorHelpers.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessorHelpers.cs @@ -12,17 +12,15 @@ internal static class ConvolutionProcessorHelpers /// See . /// internal static int GetDefaultGaussianRadius(float sigma) - { - return (int)MathF.Ceiling(sigma * 3); - } + => (int)MathF.Ceiling(sigma * 3); /// /// Create a 1 dimensional Gaussian kernel using the Gaussian G(x) function. /// - /// The . - internal static DenseMatrix CreateGaussianBlurKernel(int size, float weight) + /// The convolution kernel. + internal static float[] CreateGaussianBlurKernel(int size, float weight) { - var kernel = new DenseMatrix(size, 1); + var kernel = new float[size]; float sum = 0F; float midpoint = (size - 1) / 2F; @@ -32,13 +30,13 @@ internal static DenseMatrix CreateGaussianBlurKernel(int size, float weig float x = i - midpoint; float gx = Numerics.Gaussian(x, weight); sum += gx; - kernel[0, i] = gx; + kernel[i] = gx; } // Normalize kernel so that the sum of all weights equals 1 for (int i = 0; i < size; i++) { - kernel[0, i] /= sum; + kernel[i] /= sum; } return kernel; @@ -47,10 +45,10 @@ internal static DenseMatrix CreateGaussianBlurKernel(int size, float weig /// /// Create a 1 dimensional Gaussian kernel using the Gaussian G(x) function /// - /// The . - internal static DenseMatrix CreateGaussianSharpenKernel(int size, float weight) + /// The convolution kernel. 
+ internal static float[] CreateGaussianSharpenKernel(int size, float weight) { - var kernel = new DenseMatrix(size, 1); + var kernel = new float[size]; float sum = 0; @@ -60,7 +58,7 @@ internal static DenseMatrix CreateGaussianSharpenKernel(int size, float w float x = i - midpoint; float gx = Numerics.Gaussian(x, weight); sum += gx; - kernel[0, i] = gx; + kernel[i] = gx; } // Invert the kernel for sharpening. @@ -70,19 +68,19 @@ internal static DenseMatrix CreateGaussianSharpenKernel(int size, float w if (i == midpointRounded) { // Calculate central value - kernel[0, i] = (2F * sum) - kernel[0, i]; + kernel[i] = (2F * sum) - kernel[i]; } else { // invert value - kernel[0, i] = -kernel[0, i]; + kernel[i] = -kernel[i]; } } // Normalize kernel so that the sum of all weights equals 1 for (int i = 0; i < size; i++) { - kernel[0, i] /= sum; + kernel[i] /= sum; } return kernel; diff --git a/src/ImageSharp/Processing/Processors/Convolution/GaussianBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/GaussianBlurProcessor{TPixel}.cs index a9b692a015..4ade01f914 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/GaussianBlurProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/GaussianBlurProcessor{TPixel}.cs @@ -27,24 +27,18 @@ public GaussianBlurProcessor( : base(configuration, source, sourceRectangle) { int kernelSize = (definition.Radius * 2) + 1; - this.KernelX = ConvolutionProcessorHelpers.CreateGaussianBlurKernel(kernelSize, definition.Sigma); - this.KernelY = this.KernelX.Transpose(); + this.Kernel = ConvolutionProcessorHelpers.CreateGaussianBlurKernel(kernelSize, definition.Sigma); } /// - /// Gets the horizontal gradient operator. + /// Gets the 1D convolution kernel. /// - public DenseMatrix KernelX { get; } - - /// - /// Gets the vertical gradient operator. 
- /// - public DenseMatrix KernelY { get; } + public float[] Kernel { get; } /// protected override void OnFrameApply(ImageFrame source) { - using var processor = new Convolution2PassProcessor(this.Configuration, this.KernelX, this.KernelY, false, this.Source, this.SourceRectangle); + using var processor = new Convolution2PassProcessor(this.Configuration, this.Kernel, false, this.Source, this.SourceRectangle); processor.Apply(source); } diff --git a/src/ImageSharp/Processing/Processors/Convolution/GaussianSharpenProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/GaussianSharpenProcessor{TPixel}.cs index 5e20865e5c..73aaaec188 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/GaussianSharpenProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/GaussianSharpenProcessor{TPixel}.cs @@ -27,24 +27,18 @@ public GaussianSharpenProcessor( : base(configuration, source, sourceRectangle) { int kernelSize = (definition.Radius * 2) + 1; - this.KernelX = ConvolutionProcessorHelpers.CreateGaussianSharpenKernel(kernelSize, definition.Sigma); - this.KernelY = this.KernelX.Transpose(); + this.Kernel = ConvolutionProcessorHelpers.CreateGaussianSharpenKernel(kernelSize, definition.Sigma); } /// - /// Gets the horizontal gradient operator. + /// Gets the 1D convolution kernel. /// - public DenseMatrix KernelX { get; } - - /// - /// Gets the vertical gradient operator. 
- /// - public DenseMatrix KernelY { get; } + public float[] Kernel { get; } /// protected override void OnFrameApply(ImageFrame source) { - using var processor = new Convolution2PassProcessor(this.Configuration, this.KernelX, this.KernelY, false, this.Source, this.SourceRectangle); + using var processor = new Convolution2PassProcessor(this.Configuration, this.Kernel, false, this.Source, this.SourceRectangle); processor.Apply(source); } From 5a383075c0048081602a0280a487461e9f970c60 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 15 Dec 2020 19:23:38 +0100 Subject: [PATCH 07/12] Remove leftover ConvolutionRowOperation type --- .../ConvolutionRowOperation{TPixel}.cs | 163 ------------------ 1 file changed, 163 deletions(-) delete mode 100644 src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs deleted file mode 100644 index 9876b2885b..0000000000 --- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (c) Six Labors. -// Licensed under the Apache License, Version 2.0. - -using System; -using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using SixLabors.ImageSharp.Advanced; -using SixLabors.ImageSharp.Memory; -using SixLabors.ImageSharp.PixelFormats; - -namespace SixLabors.ImageSharp.Processing.Processors.Convolution -{ - /// - /// A implementing the logic for 1D convolution. 
- /// - internal readonly struct ConvolutionRowOperation : IRowOperation - where TPixel : unmanaged, IPixel - { - private readonly Rectangle bounds; - private readonly Buffer2D targetPixels; - private readonly Buffer2D sourcePixels; - private readonly KernelSamplingMap map; - private readonly DenseMatrix kernelMatrix; - private readonly Configuration configuration; - private readonly bool preserveAlpha; - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public ConvolutionRowOperation( - Rectangle bounds, - Buffer2D targetPixels, - Buffer2D sourcePixels, - KernelSamplingMap map, - DenseMatrix kernelMatrix, - Configuration configuration, - bool preserveAlpha) - { - this.bounds = bounds; - this.targetPixels = targetPixels; - this.sourcePixels = sourcePixels; - this.map = map; - this.kernelMatrix = kernelMatrix; - this.configuration = configuration; - this.preserveAlpha = preserveAlpha; - } - - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void Invoke(int y, Span span) - { - if (this.preserveAlpha) - { - this.Convolve3(y, span); - } - else - { - this.Convolve4(y, span); - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private void Convolve3(int y, Span span) - { - // Span is 2x bounds. - int boundsX = this.bounds.X; - int boundsWidth = this.bounds.Width; - Span sourceBuffer = span.Slice(0, this.bounds.Width); - Span targetBuffer = span.Slice(this.bounds.Width); - - var state = new ConvolutionState(in this.kernelMatrix, this.map); - ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); - - // Clear the target buffer for each row run. - targetBuffer.Clear(); - ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); - - ReadOnlyKernel kernel = state.Kernel; - Span sourceRow; - for (int kY = 0; kY < kernel.Rows; kY++) - { - // Get the precalculated source sample row for this kernel row and copy to our buffer. 
- int sampleY = Unsafe.Add(ref sampleRowBase, kY); - sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); - PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); - - ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); - - for (int x = 0; x < sourceBuffer.Length; x++) - { - ref int sampleColumnBase = ref state.GetSampleColumn(x); - ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - - for (int kX = 0; kX < kernel.Columns; kX++) - { - int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; - Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); - target += kernel[kY, kX] * sample; - } - } - } - - // Now we need to copy the original alpha values from the source row. - sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth); - PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); - - for (int x = 0; x < sourceRow.Length; x++) - { - ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - target.W = Unsafe.Add(ref MemoryMarshal.GetReference(sourceBuffer), x).W; - } - - Span targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth); - PixelOperations.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRow); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private void Convolve4(int y, Span span) - { - // Span is 2x bounds. - int boundsX = this.bounds.X; - int boundsWidth = this.bounds.Width; - Span sourceBuffer = span.Slice(0, this.bounds.Width); - Span targetBuffer = span.Slice(this.bounds.Width); - - var state = new ConvolutionState(in this.kernelMatrix, this.map); - ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); - - // Clear the target buffer for each row run. 
- targetBuffer.Clear(); - ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); - - ReadOnlyKernel kernel = state.Kernel; - for (int kY = 0; kY < kernel.Rows; kY++) - { - // Get the precalculated source sample row for this kernel row and copy to our buffer. - int sampleY = Unsafe.Add(ref sampleRowBase, kY); - Span sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); - PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); - - Numerics.Premultiply(sourceBuffer); - ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); - - for (int x = 0; x < sourceBuffer.Length; x++) - { - ref int sampleColumnBase = ref state.GetSampleColumn(x); - ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - - for (int kX = 0; kX < kernel.Columns; kX++) - { - int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; - Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); - target += kernel[kY, kX] * sample; - } - } - } - - Numerics.UnPremultiply(targetBuffer); - - Span targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth); - PixelOperations.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRow); - } - } -} From e11adc6a33ccf1304bbaaf714d6253285105e657 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 15 Dec 2020 19:37:25 +0100 Subject: [PATCH 08/12] Minor code tweaks --- .../Convolution/Convolution2PassProcessor{TPixel}.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs index 9b7ed75808..d407e551ee 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs @@ -155,13 +155,13 @@ private void Convolve3(int y, Span span) // Clear the 
target buffer for each row run. targetBuffer.Clear(); - ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); // Get the precalculated source sample row for this kernel row and copy to our buffer. Span sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth); PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); + ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); ref float kernelBase = ref this.kernel[0]; ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan()); @@ -209,7 +209,6 @@ private void Convolve4(int y, Span span) // Clear the target buffer for each row run. targetBuffer.Clear(); - ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); // Get the precalculated source sample row for this kernel row and copy to our buffer. Span sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth); @@ -218,6 +217,7 @@ private void Convolve4(int y, Span span) Numerics.Premultiply(sourceBuffer); ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); + ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); ref float kernelBase = ref this.kernel[0]; ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan()); From cb5c868eda4a5c15423ad1cd3907569ebbbf1a34 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 15 Dec 2020 22:07:12 +0100 Subject: [PATCH 09/12] More performance improvements to 2 pass convolution --- .../Convolution2PassProcessor{TPixel}.cs | 116 +++++++++++------- 1 file changed, 72 insertions(+), 44 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs index d407e551ee..365b2e2dfc 100644 --- 
a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs @@ -161,24 +161,28 @@ private void Convolve3(int y, Span span) PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); - ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); + ref Vector4 targetStart = ref MemoryMarshal.GetReference(targetBuffer); + ref Vector4 targetEnd = ref Unsafe.Add(ref targetStart, sourceBuffer.Length); ref float kernelBase = ref this.kernel[0]; + ref float kernelEnd = ref Unsafe.Add(ref kernelBase, kernelSize); ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan()); - for (int x = 0; x < sourceBuffer.Length; x++) + while (Unsafe.IsAddressLessThan(ref targetStart, ref targetEnd)) { - ref Vector4 target = ref Unsafe.Add(ref targetBase, x); + ref float kernelStart = ref kernelBase; + ref int sampleColumnStart = ref sampleColumnBase; - for (int kX = 0; kX < kernelSize; kX++) + while (Unsafe.IsAddressLessThan(ref kernelStart, ref kernelEnd)) { - int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; - Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); - float factor = Unsafe.Add(ref kernelBase, kX); + Vector4 sample = Unsafe.Add(ref sourceBase, sampleColumnStart - boundsX); + + targetStart += kernelStart * sample; - target += factor * sample; + kernelStart = ref Unsafe.Add(ref kernelStart, 1); + sampleColumnStart = ref Unsafe.Add(ref sampleColumnStart, 1); } - // Shift the base column sampling reference by one row + targetStart = ref Unsafe.Add(ref targetStart, 1); sampleColumnBase = ref Unsafe.Add(ref sampleColumnBase, kernelSize); } @@ -186,10 +190,14 @@ private void Convolve3(int y, Span span) sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth); 
PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); - for (int x = 0; x < sourceRow.Length; x++) + targetStart = ref MemoryMarshal.GetReference(targetBuffer); + + while (Unsafe.IsAddressLessThan(ref targetStart, ref targetEnd)) { - ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - target.W = Unsafe.Add(ref MemoryMarshal.GetReference(sourceBuffer), x).W; + targetStart.W = sourceBase.W; + + targetStart = ref Unsafe.Add(ref targetStart, 1); + sourceBase = ref Unsafe.Add(ref sourceBase, 1); } Span targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth); @@ -217,23 +225,28 @@ private void Convolve4(int y, Span span) Numerics.Premultiply(sourceBuffer); ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); - ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); + ref Vector4 targetStart = ref MemoryMarshal.GetReference(targetBuffer); + ref Vector4 targetEnd = ref Unsafe.Add(ref targetStart, sourceBuffer.Length); ref float kernelBase = ref this.kernel[0]; + ref float kernelEnd = ref Unsafe.Add(ref kernelBase, kernelSize); ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan()); - for (int x = 0; x < sourceBuffer.Length; x++) + while (Unsafe.IsAddressLessThan(ref targetStart, ref targetEnd)) { - ref Vector4 target = ref Unsafe.Add(ref targetBase, x); + ref float kernelStart = ref kernelBase; + ref int sampleColumnStart = ref sampleColumnBase; - for (int kX = 0; kX < kernelSize; kX++) + while (Unsafe.IsAddressLessThan(ref kernelStart, ref kernelEnd)) { - int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; - Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); - float factor = Unsafe.Add(ref kernelBase, kX); + Vector4 sample = Unsafe.Add(ref sourceBase, sampleColumnStart - boundsX); - target += factor * sample; + targetStart += kernelStart * sample; + + kernelStart = ref Unsafe.Add(ref kernelStart, 1); + sampleColumnStart = ref Unsafe.Add(ref 
sampleColumnStart, 1); } + targetStart = ref Unsafe.Add(ref targetStart, 1); sampleColumnBase = ref Unsafe.Add(ref sampleColumnBase, kernelSize); } @@ -307,37 +320,48 @@ private void Convolve3(int y, Span span) targetBuffer.Clear(); ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); - ref float kernelBase = ref this.kernel[0]; + ref float kernelStart = ref this.kernel[0]; + ref float kernelEnd = ref Unsafe.Add(ref kernelStart, kernelSize); Span sourceRow; - for (int kY = 0; kY < kernelSize; kY++) + while (Unsafe.IsAddressLessThan(ref kernelStart, ref kernelEnd)) { // Get the precalculated source sample row for this kernel row and copy to our buffer. - int sampleY = Unsafe.Add(ref sampleRowBase, kY); - sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + sourceRow = this.sourcePixels.GetRowSpan(sampleRowBase).Slice(boundsX, boundsWidth); PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); - float factor = Unsafe.Add(ref kernelBase, kY); + ref Vector4 sourceEnd = ref Unsafe.Add(ref sourceBase, sourceBuffer.Length); + ref Vector4 targetStart = ref targetBase; + float factor = kernelStart; - for (int x = 0; x < sourceBuffer.Length; x++) + while (Unsafe.IsAddressLessThan(ref sourceBase, ref sourceEnd)) { - ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - Vector4 sample = Unsafe.Add(ref sourceBase, x); + targetStart += factor * sourceBase; - target += factor * sample; + sourceBase = ref Unsafe.Add(ref sourceBase, 1); + targetStart = ref Unsafe.Add(ref targetStart, 1); } + + kernelStart = ref Unsafe.Add(ref kernelStart, 1); + sampleRowBase = ref Unsafe.Add(ref sampleRowBase, 1); } // Now we need to copy the original alpha values from the source row. 
sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth); PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); - - for (int x = 0; x < sourceRow.Length; x++) { - ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - target.W = Unsafe.Add(ref MemoryMarshal.GetReference(sourceBuffer), x).W; + ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); + ref Vector4 sourceEnd = ref Unsafe.Add(ref sourceBase, sourceBuffer.Length); + + while (Unsafe.IsAddressLessThan(ref sourceBase, ref sourceEnd)) + { + targetBase.W = sourceBase.W; + + targetBase = ref Unsafe.Add(ref targetBase, 1); + sourceBase = ref Unsafe.Add(ref sourceBase, 1); + } } Span targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth); @@ -361,28 +385,32 @@ private void Convolve4(int y, Span span) targetBuffer.Clear(); ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); - ref float kernelBase = ref this.kernel[0]; + ref float kernelStart = ref this.kernel[0]; + ref float kernelEnd = ref Unsafe.Add(ref kernelStart, kernelSize); - for (int kY = 0; kY < kernelSize; kY++) + Span sourceRow; + while (Unsafe.IsAddressLessThan(ref kernelStart, ref kernelEnd)) { // Get the precalculated source sample row for this kernel row and copy to our buffer. 
- int sampleY = Unsafe.Add(ref sampleRowBase, kY); - Span sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + sourceRow = this.sourcePixels.GetRowSpan(sampleRowBase).Slice(boundsX, boundsWidth); PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); - Numerics.Premultiply(sourceBuffer); - ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); - float factor = Unsafe.Add(ref kernelBase, kY); + ref Vector4 sourceEnd = ref Unsafe.Add(ref sourceBase, sourceBuffer.Length); + ref Vector4 targetStart = ref targetBase; + float factor = kernelStart; - for (int x = 0; x < sourceBuffer.Length; x++) + while (Unsafe.IsAddressLessThan(ref sourceBase, ref sourceEnd)) { - ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - Vector4 sample = Unsafe.Add(ref sourceBase, x); + targetStart += factor * sourceBase; - target += factor * sample; + sourceBase = ref Unsafe.Add(ref sourceBase, 1); + targetStart = ref Unsafe.Add(ref targetStart, 1); } + + kernelStart = ref Unsafe.Add(ref kernelStart, 1); + sampleRowBase = ref Unsafe.Add(ref sampleRowBase, 1); } Numerics.UnPremultiply(targetBuffer); From 979baf7ffa4627fc8dfb936110db9af3be43d08b Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 15 Dec 2020 22:49:32 +0100 Subject: [PATCH 10/12] More codegen improvements to bokeh blur --- .../Convolution/BokehBlurProcessor.cs | 26 ++++++++++++------- .../Convolution/BokehBlurProcessor{TPixel}.cs | 21 +++++++++------ 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs index 55cef5df54..13fe627d16 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs @@ -134,23 +134,29 @@ public void Invoke(int y) // The target buffer is zeroed initially and then it accumulates 
the results // of each partial convolution, so we don't have to clear it here as well ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(boundsX, y); - ref Complex64 kernelBase = ref this.kernel[0]; + ref Complex64 kernelStart = ref this.kernel[0]; + ref Complex64 kernelEnd = ref Unsafe.Add(ref kernelStart, kernelSize); - for (int kY = 0; kY < kernelSize; kY++) + while (Unsafe.IsAddressLessThan(ref kernelStart, ref kernelEnd)) { // Get the precalculated source sample row for this kernel row and copy to our buffer - int sampleY = Unsafe.Add(ref sampleRowBase, kY); - ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(0, sampleY); - Complex64 factor = Unsafe.Add(ref kernelBase, kY); + ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(0, sampleRowBase); + ref ComplexVector4 sourceEnd = ref Unsafe.Add(ref sourceBase, boundsWidth); + ref Vector4 targetStart = ref targetBase; + Complex64 factor = kernelStart; - for (int x = 0; x < boundsWidth; x++) + while (Unsafe.IsAddressLessThan(ref sourceBase, ref sourceEnd)) { - ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - ComplexVector4 sample = Unsafe.Add(ref sourceBase, x); - ComplexVector4 partial = factor * sample; + ComplexVector4 partial = factor * sourceBase; - target += partial.WeightedSum(this.z, this.w); + targetStart += partial.WeightedSum(this.z, this.w); + + sourceBase = ref Unsafe.Add(ref sourceBase, 1); + targetStart = ref Unsafe.Add(ref targetStart, 1); } + + kernelStart = ref Unsafe.Add(ref kernelStart, 1); + sampleRowBase = ref Unsafe.Add(ref sampleRowBase, 1); } } } diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs index a21155e10c..241ff9db28 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs @@ -233,32 
+233,37 @@ public void Invoke(int y, Span span) // Clear the target buffer for each row run Span targetBuffer = this.targetValues.GetRowSpan(y); targetBuffer.Clear(); - ref ComplexVector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); // Execute the bulk pixel format conversion for the current row Span sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth); PixelOperations.Instance.ToVector4(this.configuration, sourceRow, span); ref Vector4 sourceBase = ref MemoryMarshal.GetReference(span); + ref ComplexVector4 targetStart = ref MemoryMarshal.GetReference(targetBuffer); + ref ComplexVector4 targetEnd = ref Unsafe.Add(ref targetStart, span.Length); ref Complex64 kernelBase = ref this.kernel[0]; + ref Complex64 kernelEnd = ref Unsafe.Add(ref kernelBase, kernelSize); ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan()); - for (int x = 0; x < span.Length; x++) + while (Unsafe.IsAddressLessThan(ref targetStart, ref targetEnd)) { - ref ComplexVector4 target = ref Unsafe.Add(ref targetBase, x); + ref Complex64 kernelStart = ref kernelBase; + ref int sampleColumnStart = ref sampleColumnBase; - for (int kX = 0; kX < kernelSize; kX++) + while (Unsafe.IsAddressLessThan(ref kernelStart, ref kernelEnd)) { - int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; - Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); - Complex64 factor = Unsafe.Add(ref kernelBase, kX); + Vector4 sample = Unsafe.Add(ref sourceBase, sampleColumnStart - boundsX); - target.Sum(factor * sample); + targetStart.Sum(kernelStart * sample); + + kernelStart = ref Unsafe.Add(ref kernelStart, 1); + sampleColumnStart = ref Unsafe.Add(ref sampleColumnStart, 1); } // Shift the base column sampling reference by one row at the end of each outer // iteration so that the inner tight loop indexing can skip the multiplication sampleColumnBase = ref Unsafe.Add(ref sampleColumnBase, kernelSize); + targetStart = ref Unsafe.Add(ref targetStart, 1); 
} } } From 1a3e1e7a66a28d4f1bfd062649e8fabff0ab6beb Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 15 Dec 2020 23:16:26 +0100 Subject: [PATCH 11/12] More codegen improvements to shared methods --- .../ColorSpaces/Companding/SRgbCompanding.cs | 24 ++- src/ImageSharp/Common/Helpers/Numerics.cs | 171 ++++++++++-------- .../Utils/Vector4Converters.Default.cs | 50 ++--- 3 files changed, 138 insertions(+), 107 deletions(-) diff --git a/src/ImageSharp/ColorSpaces/Companding/SRgbCompanding.cs b/src/ImageSharp/ColorSpaces/Companding/SRgbCompanding.cs index 2e212ad19f..9a8b5f0a84 100644 --- a/src/ImageSharp/ColorSpaces/Companding/SRgbCompanding.cs +++ b/src/ImageSharp/ColorSpaces/Companding/SRgbCompanding.cs @@ -1,4 +1,4 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. using System; @@ -25,12 +25,14 @@ public static class SRgbCompanding [MethodImpl(InliningOptions.ShortMethod)] public static void Expand(Span vectors) { - ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); + ref Vector4 vectorsStart = ref MemoryMarshal.GetReference(vectors); + ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsStart, vectors.Length); - for (int i = 0; i < vectors.Length; i++) + while (Unsafe.IsAddressLessThan(ref vectorsStart, ref vectorsEnd)) { - ref Vector4 v = ref Unsafe.Add(ref baseRef, i); - Expand(ref v); + Expand(ref vectorsStart); + + vectorsStart = ref Unsafe.Add(ref vectorsStart, 1); } } @@ -41,12 +43,14 @@ public static void Expand(Span vectors) [MethodImpl(InliningOptions.ShortMethod)] public static void Compress(Span vectors) { - ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); + ref Vector4 vectorsStart = ref MemoryMarshal.GetReference(vectors); + ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsStart, vectors.Length); - for (int i = 0; i < vectors.Length; i++) + while (Unsafe.IsAddressLessThan(ref vectorsStart, ref vectorsEnd)) { - ref Vector4 v = ref Unsafe.Add(ref baseRef, i); 
- Compress(ref v); + Compress(ref vectorsStart); + + vectorsStart = ref Unsafe.Add(ref vectorsStart, 1); } } @@ -90,4 +94,4 @@ public static void Compress(ref Vector4 vector) [MethodImpl(InliningOptions.ShortMethod)] public static float Compress(float channel) => channel <= 0.0031308F ? 12.92F * channel : (1.055F * MathF.Pow(channel, 0.416666666666667F)) - 0.055F; } -} \ No newline at end of file +} diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index 56ab46c685..99d91168bb 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -41,13 +41,11 @@ public static int GreatestCommonDivisor(int a, int b) /// /// Determine the Least Common Multiple (LCM) of two numbers. + /// See https://en.wikipedia.org/wiki/Least_common_multiple#Reduction_by_the_greatest_common_divisor. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int LeastCommonMultiple(int a, int b) - { - // https://en.wikipedia.org/wiki/Least_common_multiple#Reduction_by_the_greatest_common_divisor - return (a / GreatestCommonDivisor(a, b)) * b; - } + => a / GreatestCommonDivisor(a, b) * b; /// /// Calculates % 2 @@ -290,10 +288,14 @@ public static void Clamp(Span span, byte min, byte max) if (remainder.Length > 0) { - for (int i = 0; i < remainder.Length; i++) + ref byte remainderStart = ref MemoryMarshal.GetReference(remainder); + ref byte remainderEnd = ref Unsafe.Add(ref remainderStart, remainder.Length); + + while (Unsafe.IsAddressLessThan(ref remainderStart, ref remainderEnd)) { - ref byte v = ref remainder[i]; - v = Clamp(v, min, max); + remainderStart = Clamp(remainderStart, min, max); + + remainderStart = ref Unsafe.Add(ref remainderStart, 1); } } } @@ -311,10 +313,14 @@ public static void Clamp(Span span, uint min, uint max) if (remainder.Length > 0) { - for (int i = 0; i < remainder.Length; i++) + ref uint remainderStart = ref MemoryMarshal.GetReference(remainder); + ref uint 
remainderEnd = ref Unsafe.Add(ref remainderStart, remainder.Length); + + while (Unsafe.IsAddressLessThan(ref remainderStart, ref remainderEnd)) { - ref uint v = ref remainder[i]; - v = Clamp(v, min, max); + remainderStart = Clamp(remainderStart, min, max); + + remainderStart = ref Unsafe.Add(ref remainderStart, 1); } } } @@ -332,10 +338,14 @@ public static void Clamp(Span span, int min, int max) if (remainder.Length > 0) { - for (int i = 0; i < remainder.Length; i++) + ref int remainderStart = ref MemoryMarshal.GetReference(remainder); + ref int remainderEnd = ref Unsafe.Add(ref remainderStart, remainder.Length); + + while (Unsafe.IsAddressLessThan(ref remainderStart, ref remainderEnd)) { - ref int v = ref remainder[i]; - v = Clamp(v, min, max); + remainderStart = Clamp(remainderStart, min, max); + + remainderStart = ref Unsafe.Add(ref remainderStart, 1); } } } @@ -353,10 +363,14 @@ public static void Clamp(Span span, float min, float max) if (remainder.Length > 0) { - for (int i = 0; i < remainder.Length; i++) + ref float remainderStart = ref MemoryMarshal.GetReference(remainder); + ref float remainderEnd = ref Unsafe.Add(ref remainderStart, remainder.Length); + + while (Unsafe.IsAddressLessThan(ref remainderStart, ref remainderEnd)) { - ref float v = ref remainder[i]; - v = Clamp(v, min, max); + remainderStart = Clamp(remainderStart, min, max); + + remainderStart = ref Unsafe.Add(ref remainderStart, 1); } } } @@ -374,10 +388,14 @@ public static void Clamp(Span span, double min, double max) if (remainder.Length > 0) { - for (int i = 0; i < remainder.Length; i++) + ref double remainderStart = ref MemoryMarshal.GetReference(remainder); + ref double remainderEnd = ref Unsafe.Add(ref remainderStart, remainder.Length); + + while (Unsafe.IsAddressLessThan(ref remainderStart, ref remainderEnd)) { - ref double v = ref remainder[i]; - v = Clamp(v, min, max); + remainderStart = Clamp(remainderStart, min, max); + + remainderStart = ref Unsafe.Add(ref remainderStart, 1); } } 
} @@ -472,10 +490,8 @@ public static void Premultiply(Span vectors) #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported && vectors.Length >= 2) { - ref Vector256 vectorsBase = - ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); - // Divide by 2 as 4 elements per Vector4 and 8 per Vector256 + ref Vector256 vectorsBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); ref Vector256 vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u)); while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast)) @@ -495,12 +511,14 @@ public static void Premultiply(Span vectors) else #endif { - ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); + ref Vector4 vectorsStart = ref MemoryMarshal.GetReference(vectors); + ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsStart, vectors.Length); - for (int i = 0; i < vectors.Length; i++) + while (Unsafe.IsAddressLessThan(ref vectorsStart, ref vectorsEnd)) { - ref Vector4 v = ref Unsafe.Add(ref baseRef, i); - Premultiply(ref v); + Premultiply(ref vectorsStart); + + vectorsStart = ref Unsafe.Add(ref vectorsStart, 1); } } } @@ -515,10 +533,8 @@ public static void UnPremultiply(Span vectors) #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported && vectors.Length >= 2) { - ref Vector256 vectorsBase = - ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); - // Divide by 2 as 4 elements per Vector4 and 8 per Vector256 + ref Vector256 vectorsBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); ref Vector256 vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u)); while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast)) @@ -538,12 +554,14 @@ public static void UnPremultiply(Span vectors) else #endif { - ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); + ref Vector4 vectorsStart = ref MemoryMarshal.GetReference(vectors); + ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsStart, vectors.Length); - for (int i = 0; i < 
vectors.Length; i++) + while (Unsafe.IsAddressLessThan(ref vectorsStart, ref vectorsEnd)) { - ref Vector4 v = ref Unsafe.Add(ref baseRef, i); - UnPremultiply(ref v); + UnPremultiply(ref vectorsStart); + + vectorsStart = ref Unsafe.Add(ref vectorsStart, 1); } } } @@ -633,53 +651,54 @@ public static unsafe void CubeRootOnXYZ(Span vectors) vectors128Ref = y4; vectors128Ref = ref Unsafe.Add(ref vectors128Ref, 1); } - - return; } + else #endif - ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors); - ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsRef, vectors.Length); - - // Fallback with scalar preprocessing and vectorized approximation steps - while (Unsafe.IsAddressLessThan(ref vectorsRef, ref vectorsEnd)) { - Vector4 v = vectorsRef; - - double - x64 = v.X, - y64 = v.Y, - z64 = v.Z; - float a = v.W; + ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors); + ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsRef, vectors.Length); - ulong - xl = *(ulong*)&x64, - yl = *(ulong*)&y64, - zl = *(ulong*)&z64; - - // Here we use a trick to compute the starting value x0 for the cube root. This is because doing - // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamme is 3 in this case, - // this means what we actually want is to find the cube root of our clamped values. - // For more info on the constant below, see: - // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. - // Here we perform the same trick on all RGB channels separately to help the CPU execute them in paralle, and - // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit - // register, and use it to accelerate two steps of the Newton approximation using SIMD. 
- xl = 0x2a9f8a7be393b600 + (xl / 3); - yl = 0x2a9f8a7be393b600 + (yl / 3); - zl = 0x2a9f8a7be393b600 + (zl / 3); - - Vector4 y4; - y4.X = (float)*(double*)&xl; - y4.Y = (float)*(double*)&yl; - y4.Z = (float)*(double*)&zl; - y4.W = 0; - - y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4))); - y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4))); - y4.W = a; - - vectorsRef = y4; - vectorsRef = ref Unsafe.Add(ref vectorsRef, 1); + // Fallback with scalar preprocessing and vectorized approximation steps + while (Unsafe.IsAddressLessThan(ref vectorsRef, ref vectorsEnd)) + { + Vector4 v = vectorsRef; + + double + x64 = v.X, + y64 = v.Y, + z64 = v.Z; + float a = v.W; + + ulong + xl = *(ulong*)&x64, + yl = *(ulong*)&y64, + zl = *(ulong*)&z64; + + // Here we use a trick to compute the starting value x0 for the cube root. This is because doing + // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamma is 3 in this case, + // this means what we actually want is to find the cube root of our clamped values. + // For more info on the constant below, see: + // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. + // Here we perform the same trick on all RGB channels separately to help the CPU execute them in parallel, and + // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit + // register, and use it to accelerate two steps of the Newton approximation using SIMD.
+ xl = 0x2a9f8a7be393b600 + (xl / 3); + yl = 0x2a9f8a7be393b600 + (yl / 3); + zl = 0x2a9f8a7be393b600 + (zl / 3); + + Vector4 y4; + y4.X = (float)*(double*)&xl; + y4.Y = (float)*(double*)&yl; + y4.Z = (float)*(double*)&zl; + y4.W = 0; + + y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4))); + y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4))); + y4.W = a; + + vectorsRef = y4; + vectorsRef = ref Unsafe.Add(ref vectorsRef, 1); + } } } } diff --git a/src/ImageSharp/PixelFormats/Utils/Vector4Converters.Default.cs b/src/ImageSharp/PixelFormats/Utils/Vector4Converters.Default.cs index 999f6325bc..6b6ff4319f 100644 --- a/src/ImageSharp/PixelFormats/Utils/Vector4Converters.Default.cs +++ b/src/ImageSharp/PixelFormats/Utils/Vector4Converters.Default.cs @@ -88,14 +88,16 @@ private static void UnsafeFromVector4Core( Span destPixels) where TPixel : unmanaged, IPixel { - ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors); + ref Vector4 sourceStart = ref MemoryMarshal.GetReference(sourceVectors); + ref Vector4 sourceEnd = ref Unsafe.Add(ref sourceStart, sourceVectors.Length); ref TPixel destRef = ref MemoryMarshal.GetReference(destPixels); - for (int i = 0; i < sourceVectors.Length; i++) + while (Unsafe.IsAddressLessThan(ref sourceStart, ref sourceEnd)) { - ref Vector4 sp = ref Unsafe.Add(ref sourceRef, i); - ref TPixel dp = ref Unsafe.Add(ref destRef, i); - dp.FromVector4(sp); + destRef.FromVector4(sourceStart); + + sourceStart = ref Unsafe.Add(ref sourceStart, 1); + destRef = ref Unsafe.Add(ref destRef, 1); } } @@ -105,14 +107,16 @@ private static void UnsafeToVector4Core( Span destVectors) where TPixel : unmanaged, IPixel { - ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourcePixels); + ref TPixel sourceStart = ref MemoryMarshal.GetReference(sourcePixels); + ref TPixel sourceEnd = ref Unsafe.Add(ref sourceStart, sourcePixels.Length); ref Vector4 destRef = ref MemoryMarshal.GetReference(destVectors); - for (int i = 0; i < sourcePixels.Length; i++) + while 
(Unsafe.IsAddressLessThan(ref sourceStart, ref sourceEnd)) { - ref TPixel sp = ref Unsafe.Add(ref sourceRef, i); - ref Vector4 dp = ref Unsafe.Add(ref destRef, i); - dp = sp.ToVector4(); + destRef = sourceStart.ToVector4(); + + sourceStart = ref Unsafe.Add(ref sourceStart, 1); + destRef = ref Unsafe.Add(ref destRef, 1); } } @@ -122,14 +126,16 @@ private static void UnsafeFromScaledVector4Core( Span destinationColors) where TPixel : unmanaged, IPixel { - ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors); + ref Vector4 sourceStart = ref MemoryMarshal.GetReference(sourceVectors); + ref Vector4 sourceEnd = ref Unsafe.Add(ref sourceStart, sourceVectors.Length); ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors); - for (int i = 0; i < sourceVectors.Length; i++) + while (Unsafe.IsAddressLessThan(ref sourceStart, ref sourceEnd)) { - ref Vector4 sp = ref Unsafe.Add(ref sourceRef, i); - ref TPixel dp = ref Unsafe.Add(ref destRef, i); - dp.FromScaledVector4(sp); + destRef.FromScaledVector4(sourceStart); + + sourceStart = ref Unsafe.Add(ref sourceStart, 1); + destRef = ref Unsafe.Add(ref destRef, 1); } } @@ -139,16 +145,18 @@ private static void UnsafeToScaledVector4Core( Span destinationVectors) where TPixel : unmanaged, IPixel { - ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors); + ref TPixel sourceStart = ref MemoryMarshal.GetReference(sourceColors); + ref TPixel sourceEnd = ref Unsafe.Add(ref sourceStart, sourceColors.Length); ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors); - for (int i = 0; i < sourceColors.Length; i++) + while (Unsafe.IsAddressLessThan(ref sourceStart, ref sourceEnd)) { - ref TPixel sp = ref Unsafe.Add(ref sourceRef, i); - ref Vector4 dp = ref Unsafe.Add(ref destRef, i); - dp = sp.ToScaledVector4(); + destRef = sourceStart.ToScaledVector4(); + + sourceStart = ref Unsafe.Add(ref sourceStart, 1); + destRef = ref Unsafe.Add(ref destRef, 1); } } } } -} \ No newline at end of 
file +} From 56015595d760f58ce8c5ce9720eced40b6ae86a1 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 15 Dec 2020 23:52:44 +0100 Subject: [PATCH 12/12] Codegen improvements to Numerics.Clamp --- src/ImageSharp/Common/Helpers/Numerics.cs | 27 +++++++++++++++-------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index 99d91168bb..b9ccfafe0e 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -425,7 +425,6 @@ private static void ClampImpl(Span span, T min, T max) where T : unmanaged { ref T sRef = ref MemoryMarshal.GetReference(span); - ref Vector vsBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(span)); var vmin = new Vector(min); var vmax = new Vector(max); @@ -433,25 +432,35 @@ private static void ClampImpl(Span span, T min, T max) int m = Modulo4(n); int u = n - m; - for (int i = 0; i < u; i += 4) - { - ref Vector vs0 = ref Unsafe.Add(ref vsBase, i); - ref Vector vs1 = ref Unsafe.Add(ref vs0, 1); - ref Vector vs2 = ref Unsafe.Add(ref vs0, 2); - ref Vector vs3 = ref Unsafe.Add(ref vs0, 3); + ref Vector vs0 = ref Unsafe.As>(ref MemoryMarshal.GetReference(span)); + ref Vector vs1 = ref Unsafe.Add(ref vs0, 1); + ref Vector vs2 = ref Unsafe.Add(ref vs0, 2); + ref Vector vs3 = ref Unsafe.Add(ref vs0, 3); + ref Vector vsEnd = ref Unsafe.Add(ref vs0, u); + while (Unsafe.IsAddressLessThan(ref vs0, ref vsEnd)) + { vs0 = Vector.Min(Vector.Max(vmin, vs0), vmax); vs1 = Vector.Min(Vector.Max(vmin, vs1), vmax); vs2 = Vector.Min(Vector.Max(vmin, vs2), vmax); vs3 = Vector.Min(Vector.Max(vmin, vs3), vmax); + + vs0 = ref Unsafe.Add(ref vs0, 4); + vs1 = ref Unsafe.Add(ref vs1, 4); + vs2 = ref Unsafe.Add(ref vs2, 4); + vs3 = ref Unsafe.Add(ref vs3, 4); } if (m > 0) { - for (int i = u; i < n; i++) + vs0 = ref vsEnd; + vsEnd = ref Unsafe.Add(ref vsEnd, m); + + while (Unsafe.IsAddressLessThan(ref vs0, ref vsEnd)) { - ref 
Vector vs0 = ref Unsafe.Add(ref vsBase, i); vs0 = Vector.Min(Vector.Max(vmin, vs0), vmax); + + vs0 = ref Unsafe.Add(ref vs0, 1); } } }