Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize bokeh blur convolution #1475

Merged
merged 22 commits into from
Dec 15, 2020
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
efeb1c4
Add BokehBlur benchmark
Sergio0694 Dec 12, 2020
d8453bc
Minor code refactoring to improve flexibility
Sergio0694 Dec 12, 2020
faa1ad1
Switched bokeh blur to optimized pipeline
Sergio0694 Dec 12, 2020
6bb300a
Specialize bokeh blur operations for 1D kernels
Sergio0694 Dec 12, 2020
c72a3bb
Minor code tweaks
Sergio0694 Dec 12, 2020
04d66a9
Restore temporary changes
Sergio0694 Dec 12, 2020
b2f397d
Remove unnecessary code
Sergio0694 Dec 12, 2020
3180ec4
Fix gamma processing out of image bounds
Sergio0694 Dec 12, 2020
7db1225
Fix blur processing when constrained to region
Sergio0694 Dec 12, 2020
50f9716
Fix NullReferenceException in KernelSamplingMap.Dispose
Sergio0694 Dec 12, 2020
9fca028
Remove allocation constrained test for bokeh blur
Sergio0694 Dec 13, 2020
2e53a44
Remove unnecessary offset indirections
Sergio0694 Dec 13, 2020
e4ba017
Add optimized paths for default gamma exposure
Sergio0694 Dec 14, 2020
cb18c58
Switch to vectorized clamping
Sergio0694 Dec 14, 2020
5c89d0a
Initial vectorized cube root implementation
Sergio0694 Dec 14, 2020
442e467
Fix vectorized cube root on x86-64 with no SSE41
Sergio0694 Dec 14, 2020
c38fc81
Minor codegen tweaks
Sergio0694 Dec 14, 2020
0d27e04
Add discontiguous buffers and intrinsics tests
JimBobSquarePants Dec 14, 2020
8ab5e6f
Fix feature test runner
JimBobSquarePants Dec 14, 2020
c52112c
Switch to explicit SSE Newton approximations
Sergio0694 Dec 14, 2020
9142e72
Add FMA support, more SSE optimizations
Sergio0694 Dec 14, 2020
a8cae3f
Add more codegen improvements
Sergio0694 Dec 14, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 0 additions & 109 deletions src/ImageSharp/Common/Helpers/Buffer2DUtils.cs

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Advanced;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
Expand Down Expand Up @@ -91,31 +92,30 @@ public IImageProcessor<TPixel> CreatePixelSpecificProcessor<TPixel>(Configuratio
/// it is actually used, because it does not use any generic parameters internally. Defining in a non-generic class means that there will only
/// ever be a single instantiation of this type for the JIT/AOT compilers to process, instead of having duplicate versions for each pixel type.
/// </remarks>
internal readonly struct ApplyHorizontalConvolutionRowOperation : IRowOperation
internal readonly struct SecondPassConvolutionRowOperation : IRowOperation
{
private readonly Rectangle bounds;
private readonly Buffer2D<Vector4> targetValues;
private readonly Buffer2D<ComplexVector4> sourceValues;
private readonly KernelSamplingMap map;
private readonly Complex64[] kernel;
private readonly float z;
private readonly float w;
private readonly int maxY;
private readonly int maxX;

[MethodImpl(InliningOptions.ShortMethod)]
public ApplyHorizontalConvolutionRowOperation(
public SecondPassConvolutionRowOperation(
Rectangle bounds,
Buffer2D<Vector4> targetValues,
Buffer2D<ComplexVector4> sourceValues,
KernelSamplingMap map,
Complex64[] kernel,
float z,
float w)
{
this.bounds = bounds;
this.maxY = this.bounds.Bottom - 1;
this.maxX = this.bounds.Right - 1;
this.targetValues = targetValues;
this.sourceValues = sourceValues;
this.map = map;
this.kernel = kernel;
this.z = z;
this.w = w;
Expand All @@ -125,11 +125,36 @@ public ApplyHorizontalConvolutionRowOperation(
[MethodImpl(InliningOptions.ShortMethod)]
public void Invoke(int y)
{
Span<Vector4> targetRowSpan = this.targetValues.GetRowSpan(y).Slice(this.bounds.X);
int boundsX = this.bounds.X;
int boundsWidth = this.bounds.Width;
int kernelSize = this.kernel.Length;

for (int x = 0; x < this.bounds.Width; x++)
Span<int> rowOffsets = this.map.GetRowOffsetSpan();
Span<int> columnOffsets = this.map.GetColumnOffsetSpan();
ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), (y - this.bounds.Y) * kernelSize);
ref int sampleColumnBase = ref MemoryMarshal.GetReference(columnOffsets);

// The target buffer is zeroed initially and then it accumulates the results
// of each partial convolution, so we don't have to clear it here as well
ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(boundsX, y);
ref Complex64 kernelBase = ref this.kernel[0];

for (int kY = 0; kY < kernelSize; kY++)
{
Buffer2DUtils.Convolve4AndAccumulatePartials(this.kernel, this.sourceValues, targetRowSpan, y, x, this.bounds.Y, this.maxY, this.bounds.X, this.maxX, this.z, this.w);
// Get the precalculated source sample row for this kernel row, along with the matching kernel factor
int sampleY = Unsafe.Add(ref sampleRowBase, kY);
ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(0, sampleY);
Complex64 factor = Unsafe.Add(ref kernelBase, kY);

for (int x = 0; x < boundsWidth; x++)
{
int sampleX = Unsafe.Add(ref sampleColumnBase, x) - boundsX;
ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
ComplexVector4 sample = Unsafe.Add(ref sourceBase, sampleX);
ComplexVector4 partial = factor * sample;

target += partial.WeightedSum(this.z, this.w);
}
}
}
}
Expand Down
Loading