Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize bokeh blur convolution #1475

Merged
merged 22 commits into from
Dec 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
efeb1c4
Add BokehBlur benchmark
Sergio0694 Dec 12, 2020
d8453bc
Minor code refactoring to improve flexibility
Sergio0694 Dec 12, 2020
faa1ad1
Switched bokeh blur to optimized pipeline
Sergio0694 Dec 12, 2020
6bb300a
Specialize bokeh blur operations for 1D kernels
Sergio0694 Dec 12, 2020
c72a3bb
Minor code tweaks
Sergio0694 Dec 12, 2020
04d66a9
Restore temporary changes
Sergio0694 Dec 12, 2020
b2f397d
Remove unnecessary code
Sergio0694 Dec 12, 2020
3180ec4
Fix gamma processing out of image bounds
Sergio0694 Dec 12, 2020
7db1225
Fix blur processing when constrained to region
Sergio0694 Dec 12, 2020
50f9716
Fix NullReferenceException in KernelSamplingMap.Dispose
Sergio0694 Dec 12, 2020
9fca028
Remove allocation constrained test for bokeh blur
Sergio0694 Dec 13, 2020
2e53a44
Remove unnecessary offset indirections
Sergio0694 Dec 13, 2020
e4ba017
Add optimized paths for default gamma exposure
Sergio0694 Dec 14, 2020
cb18c58
Switch to vectorized clamping
Sergio0694 Dec 14, 2020
5c89d0a
Initial vectorized cube root implementation
Sergio0694 Dec 14, 2020
442e467
Fix vectorized cube root on x86-64 with no SSE41
Sergio0694 Dec 14, 2020
c38fc81
Minor codegen tweaks
Sergio0694 Dec 14, 2020
0d27e04
Add discontigous buffers and intrinsics tests
JimBobSquarePants Dec 14, 2020
8ab5e6f
Fix feature test runner
JimBobSquarePants Dec 14, 2020
c52112c
Switch to explicit SSE Newton approximations
Sergio0694 Dec 14, 2020
9142e72
Add FMA support, more SSE optimizations
Sergio0694 Dec 14, 2020
a8cae3f
Add more codegen improvements
Sergio0694 Dec 14, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 0 additions & 109 deletions src/ImageSharp/Common/Helpers/Buffer2DUtils.cs

This file was deleted.

135 changes: 135 additions & 0 deletions src/ImageSharp/Common/Helpers/Numerics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -547,5 +547,140 @@ public static void UnPremultiply(Span<Vector4> vectors)
}
}
}

/// <summary>
/// Calculates the cube pow of all the XYZ channels of the input vectors.
/// </summary>
/// <param name="vectors">The span of vectors</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe void CubePowOnXYZ(Span<Vector4> vectors)
{
ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
ref Vector4 endRef = ref Unsafe.Add(ref baseRef, vectors.Length);

while (Unsafe.IsAddressLessThan(ref baseRef, ref endRef))
{
Vector4 v = baseRef;
float a = v.W;

// Fast path for the default gamma exposure, which is 3. In this case we can skip
// calling Math.Pow 3 times (one per component), as the method is an internal call and
// introduces quite a bit of overhead. Instead, we can just manually multiply the whole
// pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
// back to the target index in the temporary span. The whole iteration will get completely
// inlined and traslated into vectorized instructions, with much better performance.
v = v * v * v;
v.W = a;

baseRef = v;
baseRef = ref Unsafe.Add(ref baseRef, 1);
}
}

/// <summary>
/// Calculates the cube root of all the XYZ channels of the input vectors.
/// </summary>
/// <param name="vectors">The span of vectors</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe void CubeRootOnXYZ(Span<Vector4> vectors)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse41.IsSupported)
{
ref Vector128<float> vectors128Ref = ref Unsafe.As<Vector4, Vector128<float>>(ref MemoryMarshal.GetReference(vectors));
ref Vector128<float> vectors128End = ref Unsafe.Add(ref vectors128Ref, vectors.Length);

var v128_341 = Vector128.Create(341);
Vector128<int> v128_negativeZero = Vector128.Create(-0.0f).AsInt32();
Vector128<int> v128_one = Vector128.Create(1.0f).AsInt32();

var v128_13rd = Vector128.Create(1 / 3f);
var v128_23rds = Vector128.Create(2 / 3f);

while (Unsafe.IsAddressLessThan(ref vectors128Ref, ref vectors128End))
{
Vector128<float> vecx = vectors128Ref;
Vector128<int> veax = vecx.AsInt32();

// If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it
// directly on 32 bit floating point values. What follows is a vectorized implementation of this method:
// https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html.
// Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here
// using a different succession (the same used below), which should be less unstable due to not having cube pow.
veax = Sse2.AndNot(v128_negativeZero, veax);
veax = Sse2.Subtract(veax, v128_one);
veax = Sse2.ShiftRightArithmetic(veax, 10);
veax = Sse41.MultiplyLow(veax, v128_341);
veax = Sse2.Add(veax, v128_one);
veax = Sse2.AndNot(v128_negativeZero, veax);
veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_negativeZero));

Vector128<float> y4 = veax.AsSingle();

if (Fma.IsSupported)
{
y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
}
else
{
y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
}

y4 = Sse41.Insert(y4, vecx, 0xF0);

vectors128Ref = y4;
vectors128Ref = ref Unsafe.Add(ref vectors128Ref, 1);
}

return;
}
#endif
ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors);
ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsRef, vectors.Length);

// Fallback with scalar preprocessing and vectorized approximation steps
while (Unsafe.IsAddressLessThan(ref vectorsRef, ref vectorsEnd))
{
Vector4 v = vectorsRef;

double
x64 = v.X,
y64 = v.Y,
z64 = v.Z;
float a = v.W;

ulong
xl = *(ulong*)&x64,
yl = *(ulong*)&y64,
zl = *(ulong*)&z64;

// Here we use a trick to compute the starting value x0 for the cube root. This is because doing
// pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamme is 3 in this case,
// this means what we actually want is to find the cube root of our clamped values.
// For more info on the constant below, see:
// https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
// Here we perform the same trick on all RGB channels separately to help the CPU execute them in paralle, and
// store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
// register, and use it to accelerate two steps of the Newton approximation using SIMD.
xl = 0x2a9f8a7be393b600 + (xl / 3);
yl = 0x2a9f8a7be393b600 + (yl / 3);
zl = 0x2a9f8a7be393b600 + (zl / 3);

Vector4 y4;
y4.X = (float)*(double*)&xl;
y4.Y = (float)*(double*)&yl;
y4.Z = (float)*(double*)&zl;
y4.W = 0;

y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
y4.W = a;

vectorsRef = y4;
vectorsRef = ref Unsafe.Add(ref vectorsRef, 1);
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Advanced;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
Expand Down Expand Up @@ -91,31 +92,30 @@ public IImageProcessor<TPixel> CreatePixelSpecificProcessor<TPixel>(Configuratio
/// it is actually used, because it does not use any generic parameters internally. Defining in a non-generic class means that there will only
/// ever be a single instantiation of this type for the JIT/AOT compilers to process, instead of having duplicate versions for each pixel type.
/// </remarks>
internal readonly struct ApplyHorizontalConvolutionRowOperation : IRowOperation
internal readonly struct SecondPassConvolutionRowOperation : IRowOperation
{
private readonly Rectangle bounds;
private readonly Buffer2D<Vector4> targetValues;
private readonly Buffer2D<ComplexVector4> sourceValues;
private readonly KernelSamplingMap map;
private readonly Complex64[] kernel;
private readonly float z;
private readonly float w;
private readonly int maxY;
private readonly int maxX;

[MethodImpl(InliningOptions.ShortMethod)]
public ApplyHorizontalConvolutionRowOperation(
public SecondPassConvolutionRowOperation(
Rectangle bounds,
Buffer2D<Vector4> targetValues,
Buffer2D<ComplexVector4> sourceValues,
KernelSamplingMap map,
Complex64[] kernel,
float z,
float w)
{
this.bounds = bounds;
this.maxY = this.bounds.Bottom - 1;
this.maxX = this.bounds.Right - 1;
this.targetValues = targetValues;
this.sourceValues = sourceValues;
this.map = map;
this.kernel = kernel;
this.z = z;
this.w = w;
Expand All @@ -125,11 +125,33 @@ public ApplyHorizontalConvolutionRowOperation(
[MethodImpl(InliningOptions.ShortMethod)]
public void Invoke(int y)
{
Span<Vector4> targetRowSpan = this.targetValues.GetRowSpan(y).Slice(this.bounds.X);
int boundsX = this.bounds.X;
int boundsWidth = this.bounds.Width;
int kernelSize = this.kernel.Length;

for (int x = 0; x < this.bounds.Width; x++)
Span<int> rowOffsets = this.map.GetRowOffsetSpan();
ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), (y - this.bounds.Y) * kernelSize);

// The target buffer is zeroed initially and then it accumulates the results
// of each partial convolution, so we don't have to clear it here as well
ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(boundsX, y);
ref Complex64 kernelBase = ref this.kernel[0];

for (int kY = 0; kY < kernelSize; kY++)
{
Buffer2DUtils.Convolve4AndAccumulatePartials(this.kernel, this.sourceValues, targetRowSpan, y, x, this.bounds.Y, this.maxY, this.bounds.X, this.maxX, this.z, this.w);
// Get the precalculated source sample row for this kernel row and copy to our buffer
int sampleY = Unsafe.Add(ref sampleRowBase, kY);
ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(0, sampleY);
Complex64 factor = Unsafe.Add(ref kernelBase, kY);

for (int x = 0; x < boundsWidth; x++)
{
ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
ComplexVector4 sample = Unsafe.Add(ref sourceBase, x);
ComplexVector4 partial = factor * sample;

target += partial.WeightedSum(this.z, this.w);
}
}
}
}
Expand Down
Loading