Skip to content

Commit

Permalink
Add optimized paths for default gamma exposure
Browse files Browse the repository at this point in the history
  • Loading branch information
Sergio0694 committed Dec 14, 2020
1 parent 2e53a44 commit e4ba017
Showing 1 changed file with 159 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -80,26 +80,46 @@ protected override void OnFrameApply(ImageFrame<TPixel> source)
var sourceRectangle = Rectangle.Intersect(this.SourceRectangle, source.Bounds());

// Preliminary gamma highlight pass
var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
this.Configuration,
sourceRectangle,
in gammaOperation);
if (this.gamma == 3F)
{
var gammaOperation = new ApplyGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration);
ParallelRowIterator.IterateRows<ApplyGamma3ExposureRowOperation, Vector4>(
this.Configuration,
sourceRectangle,
in gammaOperation);
}
else
{
var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
this.Configuration,
sourceRectangle,
in gammaOperation);
}

// Create a 0-filled buffer to use to store the result of the component convolutions
using Buffer2D<Vector4> processingBuffer = this.Configuration.MemoryAllocator.Allocate2D<Vector4>(source.Size(), AllocationOptions.Clean);

// Perform the 1D convolutions on all the kernel components and accumulate the results
this.OnFrameApplyCore(source, sourceRectangle, this.Configuration, processingBuffer);

float inverseGamma = 1 / this.gamma;

// Apply the inverse gamma exposure pass, and write the final pixel data
var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma);
ParallelRowIterator.IterateRows(
this.Configuration,
sourceRectangle,
in operation);
if (this.gamma == 3F)
{
var operation = new ApplyInverseGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration);
ParallelRowIterator.IterateRows(
this.Configuration,
sourceRectangle,
in operation);
}
else
{
var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, 1 / this.gamma);
ParallelRowIterator.IterateRows(
this.Configuration,
sourceRectangle,
in operation);
}
}

/// <summary>
Expand Down Expand Up @@ -286,6 +306,56 @@ public void Invoke(int y, Span<Vector4> span)
}
}

/// <summary>
/// A <see langword="struct"/> implementing the 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
/// </summary>
private readonly struct ApplyGamma3ExposureRowOperation : IRowOperation<Vector4>
{
private readonly Rectangle bounds;
private readonly Buffer2D<TPixel> targetPixels;
private readonly Configuration configuration;

[MethodImpl(InliningOptions.ShortMethod)]
public ApplyGamma3ExposureRowOperation(
Rectangle bounds,
Buffer2D<TPixel> targetPixels,
Configuration configuration)
{
this.bounds = bounds;
this.targetPixels = targetPixels;
this.configuration = configuration;
}

/// <inheritdoc/>
[MethodImpl(InliningOptions.ShortMethod)]
public void Invoke(int y, Span<Vector4> span)
{
Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply);
ref Vector4 baseRef = ref MemoryMarshal.GetReference(span);

for (int x = 0; x < this.bounds.Width; x++)
{
ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x);
Vector4 v = pixel4;
float a = v.W;

// Fast path for the default gamma exposure, which is 3. In this case we can skip
// calling Math.Pow 3 times (one per component), as the method is an internal call and
// introduces quite a bit of overhead. Instead, we can just manually multiply the whole
// pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
// back to the target index in the temporary span. The whole iteration will get completely
// inlined and traslated into vectorized instructions, with much better performance.
v = v * v * v;
v.W = a;

pixel4 = v;
}

PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
}
}

/// <summary>
/// A <see langword="struct"/> implementing the inverse gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
/// </summary>
Expand Down Expand Up @@ -335,5 +405,82 @@ public void Invoke(int y)
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
}
}

/// <summary>
/// A <see langword="struct"/> implementing the inverse 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
/// </summary>
private readonly struct ApplyInverseGamma3ExposureRowOperation : IRowOperation
{
private readonly Rectangle bounds;
private readonly Buffer2D<TPixel> targetPixels;
private readonly Buffer2D<Vector4> sourceValues;
private readonly Configuration configuration;

[MethodImpl(InliningOptions.ShortMethod)]
public ApplyInverseGamma3ExposureRowOperation(
Rectangle bounds,
Buffer2D<TPixel> targetPixels,
Buffer2D<Vector4> sourceValues,
Configuration configuration)
{
this.bounds = bounds;
this.targetPixels = targetPixels;
this.sourceValues = sourceValues;
this.configuration = configuration;
}

/// <inheritdoc/>
[MethodImpl(InliningOptions.ShortMethod)]
public unsafe void Invoke(int y)
{
Vector4 low = Vector4.Zero;
var high = new Vector4(float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity);

Span<TPixel> targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
Span<Vector4> sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X);
ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);

for (int x = 0; x < this.bounds.Width; x++)
{
ref Vector4 v = ref Unsafe.Add(ref sourceRef, x);
Vector4 clamp = Numerics.Clamp(v, low, high);

double
x64 = clamp.X,
y64 = clamp.Y,
z64 = clamp.Z;
float a = clamp.W;

ulong
xl = *(ulong*)&x64,
yl = *(ulong*)&y64,
zl = *(ulong*)&z64;

// Here we use a trick to compute the starting value x0 for the cube root. This is because doing pow(x, 1 / gamma) is the same as the gamma-th root
// of x, and since gamme is 3 in this case, this means what we actually want is to find the cube root of our clamped values. For more info on the
// constant below, see https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. Here we perform
// the same trick on all RGB channels separately to help the CPU execute them in paralle, and store the alpha channel to preserve it. Then we set
// these values to the fields of a temporary 128-bit register, and use it to accelerate two steps of the Newton approximation using SIMD.
// As a note for possible future improvements, we should come up with a good bitmask to perform the x0 approximation directly on float values.
xl = 0x2a9f8a7be393b600 + (xl / 3);
yl = 0x2a9f8a7be393b600 + (yl / 3);
zl = 0x2a9f8a7be393b600 + (zl / 3);

Vector4 y4;
y4.X = (float)*(double*)&xl;
y4.Y = (float)*(double*)&yl;
y4.Z = (float)*(double*)&zl;
y4.W = 0;

y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));
y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));
y4.W = a;

v = y4;
}

PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
}
}
}
}

0 comments on commit e4ba017

Please sign in to comment.