From c1e6d5071260d3d36fd97fd4461f220db86992c2 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 23 Oct 2020 14:03:33 +0100 Subject: [PATCH 1/5] Add AVX2 implementation --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 2 +- .../ColorConverters/JpegColorConverter.cs | 130 ++++++++++++------ .../Codecs/Jpeg/Vector4OctetPack.cs | 40 ++++++ .../Config.HwIntrinsics.cs | 4 +- 4 files changed, 134 insertions(+), 42 deletions(-) create mode 100644 tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 2fe2f99ac6..a51c21b37f 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -14,7 +14,7 @@ internal static partial class SimdUtils { public static class HwIntrinsics { - private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; + public static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; /// /// as many elements as possible, slicing them down (keeping the remainder). 
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index f68bca0412..f2a1c1e91e 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -4,7 +4,12 @@ using System; using System.Collections.Generic; using System.Numerics; - +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.Tuples; @@ -190,45 +195,90 @@ internal struct Vector4Octet /// public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b) { - this.V0.X = r.A.X; - this.V0.Y = g.A.X; - this.V0.Z = b.A.X; - this.V0.W = 1f; - - this.V1.X = r.A.Y; - this.V1.Y = g.A.Y; - this.V1.Z = b.A.Y; - this.V1.W = 1f; - - this.V2.X = r.A.Z; - this.V2.Y = g.A.Z; - this.V2.Z = b.A.Z; - this.V2.W = 1f; - - this.V3.X = r.A.W; - this.V3.Y = g.A.W; - this.V3.Z = b.A.W; - this.V3.W = 1f; - - this.V4.X = r.B.X; - this.V4.Y = g.B.X; - this.V4.Z = b.B.X; - this.V4.W = 1f; - - this.V5.X = r.B.Y; - this.V5.Y = g.B.Y; - this.V5.Z = b.B.Y; - this.V5.W = 1f; - - this.V6.X = r.B.Z; - this.V6.Y = g.B.Z; - this.V6.Z = b.B.Z; - this.V6.W = 1f; - - this.V7.X = r.B.W; - this.V7.Y = g.B.W; - this.V7.Z = b.B.W; - this.V7.W = 1f; +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + Vector4 vo = Vector4.One; + Vector128 valpha = Unsafe.As>(ref vo); + + ref byte control = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskDeinterleave8x32); + Vector256 vcontrol = Unsafe.As>(ref control); + + Vector256 r0 = Avx.InsertVector128( + Unsafe.As>(ref r.A).ToVector256(), + Unsafe.As>(ref g.A), + 1); + + Vector256 r1 = Avx.InsertVector128( + Unsafe.As>(ref 
b.A).ToVector256(), + valpha, + 1); + + Vector256 r2 = Avx.InsertVector128( + Unsafe.As>(ref r.B).ToVector256(), + Unsafe.As>(ref g.B), + 1); + + Vector256 r3 = Avx.InsertVector128( + Unsafe.As>(ref b.B).ToVector256(), + valpha, + 1); + + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackHigh(r0, r1); + + Unsafe.As>(ref this.V0) = Avx2.PermuteVar8x32(t0, vcontrol); + Unsafe.As>(ref this.V2) = Avx2.PermuteVar8x32(t2, vcontrol); + + Vector256 t4 = Avx.UnpackLow(r2, r3); + Vector256 t6 = Avx.UnpackHigh(r2, r3); + + Unsafe.As>(ref this.V4) = Avx2.PermuteVar8x32(t4, vcontrol); + Unsafe.As>(ref this.V6) = Avx2.PermuteVar8x32(t6, vcontrol); + } + else +#endif + { + this.V0.X = r.A.X; + this.V0.Y = g.A.X; + this.V0.Z = b.A.X; + this.V0.W = 1f; + + this.V1.X = r.A.Y; + this.V1.Y = g.A.Y; + this.V1.Z = b.A.Y; + this.V1.W = 1f; + + this.V2.X = r.A.Z; + this.V2.Y = g.A.Z; + this.V2.Z = b.A.Z; + this.V2.W = 1f; + + this.V3.X = r.A.W; + this.V3.Y = g.A.W; + this.V3.Z = b.A.W; + this.V3.W = 1f; + + this.V4.X = r.B.X; + this.V4.Y = g.B.X; + this.V4.Z = b.B.X; + this.V4.W = 1f; + + this.V5.X = r.B.Y; + this.V5.Y = g.B.Y; + this.V5.Z = b.B.Y; + this.V5.W = 1f; + + this.V6.X = r.B.Z; + this.V6.Y = g.B.Z; + this.V6.Z = b.B.Z; + this.V6.W = 1f; + + this.V7.X = r.B.W; + this.V7.Y = g.B.W; + this.V7.Z = b.B.W; + this.V7.W = 1f; + } } } } diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs new file mode 100644 index 0000000000..a7ea771988 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs @@ -0,0 +1,40 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +using System.Numerics; +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Tuples; +using static SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters.JpegColorConverter; + +namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg +{ + [Config(typeof(Config.HwIntrinsics_SSE_AVX))] + public class Vector4OctetPack + { + private static Vector4Pair r = new Vector4Pair + { + A = new Vector4(1, 2, 3, 4), + B = new Vector4(5, 6, 7, 8) + }; + + private static Vector4Pair g = new Vector4Pair + { + A = new Vector4(9, 10, 11, 12), + B = new Vector4(13, 14, 15, 16) + }; + + private static Vector4Pair b = new Vector4Pair + { + A = new Vector4(17, 18, 19, 20), + B = new Vector4(21, 22, 23, 24) + }; + + [Benchmark] + public void Pack() + { + Vector4Octet v = default; + + v.Pack(ref r, ref g, ref b); + } + } +} diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs index e860c5491f..e8a06bf24e 100644 --- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs +++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs @@ -73,7 +73,9 @@ public HwIntrinsics_SSE_AVX() } #endif this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) - .WithEnvironmentVariables(new EnvironmentVariable(EnableHWIntrinsic, Off)) + .WithEnvironmentVariables( + new EnvironmentVariable(EnableHWIntrinsic, Off), + new EnvironmentVariable(FeatureSIMD, Off)) .WithId("No HwIntrinsics")); } } From ebfd069591cef36303a6cd2c3979574a8eab1064 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 23 Oct 2020 16:54:10 +0100 Subject: [PATCH 2/5] Use HW color conversion --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 23 +++ .../JpegColorConverter.FromYCbCrSimdAvx2.cs | 74 ++++++++- .../ColorConverters/JpegColorConverter.cs | 156 +++++++++--------- .../Codecs/Jpeg/Vector4OctetPack.cs | 40 ----- 4 files changed, 174 insertions(+), 119 deletions(-) delete mode 100644 tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs diff 
--git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index a51c21b37f..c5a7f5e909 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -16,6 +16,29 @@ public static class HwIntrinsics { public static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; + /// + /// Performs a multiplication and an addition of the . + /// + /// The vector to add to the intermediate result. + /// The first vector to multiply. + /// The second vector to multiply. + /// The . + [MethodImpl(InliningOptions.ShortMethod)] + public static Vector256 MultiplyAdd( + in Vector256 va, + in Vector256 vm0, + in Vector256 vm1) + { + if (Fma.IsSupported) + { + return Fma.MultiplyAdd(vm1, vm0, va); + } + else + { + return Avx.Add(Avx.Multiply(vm0, vm1), va); + } + } + /// /// as many elements as possible, slicing them down (keeping the remainder). /// diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs index c4d1408a2e..8c34baa1dc 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs @@ -1,11 +1,15 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. 
using System; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; - +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using static SixLabors.ImageSharp.SimdUtils; +#endif using SixLabors.ImageSharp.Tuples; // ReSharper disable ImpureMethodCallOnReadonlyValueField @@ -47,6 +51,71 @@ internal static void ConvertCore(in ComponentValues values, Span result "JpegColorConverter.FromYCbCrSimd256 can be used only on architecture having 256 byte floating point SIMD registers!"); } +#if SUPPORTS_RUNTIME_INTRINSICS + ref Vector256 yBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector256 cbBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector256 crBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + + ref Vector4Octet resultBase = + ref Unsafe.As(ref MemoryMarshal.GetReference(result)); + + // Used for the color conversion + var chromaOffset = Vector256.Create(-halfValue); + var scale = Vector256.Create(1 / maxValue); + var rCrMult = Vector256.Create(1.402F); + var gCbMult = Vector256.Create(0.344136F); + var gCrMult = Vector256.Create(0.714136F); + var bCbMult = Vector256.Create(1.772F); + + // Used for packing. 
+ Vector4 vo = Vector4.One; + Vector128 valpha = Unsafe.As>(ref vo); + ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskDeinterleave8x32); + Vector256 vcontrol = Unsafe.As>(ref control); + + Vector4Pair rr = default; + Vector4Pair gg = default; + Vector4Pair bb = default; + + ref Vector256 rrRefAsVector = ref Unsafe.As>(ref rr); + ref Vector256 ggRefAsVector = ref Unsafe.As>(ref gg); + ref Vector256 bbRefAsVector = ref Unsafe.As>(ref bb); + + // Walking 8 elements at one step: + int n = result.Length / 8; + for (int i = 0; i < n; i++) + { + // y = yVals[i]; + // cb = cbVals[i] - 128F; + // cr = crVals[i] - 128F; + Vector256 y = Unsafe.Add(ref yBase, i); + Vector256 cb = Avx.Add(Unsafe.Add(ref cbBase, i), chromaOffset); + Vector256 cr = Avx.Add(Unsafe.Add(ref crBase, i), chromaOffset); + + // r = y + (1.402F * cr); + // g = y - (0.344136F * cb) - (0.714136F * cr); + // b = y + (1.772F * cb); + // Adding & multiplying 8 elements at one time: + Vector256 r = HwIntrinsics.MultiplyAdd(y, cr, rCrMult); + Vector256 g = Avx.Subtract(Avx.Subtract(y, Avx.Multiply(cb, gCbMult)), Avx.Multiply(cr, gCrMult)); + Vector256 b = HwIntrinsics.MultiplyAdd(y, cb, bCbMult); + + r = Avx.Multiply(Avx.RoundToNearestInteger(r), scale); + g = Avx.Multiply(Avx.RoundToNearestInteger(g), scale); + b = Avx.Multiply(Avx.RoundToNearestInteger(b), scale); + + rrRefAsVector = r; + ggRefAsVector = g; + bbRefAsVector = b; + + // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... 
order: + ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i); + destination.PackAvx2(ref rr, ref gg, ref bb, in valpha, in vcontrol); + } +#else ref Vector yBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); ref Vector cbBase = @@ -104,6 +173,7 @@ internal static void ConvertCore(in ComponentValues values, Span result ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i); destination.Pack(ref rr, ref gg, ref bb); } +#endif } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index f2a1c1e91e..4e96f3471d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -190,95 +190,97 @@ internal struct Vector4Octet #pragma warning disable SA1132 // Do not combine fields public Vector4 V0, V1, V2, V3, V4, V5, V6, V7; +#if SUPPORTS_RUNTIME_INTRINSICS + /// /// Pack (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values as (r0,g0,b0,1), (r1,g1,b1,1) ... 
/// - public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b) + [MethodImpl(InliningOptions.ShortMethod)] + public void PackAvx2( + ref Vector4Pair r, + ref Vector4Pair g, + ref Vector4Pair b, + in Vector128 a, + in Vector256 vcontrol) { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported) - { - Vector4 vo = Vector4.One; - Vector128 valpha = Unsafe.As>(ref vo); + Vector256 r0 = Avx.InsertVector128( + Unsafe.As>(ref r.A), + Unsafe.As>(ref g.A), + 1); - ref byte control = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskDeinterleave8x32); - Vector256 vcontrol = Unsafe.As>(ref control); + Vector256 r1 = Avx.InsertVector128( + Unsafe.As>(ref b.A), + a, + 1); - Vector256 r0 = Avx.InsertVector128( - Unsafe.As>(ref r.A).ToVector256(), - Unsafe.As>(ref g.A), - 1); + Vector256 r2 = Avx.InsertVector128( + Unsafe.As>(ref r.B).ToVector256(), + Unsafe.As>(ref g.B), + 1); - Vector256 r1 = Avx.InsertVector128( - Unsafe.As>(ref b.A).ToVector256(), - valpha, - 1); + Vector256 r3 = Avx.InsertVector128( + Unsafe.As>(ref b.B).ToVector256(), + a, + 1); - Vector256 r2 = Avx.InsertVector128( - Unsafe.As>(ref r.B).ToVector256(), - Unsafe.As>(ref g.B), - 1); + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackHigh(r0, r1); - Vector256 r3 = Avx.InsertVector128( - Unsafe.As>(ref b.B).ToVector256(), - valpha, - 1); + Unsafe.As>(ref this.V0) = Avx2.PermuteVar8x32(t0, vcontrol); + Unsafe.As>(ref this.V2) = Avx2.PermuteVar8x32(t2, vcontrol); - Vector256 t0 = Avx.UnpackLow(r0, r1); - Vector256 t2 = Avx.UnpackHigh(r0, r1); + Vector256 t4 = Avx.UnpackLow(r2, r3); + Vector256 t6 = Avx.UnpackHigh(r2, r3); - Unsafe.As>(ref this.V0) = Avx2.PermuteVar8x32(t0, vcontrol); - Unsafe.As>(ref this.V2) = Avx2.PermuteVar8x32(t2, vcontrol); - - Vector256 t4 = Avx.UnpackLow(r2, r3); - Vector256 t6 = Avx.UnpackHigh(r2, r3); - - Unsafe.As>(ref this.V4) = Avx2.PermuteVar8x32(t4, vcontrol); - Unsafe.As>(ref this.V6) = Avx2.PermuteVar8x32(t6, vcontrol); - } - else + 
Unsafe.As>(ref this.V4) = Avx2.PermuteVar8x32(t4, vcontrol); + Unsafe.As>(ref this.V6) = Avx2.PermuteVar8x32(t6, vcontrol); + } #endif - { - this.V0.X = r.A.X; - this.V0.Y = g.A.X; - this.V0.Z = b.A.X; - this.V0.W = 1f; - - this.V1.X = r.A.Y; - this.V1.Y = g.A.Y; - this.V1.Z = b.A.Y; - this.V1.W = 1f; - - this.V2.X = r.A.Z; - this.V2.Y = g.A.Z; - this.V2.Z = b.A.Z; - this.V2.W = 1f; - - this.V3.X = r.A.W; - this.V3.Y = g.A.W; - this.V3.Z = b.A.W; - this.V3.W = 1f; - - this.V4.X = r.B.X; - this.V4.Y = g.B.X; - this.V4.Z = b.B.X; - this.V4.W = 1f; - - this.V5.X = r.B.Y; - this.V5.Y = g.B.Y; - this.V5.Z = b.B.Y; - this.V5.W = 1f; - - this.V6.X = r.B.Z; - this.V6.Y = g.B.Z; - this.V6.Z = b.B.Z; - this.V6.W = 1f; - - this.V7.X = r.B.W; - this.V7.Y = g.B.W; - this.V7.Z = b.B.W; - this.V7.W = 1f; - } + + /// + /// Pack (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values as (r0,g0,b0,1), (r1,g1,b1,1) ... + /// + public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b) + { + this.V0.X = r.A.X; + this.V0.Y = g.A.X; + this.V0.Z = b.A.X; + this.V0.W = 1f; + + this.V1.X = r.A.Y; + this.V1.Y = g.A.Y; + this.V1.Z = b.A.Y; + this.V1.W = 1f; + + this.V2.X = r.A.Z; + this.V2.Y = g.A.Z; + this.V2.Z = b.A.Z; + this.V2.W = 1f; + + this.V3.X = r.A.W; + this.V3.Y = g.A.W; + this.V3.Z = b.A.W; + this.V3.W = 1f; + + this.V4.X = r.B.X; + this.V4.Y = g.B.X; + this.V4.Z = b.B.X; + this.V4.W = 1f; + + this.V5.X = r.B.Y; + this.V5.Y = g.B.Y; + this.V5.Z = b.B.Y; + this.V5.W = 1f; + + this.V6.X = r.B.Z; + this.V6.Y = g.B.Z; + this.V6.Z = b.B.Z; + this.V6.W = 1f; + + this.V7.X = r.B.W; + this.V7.Y = g.B.W; + this.V7.Z = b.B.W; + this.V7.W = 1f; } } } diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs deleted file mode 100644 index a7ea771988..0000000000 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) Six Labors. 
-// Licensed under the Apache License, Version 2.0. - -using System.Numerics; -using BenchmarkDotNet.Attributes; -using SixLabors.ImageSharp.Tuples; -using static SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters.JpegColorConverter; - -namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg -{ - [Config(typeof(Config.HwIntrinsics_SSE_AVX))] - public class Vector4OctetPack - { - private static Vector4Pair r = new Vector4Pair - { - A = new Vector4(1, 2, 3, 4), - B = new Vector4(5, 6, 7, 8) - }; - - private static Vector4Pair g = new Vector4Pair - { - A = new Vector4(9, 10, 11, 12), - B = new Vector4(13, 14, 15, 16) - }; - - private static Vector4Pair b = new Vector4Pair - { - A = new Vector4(17, 18, 19, 20), - B = new Vector4(21, 22, 23, 24) - }; - - [Benchmark] - public void Pack() - { - Vector4Octet v = default; - - v.Pack(ref r, ref g, ref b); - } - } -} From 8872b2b949498bf272f63ee81fdaab59373744ba Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 23 Oct 2020 18:09:06 +0100 Subject: [PATCH 3/5] Fix access violation --- .../Components/Decoder/ColorConverters/JpegColorConverter.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index 4e96f3471d..b40d9b9e6e 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -204,12 +204,12 @@ public void PackAvx2( in Vector256 vcontrol) { Vector256 r0 = Avx.InsertVector128( - Unsafe.As>(ref r.A), + Unsafe.As>(ref r.A).ToVector256(), Unsafe.As>(ref g.A), 1); Vector256 r1 = Avx.InsertVector128( - Unsafe.As>(ref b.A), + Unsafe.As>(ref b.A).ToVector256(), a, 1); From eb315fe83d5ef9b613ad3fa118ba3ab3cefab010 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 23 Oct 2020 
18:46:00 +0100 Subject: [PATCH 4/5] Inline the packing. --- .../JpegColorConverter.FromYCbCrSimdAvx2.cs | 59 +++++++++++++------ .../ColorConverters/JpegColorConverter.cs | 53 ----------------- 2 files changed, 42 insertions(+), 70 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs index 8c34baa1dc..ca7971a074 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs @@ -59,8 +59,8 @@ internal static void ConvertCore(in ComponentValues values, Span result ref Vector256 crBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); - ref Vector4Octet resultBase = - ref Unsafe.As(ref MemoryMarshal.GetReference(result)); + ref Vector256 resultBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(result)); // Used for the color conversion var chromaOffset = Vector256.Create(-halfValue); @@ -76,14 +76,6 @@ internal static void ConvertCore(in ComponentValues values, Span result ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskDeinterleave8x32); Vector256 vcontrol = Unsafe.As>(ref control); - Vector4Pair rr = default; - Vector4Pair gg = default; - Vector4Pair bb = default; - - ref Vector256 rrRefAsVector = ref Unsafe.As>(ref rr); - ref Vector256 ggRefAsVector = ref Unsafe.As>(ref gg); - ref Vector256 bbRefAsVector = ref Unsafe.As>(ref bb); - // Walking 8 elements at one step: int n = result.Length / 8; for (int i = 0; i < n; i++) @@ -107,13 +99,46 @@ internal static void ConvertCore(in ComponentValues values, Span result g = Avx.Multiply(Avx.RoundToNearestInteger(g), scale); b = Avx.Multiply(Avx.RoundToNearestInteger(b), scale); - rrRefAsVector = r; - ggRefAsVector = g; - bbRefAsVector = b; 
- - // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order: - ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i); - destination.PackAvx2(ref rr, ref gg, ref bb, in valpha, in vcontrol); + // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the + // expected (r0,g0,g1,1), (r1,g1,g2,1) ... order: + // + // Left side. + Vector256 r0 = Avx.InsertVector128( + r, + Unsafe.As, Vector128>(ref g), + 1); + + Vector256 r1 = Avx.InsertVector128( + b, + valpha, + 1); + + // Right side + Vector256 r2 = Avx.InsertVector128( + Unsafe.Add(ref Unsafe.As, Vector128>(ref r), 1).ToVector256(), + Unsafe.Add(ref Unsafe.As, Vector128>(ref g), 1), + 1); + + Vector256 r3 = Avx.InsertVector128( + Unsafe.Add(ref Unsafe.As, Vector128>(ref b), 1).ToVector256(), + valpha, + 1); + + // Split into separate rows + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackHigh(r0, r1); + + // Deinterleave and set + ref Vector256 destination = ref Unsafe.Add(ref resultBase, i * 4); + destination = Avx2.PermuteVar8x32(t0, vcontrol); + Unsafe.Add(ref destination, 1) = Avx2.PermuteVar8x32(t2, vcontrol); + + // Repeat for right side. 
+ Vector256 t4 = Avx.UnpackLow(r2, r3); + Vector256 t6 = Avx.UnpackHigh(r2, r3); + + Unsafe.Add(ref destination, 2) = Avx2.PermuteVar8x32(t4, vcontrol); + Unsafe.Add(ref destination, 3) = Avx2.PermuteVar8x32(t6, vcontrol); } #else ref Vector yBase = diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index b40d9b9e6e..7c780700c9 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -4,12 +4,6 @@ using System; using System.Collections.Generic; using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -#if SUPPORTS_RUNTIME_INTRINSICS -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; -#endif using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.Tuples; @@ -190,53 +184,6 @@ internal struct Vector4Octet #pragma warning disable SA1132 // Do not combine fields public Vector4 V0, V1, V2, V3, V4, V5, V6, V7; -#if SUPPORTS_RUNTIME_INTRINSICS - - /// - /// Pack (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values as (r0,g0,b0,1), (r1,g1,b1,1) ... 
- /// - [MethodImpl(InliningOptions.ShortMethod)] - public void PackAvx2( - ref Vector4Pair r, - ref Vector4Pair g, - ref Vector4Pair b, - in Vector128 a, - in Vector256 vcontrol) - { - Vector256 r0 = Avx.InsertVector128( - Unsafe.As>(ref r.A).ToVector256(), - Unsafe.As>(ref g.A), - 1); - - Vector256 r1 = Avx.InsertVector128( - Unsafe.As>(ref b.A).ToVector256(), - a, - 1); - - Vector256 r2 = Avx.InsertVector128( - Unsafe.As>(ref r.B).ToVector256(), - Unsafe.As>(ref g.B), - 1); - - Vector256 r3 = Avx.InsertVector128( - Unsafe.As>(ref b.B).ToVector256(), - a, - 1); - - Vector256 t0 = Avx.UnpackLow(r0, r1); - Vector256 t2 = Avx.UnpackHigh(r0, r1); - - Unsafe.As>(ref this.V0) = Avx2.PermuteVar8x32(t0, vcontrol); - Unsafe.As>(ref this.V2) = Avx2.PermuteVar8x32(t2, vcontrol); - - Vector256 t4 = Avx.UnpackLow(r2, r3); - Vector256 t6 = Avx.UnpackHigh(r2, r3); - - Unsafe.As>(ref this.V4) = Avx2.PermuteVar8x32(t4, vcontrol); - Unsafe.As>(ref this.V6) = Avx2.PermuteVar8x32(t6, vcontrol); - } -#endif - /// /// Pack (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values as (r0,g0,b0,1), (r1,g1,b1,1) ... 
/// From 3ae4b024a0a979e46d38c92557c32e431c5ed05a Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sat, 24 Oct 2020 00:55:22 +0100 Subject: [PATCH 5/5] Use less permutes and more multiply/add --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 2 + .../JpegColorConverter.FromYCbCrSimdAvx2.cs | 63 ++++++------------- 2 files changed, 22 insertions(+), 43 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index c5a7f5e909..2d788992ee 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -16,6 +16,8 @@ public static class HwIntrinsics { public static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; + public static ReadOnlySpan PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 }; + /// /// Performs a multiplication and an addition of the . 
/// diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs index ca7971a074..1319b56ee0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs @@ -66,14 +66,13 @@ internal static void ConvertCore(in ComponentValues values, Span result var chromaOffset = Vector256.Create(-halfValue); var scale = Vector256.Create(1 / maxValue); var rCrMult = Vector256.Create(1.402F); - var gCbMult = Vector256.Create(0.344136F); - var gCrMult = Vector256.Create(0.714136F); + var gCbMult = Vector256.Create(-0.344136F); + var gCrMult = Vector256.Create(-0.714136F); var bCbMult = Vector256.Create(1.772F); // Used for packing. - Vector4 vo = Vector4.One; - Vector128 valpha = Unsafe.As>(ref vo); - ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskDeinterleave8x32); + var va = Vector256.Create(1F); + ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32); Vector256 vcontrol = Unsafe.As>(ref control); // Walking 8 elements at one step: @@ -87,58 +86,36 @@ internal static void ConvertCore(in ComponentValues values, Span result Vector256 cb = Avx.Add(Unsafe.Add(ref cbBase, i), chromaOffset); Vector256 cr = Avx.Add(Unsafe.Add(ref crBase, i), chromaOffset); + y = Avx2.PermuteVar8x32(y, vcontrol); + cb = Avx2.PermuteVar8x32(cb, vcontrol); + cr = Avx2.PermuteVar8x32(cr, vcontrol); + // r = y + (1.402F * cr); // g = y - (0.344136F * cb) - (0.714136F * cr); // b = y + (1.772F * cb); // Adding & multiplying 8 elements at one time: Vector256 r = HwIntrinsics.MultiplyAdd(y, cr, rCrMult); - Vector256 g = Avx.Subtract(Avx.Subtract(y, Avx.Multiply(cb, gCbMult)), Avx.Multiply(cr, gCrMult)); + Vector256 g = 
HwIntrinsics.MultiplyAdd(HwIntrinsics.MultiplyAdd(y, cb, gCbMult), cr, gCrMult); Vector256 b = HwIntrinsics.MultiplyAdd(y, cb, bCbMult); + // TODO: We should be saving to RGBA not Vector4 r = Avx.Multiply(Avx.RoundToNearestInteger(r), scale); g = Avx.Multiply(Avx.RoundToNearestInteger(g), scale); b = Avx.Multiply(Avx.RoundToNearestInteger(b), scale); - // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the - // expected (r0,g0,g1,1), (r1,g1,g2,1) ... order: - // - // Left side. - Vector256 r0 = Avx.InsertVector128( - r, - Unsafe.As, Vector128>(ref g), - 1); - - Vector256 r1 = Avx.InsertVector128( - b, - valpha, - 1); - - // Right side - Vector256 r2 = Avx.InsertVector128( - Unsafe.Add(ref Unsafe.As, Vector128>(ref r), 1).ToVector256(), - Unsafe.Add(ref Unsafe.As, Vector128>(ref g), 1), - 1); - - Vector256 r3 = Avx.InsertVector128( - Unsafe.Add(ref Unsafe.As, Vector128>(ref b), 1).ToVector256(), - valpha, - 1); - - // Split into separate rows - Vector256 t0 = Avx.UnpackLow(r0, r1); - Vector256 t2 = Avx.UnpackHigh(r0, r1); - - // Deinterleave and set + Vector256 vte = Avx.UnpackLow(r, b); + Vector256 vto = Avx.UnpackLow(g, va); + ref Vector256 destination = ref Unsafe.Add(ref resultBase, i * 4); - destination = Avx2.PermuteVar8x32(t0, vcontrol); - Unsafe.Add(ref destination, 1) = Avx2.PermuteVar8x32(t2, vcontrol); - // Repeat for right side. - Vector256 t4 = Avx.UnpackLow(r2, r3); - Vector256 t6 = Avx.UnpackHigh(r2, r3); + destination = Avx.UnpackLow(vte, vto); + Unsafe.Add(ref destination, 1) = Avx.UnpackHigh(vte, vto); + + vte = Avx.UnpackHigh(r, b); + vto = Avx.UnpackHigh(g, va); - Unsafe.Add(ref destination, 2) = Avx2.PermuteVar8x32(t4, vcontrol); - Unsafe.Add(ref destination, 3) = Avx2.PermuteVar8x32(t6, vcontrol); + Unsafe.Add(ref destination, 2) = Avx.UnpackLow(vte, vto); + Unsafe.Add(ref destination, 3) = Avx.UnpackHigh(vte, vto); } #else ref Vector yBase =