From 151bacc0203f097f7aa09d6f3adb52cd97171a78 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sat, 23 Oct 2021 19:22:57 +0200
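Subject: [PATCH 01/85] Use Convert.ToInt16/ToInt32 after rounding to avoid
 different behavior on ARM vs x86/x64

The Pack methods cast the rounded floating-point component straight to an
unsigned integer type. For negative components that value is out of range
for the destination type, and in an unchecked context C# leaves the result
of such a conversion unspecified, so the packed bits could differ between
ARM and x86/x64. Convert the rounded value to a signed integer first and
cast that to the unsigned type instead.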
---
 .../PixelImplementations/NormalizedByte2.cs   | 12 ++++++----
 .../PixelImplementations/NormalizedByte4.cs   | 20 ++++++++--------
 .../PixelImplementations/NormalizedShort2.cs  | 11 +++++----
 .../PixelImplementations/NormalizedShort4.cs  | 23 ++++++++++---------
 .../PixelImplementations/Short2.cs            |  4 ++--
 .../PixelImplementations/Short4.cs            |  8 +++----
 6 files changed, 42 insertions(+), 36 deletions(-)
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte2.cs b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte2.cs
index 8b244d391c..720a1eef65 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte2.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte2.cs
@@ -15,7 +15,9 @@ namespace SixLabors.ImageSharp.PixelFormats
     /// </summary>
     public partial struct NormalizedByte2 : IPixel<NormalizedByte2>, IPackedVector<ushort>
     {
-        private static readonly Vector2 Half = new Vector2(127);
+        private const float MaxPos = 127F;
+
+        private static readonly Vector2 Half = new Vector2(MaxPos);
         private static readonly Vector2 MinusOne = new Vector2(-1F);
 
         /// <summary>
@@ -154,8 +156,8 @@ public void FromVector4(Vector4 vector)
         public readonly Vector2 ToVector2()
         {
             return new Vector2(
-                (sbyte)((this.PackedValue >> 0) & 0xFF) / 127F,
-                (sbyte)((this.PackedValue >> 8) & 0xFF) / 127F);
+                (sbyte)((this.PackedValue >> 0) & 0xFF) / MaxPos,
+                (sbyte)((this.PackedValue >> 8) & 0xFF) / MaxPos);
         }
 
         /// <inheritdoc />
@@ -181,8 +183,8 @@ private static ushort Pack(Vector2 vector)
         {
             vector = Vector2.Clamp(vector, MinusOne, Vector2.One) * Half;
 
-            int byte2 = ((ushort)Math.Round(vector.X) & 0xFF) << 0;
-            int byte1 = ((ushort)Math.Round(vector.Y) & 0xFF) << 8;
+            int byte2 = ((ushort)Convert.ToInt16(Math.Round(vector.X)) & 0xFF) << 0;
+            int byte1 = ((ushort)Convert.ToInt16(Math.Round(vector.Y)) & 0xFF) << 8;
 
             return (ushort)(byte2 | byte1);
         }
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte4.cs b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte4.cs
index 84f0bb0224..d1b4b73f2b 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte4.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte4.cs
@@ -15,7 +15,9 @@ namespace SixLabors.ImageSharp.PixelFormats
     /// </summary>
     public partial struct NormalizedByte4 : IPixel<NormalizedByte4>, IPackedVector<uint>
     {
-        private static readonly Vector4 Half = new Vector4(127);
+        private const float MaxPos = 127F;
+
+        private static readonly Vector4 Half = new Vector4(MaxPos);
         private static readonly Vector4 MinusOne = new Vector4(-1F);
 
         /// <summary>
@@ -92,10 +94,10 @@ public readonly Vector4 ToScaledVector4()
         public readonly Vector4 ToVector4()
         {
             return new Vector4(
-                (sbyte)((this.PackedValue >> 0) & 0xFF) / 127F,
-                (sbyte)((this.PackedValue >> 8) & 0xFF) / 127F,
-                (sbyte)((this.PackedValue >> 16) & 0xFF) / 127F,
-                (sbyte)((this.PackedValue >> 24) & 0xFF) / 127F);
+                (sbyte)((this.PackedValue >> 0) & 0xFF) / MaxPos,
+                (sbyte)((this.PackedValue >> 8) & 0xFF) / MaxPos,
+                (sbyte)((this.PackedValue >> 16) & 0xFF) / MaxPos,
+                (sbyte)((this.PackedValue >> 24) & 0xFF) / MaxPos);
         }
 
         /// <inheritdoc />
@@ -176,10 +178,10 @@ private static uint Pack(ref Vector4 vector)
         {
             vector = Numerics.Clamp(vector, MinusOne, Vector4.One) * Half;
 
-            uint byte4 = ((uint)MathF.Round(vector.X) & 0xFF) << 0;
-            uint byte3 = ((uint)MathF.Round(vector.Y) & 0xFF) << 8;
-            uint byte2 = ((uint)MathF.Round(vector.Z) & 0xFF) << 16;
-            uint byte1 = ((uint)MathF.Round(vector.W) & 0xFF) << 24;
+            uint byte4 = ((uint)Convert.ToInt16(MathF.Round(vector.X)) & 0xFF) << 0;
+            uint byte3 = ((uint)Convert.ToInt16(MathF.Round(vector.Y)) & 0xFF) << 8;
+            uint byte2 = ((uint)Convert.ToInt16(MathF.Round(vector.Z)) & 0xFF) << 16;
+            uint byte1 = ((uint)Convert.ToInt16(MathF.Round(vector.W)) & 0xFF) << 24;
 
             return byte4 | byte3 | byte2 | byte1;
         }
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort2.cs b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort2.cs
index 97bbc1206f..d08a546031 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort2.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort2.cs
@@ -15,7 +15,10 @@ namespace SixLabors.ImageSharp.PixelFormats
     /// </summary>
     public partial struct NormalizedShort2 : IPixel<NormalizedShort2>, IPackedVector<uint>
     {
-        private static readonly Vector2 Max = new Vector2(0x7FFF);
+        // Largest two byte positive number 0xFFFF >> 1;
+        private const float MaxPos = 0x7FFF;
+
+        private static readonly Vector2 Max = new Vector2(MaxPos);
         private static readonly Vector2 Min = Vector2.Negate(Max);
 
         /// <summary>
@@ -156,11 +159,9 @@ public void ToRgba32(ref Rgba32 dest)
         [MethodImpl(InliningOptions.ShortMethod)]
         public readonly Vector2 ToVector2()
         {
-            const float MaxVal = 0x7FFF;
-
             return new Vector2(
-                (short)(this.PackedValue & 0xFFFF) / MaxVal,
-                (short)(this.PackedValue >> 0x10) / MaxVal);
+                (short)(this.PackedValue & 0xFFFF) / MaxPos,
+                (short)(this.PackedValue >> 0x10) / MaxPos);
         }
 
         /// <inheritdoc />
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort4.cs b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort4.cs
index a3fd8989ce..158b6eb4b0 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort4.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort4.cs
@@ -15,7 +15,10 @@ namespace SixLabors.ImageSharp.PixelFormats
     /// </summary>
     public partial struct NormalizedShort4 : IPixel<NormalizedShort4>, IPackedVector<ulong>
     {
-        private static readonly Vector4 Max = new Vector4(0x7FFF);
+        // Largest two byte positive number 0xFFFF >> 1;
+        private const float MaxPos = 0x7FFF;
+
+        private static readonly Vector4 Max = new Vector4(MaxPos);
         private static readonly Vector4 Min = Vector4.Negate(Max);
 
         /// <summary>
@@ -91,13 +94,11 @@ public readonly Vector4 ToScaledVector4()
         [MethodImpl(InliningOptions.ShortMethod)]
         public readonly Vector4 ToVector4()
         {
-            const float MaxVal = 0x7FFF;
-
             return new Vector4(
-                         (short)((this.PackedValue >> 0x00) & 0xFFFF) / MaxVal,
-                         (short)((this.PackedValue >> 0x10) & 0xFFFF) / MaxVal,
-                         (short)((this.PackedValue >> 0x20) & 0xFFFF) / MaxVal,
-                         (short)((this.PackedValue >> 0x30) & 0xFFFF) / MaxVal);
+                         (short)((this.PackedValue >> 0x00) & 0xFFFF) / MaxPos,
+                         (short)((this.PackedValue >> 0x10) & 0xFFFF) / MaxPos,
+                         (short)((this.PackedValue >> 0x20) & 0xFFFF) / MaxPos,
+                         (short)((this.PackedValue >> 0x30) & 0xFFFF) / MaxPos);
         }
 
         /// <inheritdoc />
@@ -180,10 +181,10 @@ private static ulong Pack(ref Vector4 vector)
             vector = Numerics.Clamp(vector, Min, Max);
 
             // Round rather than truncate.
-            ulong word4 = ((ulong)MathF.Round(vector.X) & 0xFFFF) << 0x00;
-            ulong word3 = ((ulong)MathF.Round(vector.Y) & 0xFFFF) << 0x10;
-            ulong word2 = ((ulong)MathF.Round(vector.Z) & 0xFFFF) << 0x20;
-            ulong word1 = ((ulong)MathF.Round(vector.W) & 0xFFFF) << 0x30;
+            ulong word4 = ((ulong)Convert.ToInt32(MathF.Round(vector.X)) & 0xFFFF) << 0x00;
+            ulong word3 = ((ulong)Convert.ToInt32(MathF.Round(vector.Y)) & 0xFFFF) << 0x10;
+            ulong word2 = ((ulong)Convert.ToInt32(MathF.Round(vector.Z)) & 0xFFFF) << 0x20;
+            ulong word1 = ((ulong)Convert.ToInt32(MathF.Round(vector.W)) & 0xFFFF) << 0x30;
 
             return word4 | word3 | word2 | word1;
         }
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Short2.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Short2.cs
index f7a4f99945..101027a78e 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Short2.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Short2.cs
@@ -181,8 +181,8 @@ public override readonly string ToString()
         private static uint Pack(Vector2 vector)
         {
             vector = Vector2.Clamp(vector, Min, Max);
-            uint word2 = (uint)Math.Round(vector.X) & 0xFFFF;
-            uint word1 = ((uint)Math.Round(vector.Y) & 0xFFFF) << 0x10;
+            uint word2 = (uint)Convert.ToInt32(Math.Round(vector.X)) & 0xFFFF;
+            uint word1 = ((uint)Convert.ToInt32(Math.Round(vector.Y)) & 0xFFFF) << 0x10;
 
             return word2 | word1;
         }
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Short4.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Short4.cs
index 409f46c721..86a519297b 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Short4.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Short4.cs
@@ -186,10 +186,10 @@ private static ulong Pack(ref Vector4 vector)
             vector = Numerics.Clamp(vector, Min, Max);
 
             // Clamp the value between min and max values
-            ulong word4 = ((ulong)Math.Round(vector.X) & 0xFFFF) << 0x00;
-            ulong word3 = ((ulong)Math.Round(vector.Y) & 0xFFFF) << 0x10;
-            ulong word2 = ((ulong)Math.Round(vector.Z) & 0xFFFF) << 0x20;
-            ulong word1 = ((ulong)Math.Round(vector.W) & 0xFFFF) << 0x30;
+            ulong word4 = ((ulong)Convert.ToInt32(Math.Round(vector.X)) & 0xFFFF) << 0x00;
+            ulong word3 = ((ulong)Convert.ToInt32(Math.Round(vector.Y)) & 0xFFFF) << 0x10;
+            ulong word2 = ((ulong)Convert.ToInt32(Math.Round(vector.Z)) & 0xFFFF) << 0x20;
+            ulong word1 = ((ulong)Convert.ToInt32(Math.Round(vector.W)) & 0xFFFF) << 0x30;
 
             return word4 | word3 | word2 | word1;
         }

From 49e57722b815ec550e15cd41fe4e3202abe5287c Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sat, 23 Oct 2021 20:05:25 +0200
Subject: [PATCH 02/85] Cleanup

---
 .../PixelFormats/PixelImplementations/A8.cs   |  2 +-
 .../PixelImplementations/Argb32.cs            |  6 +--
 .../PixelImplementations/Bgr24.cs             |  4 +-
 .../PixelImplementations/Bgr565.cs            | 12 ++----
 .../PixelImplementations/Bgra32.cs            |  6 +--
 .../PixelImplementations/Bgra4444.cs          |  5 +--
 .../PixelImplementations/Bgra5551.cs          | 10 +----
 .../PixelImplementations/Byte4.cs             | 10 +----
 .../PixelImplementations/HalfSingle.cs        |  7 +---
 .../PixelImplementations/HalfVector2.cs       |  5 +--
 .../PixelImplementations/HalfVector4.cs       | 10 +----
 .../PixelFormats/PixelImplementations/L16.cs  | 25 +++--------
 .../PixelFormats/PixelImplementations/L8.cs   |  4 +-
 .../PixelFormats/PixelImplementations/La16.cs |  6 +--
 .../PixelFormats/PixelImplementations/La32.cs |  2 +-
 .../PixelImplementations/NormalizedByte2.cs   | 11 ++---
 .../PixelImplementations/NormalizedByte4.cs   | 14 ++-----
 .../PixelImplementations/NormalizedShort2.cs  | 12 ++----
 .../PixelImplementations/NormalizedShort4.cs  | 12 ++----
 .../PixelFormats/PixelImplementations/Rg32.cs |  9 ++--
 .../PixelImplementations/Rgb24.cs             |  4 +-
 .../PixelImplementations/Rgb48.cs             |  2 +-
 .../PixelImplementations/Rgba1010102.cs       | 12 ++----
 .../PixelImplementations/Rgba32.cs            | 19 ++++-----
 .../PixelImplementations/Rgba64.cs            |  2 +-
 .../PixelImplementations/RgbaVector.cs        | 13 +++---
 .../PixelImplementations/Short2.cs            |  8 ++--
 .../PixelFormats/Bgr24Tests.cs                |  3 +-
 .../PixelFormats/Bgra32Tests.cs               | 11 +++--
 .../ImageSharp.Tests/PixelFormats/L8Tests.cs  | 24 +----------
 .../PixelFormats/La16Tests.cs                 | 24 +----------
 .../PixelFormats/PixelBlenderTests.cs         |  6 +--
 .../PixelFormats/PixelConverterTests.cs       | 42 +++++++++----------
 .../PixelFormats/Rgb24Tests.cs                |  4 +-
 .../PixelFormats/UnPackedPixelTests.cs        |  2 +-
 35 files changed, 111 insertions(+), 237 deletions(-)

diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/A8.cs b/src/ImageSharp/PixelFormats/PixelImplementations/A8.cs
index 77df2bc800..cca7ff7db9 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/A8.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/A8.cs
@@ -73,7 +73,7 @@ public partial struct A8 : IPixel<A8>, IPackedVector<byte>
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4() => new Vector4(0, 0, 0, this.PackedValue / 255F);
+        public readonly Vector4 ToVector4() => new(0, 0, 0, this.PackedValue / 255F);
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Argb32.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Argb32.cs
index 3ac9b523f3..8c1b04ff1f 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Argb32.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Argb32.cs
@@ -44,12 +44,12 @@ public partial struct Argb32 : IPixel<Argb32>, IPackedVector<uint>
         /// <summary>
         /// The maximum byte value.
         /// </summary>
-        private static readonly Vector4 MaxBytes = new Vector4(255);
+        private static readonly Vector4 MaxBytes = new(255);
 
         /// <summary>
         /// The half vector value.
         /// </summary>
-        private static readonly Vector4 Half = new Vector4(0.5F);
+        private static readonly Vector4 Half = new(0.5F);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Argb32"/> struct.
@@ -151,7 +151,7 @@ public uint PackedValue
         /// <param name="source">The <see cref="Argb32"/>.</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static implicit operator Color(Argb32 source) => new Color(source);
+        public static implicit operator Color(Argb32 source) => new(source);
 
         /// <summary>
         /// Converts a <see cref="Color"/> to <see cref="Argb32"/>.
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Bgr24.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Bgr24.cs
index 6cff5fd772..22e983a654 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Bgr24.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Bgr24.cs
@@ -56,7 +56,7 @@ public Bgr24(byte r, byte g, byte b)
         /// <param name="source">The <see cref="Bgr24"/>.</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static implicit operator Color(Bgr24 source) => new Color(source);
+        public static implicit operator Color(Bgr24 source) => new(source);
 
         /// <summary>
         /// Converts a <see cref="Color"/> to <see cref="Bgr24"/>.
@@ -225,7 +225,7 @@ public void FromRgba64(Rgba64 source)
         public override readonly bool Equals(object obj) => obj is Bgr24 other && this.Equals(other);
 
         /// <inheritdoc />
-        public override readonly string ToString() => $"Bgra({this.B}, {this.G}, {this.R})";
+        public override readonly string ToString() => $"Bgr24({this.B}, {this.G}, {this.R})";
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Bgr565.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Bgr565.cs
index fd12b68376..5585310b91 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Bgr565.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Bgr565.cs
@@ -81,7 +81,7 @@ public void FromVector4(Vector4 vector)
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4() => new Vector4(this.ToVector3(), 1F);
+        public readonly Vector4 ToVector4() => new(this.ToVector3(), 1F);
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -125,10 +125,7 @@ public void FromVector4(Vector4 vector)
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void ToRgba32(ref Rgba32 dest)
-        {
-            dest.FromScaledVector4(this.ToScaledVector4());
-        }
+        public void ToRgba32(ref Rgba32 dest) => dest.FromScaledVector4(this.ToScaledVector4());
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -144,13 +141,10 @@ public void ToRgba32(ref Rgba32 dest)
         /// </summary>
         /// <returns>The <see cref="Vector3"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector3 ToVector3()
-        {
-            return new Vector3(
+        public readonly Vector3 ToVector3() => new(
                        ((this.PackedValue >> 11) & 0x1F) * (1F / 31F),
                        ((this.PackedValue >> 5) & 0x3F) * (1F / 63F),
                        (this.PackedValue & 0x1F) * (1F / 31F));
-        }
 
         /// <inheritdoc />
         public override readonly bool Equals(object obj) => obj is Bgr565 other && this.Equals(other);
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Bgra32.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Bgra32.cs
index 190345ddaf..be4e178c24 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Bgra32.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Bgra32.cs
@@ -41,12 +41,12 @@ public partial struct Bgra32 : IPixel<Bgra32>, IPackedVector<uint>
         /// <summary>
         /// The maximum byte value.
         /// </summary>
-        private static readonly Vector4 MaxBytes = new Vector4(255);
+        private static readonly Vector4 MaxBytes = new(255);
 
         /// <summary>
         /// The half vector value.
         /// </summary>
-        private static readonly Vector4 Half = new Vector4(0.5F);
+        private static readonly Vector4 Half = new(0.5F);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Bgra32"/> struct.
@@ -104,7 +104,7 @@ public uint PackedValue
         /// <param name="source">The <see cref="Bgra32"/>.</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static implicit operator Color(Bgra32 source) => new Color(source);
+        public static implicit operator Color(Bgra32 source) => new(source);
 
         /// <summary>
         /// Converts a <see cref="Color"/> to <see cref="Bgra32"/>.
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Bgra4444.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Bgra4444.cs
index 8fa5219d53..3578f1dd38 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Bgra4444.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Bgra4444.cs
@@ -128,10 +128,7 @@ public readonly Vector4 ToVector4()
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void ToRgba32(ref Rgba32 dest)
-        {
-            dest.FromScaledVector4(this.ToScaledVector4());
-        }
+        public void ToRgba32(ref Rgba32 dest) => dest.FromScaledVector4(this.ToScaledVector4());
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Bgra5551.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Bgra5551.cs
index b3a0d08960..0254397c3f 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Bgra5551.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Bgra5551.cs
@@ -78,14 +78,11 @@ public Bgra5551(float x, float y, float z, float w)
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4()
-        {
-            return new Vector4(
+        public readonly Vector4 ToVector4() => new(
                         ((this.PackedValue >> 10) & 0x1F) / 31F,
                         ((this.PackedValue >> 5) & 0x1F) / 31F,
                         ((this.PackedValue >> 0) & 0x1F) / 31F,
                         (this.PackedValue >> 15) & 0x01);
-        }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -129,10 +126,7 @@ public readonly Vector4 ToVector4()
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void ToRgba32(ref Rgba32 dest)
-        {
-            dest.FromScaledVector4(this.ToScaledVector4());
-        }
+        public void ToRgba32(ref Rgba32 dest) => dest.FromScaledVector4(this.ToScaledVector4());
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Byte4.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Byte4.cs
index e261212918..0995f8417f 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Byte4.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Byte4.cs
@@ -78,14 +78,11 @@ public Byte4(float x, float y, float z, float w)
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4()
-        {
-            return new Vector4(
+        public readonly Vector4 ToVector4() => new(
                 this.PackedValue & 0xFF,
                 (this.PackedValue >> 0x8) & 0xFF,
                 (this.PackedValue >> 0x10) & 0xFF,
                 (this.PackedValue >> 0x18) & 0xFF);
-        }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -129,10 +126,7 @@ public readonly Vector4 ToVector4()
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void ToRgba32(ref Rgba32 dest)
-        {
-            dest.FromScaledVector4(this.ToScaledVector4());
-        }
+        public void ToRgba32(ref Rgba32 dest) => dest.FromScaledVector4(this.ToScaledVector4());
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/HalfSingle.cs b/src/ImageSharp/PixelFormats/PixelImplementations/HalfSingle.cs
index 5c4aa1cfb6..b0ef0f6a9b 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/HalfSingle.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/HalfSingle.cs
@@ -74,7 +74,7 @@ public readonly Vector4 ToScaledVector4()
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4() => new Vector4(this.ToSingle(), 0, 0, 1F);
+        public readonly Vector4 ToVector4() => new(this.ToSingle(), 0, 0, 1F);
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -118,10 +118,7 @@ public readonly Vector4 ToScaledVector4()
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void ToRgba32(ref Rgba32 dest)
-        {
-            dest.FromScaledVector4(this.ToScaledVector4());
-        }
+        public void ToRgba32(ref Rgba32 dest) => dest.FromScaledVector4(this.ToScaledVector4());
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/HalfVector2.cs b/src/ImageSharp/PixelFormats/PixelImplementations/HalfVector2.cs
index 39cb6f7993..8be8261302 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/HalfVector2.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/HalfVector2.cs
@@ -129,10 +129,7 @@ public readonly Vector4 ToVector4()
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void ToRgba32(ref Rgba32 dest)
-        {
-            dest.FromScaledVector4(this.ToScaledVector4());
-        }
+        public void ToRgba32(ref Rgba32 dest) => dest.FromScaledVector4(this.ToScaledVector4());
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/HalfVector4.cs b/src/ImageSharp/PixelFormats/PixelImplementations/HalfVector4.cs
index 9826d61a2b..955b274acb 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/HalfVector4.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/HalfVector4.cs
@@ -86,14 +86,11 @@ public readonly Vector4 ToScaledVector4()
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4()
-        {
-            return new Vector4(
+        public readonly Vector4 ToVector4() => new(
                 HalfTypeHelper.Unpack((ushort)this.PackedValue),
                 HalfTypeHelper.Unpack((ushort)(this.PackedValue >> 0x10)),
                 HalfTypeHelper.Unpack((ushort)(this.PackedValue >> 0x20)),
                 HalfTypeHelper.Unpack((ushort)(this.PackedValue >> 0x30)));
-        }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -137,10 +134,7 @@ public readonly Vector4 ToVector4()
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void ToRgba32(ref Rgba32 dest)
-        {
-            dest.FromScaledVector4(this.ToScaledVector4());
-        }
+        public void ToRgba32(ref Rgba32 dest) => dest.FromScaledVector4(this.ToScaledVector4());
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/L16.cs b/src/ImageSharp/PixelFormats/PixelImplementations/L16.cs
index dd31aae2fc..6d1128dd2c 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/L16.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/L16.cs
@@ -72,33 +72,24 @@ public readonly Vector4 ToVector4()
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void FromArgb32(Argb32 source)
-        {
-            this.PackedValue = ColorNumerics.Get16BitBT709Luminance(
+        public void FromArgb32(Argb32 source) => this.PackedValue = ColorNumerics.Get16BitBT709Luminance(
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.R),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.G),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.B));
-        }
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void FromBgr24(Bgr24 source)
-        {
-            this.PackedValue = ColorNumerics.Get16BitBT709Luminance(
+        public void FromBgr24(Bgr24 source) => this.PackedValue = ColorNumerics.Get16BitBT709Luminance(
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.R),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.G),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.B));
-        }
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void FromBgra32(Bgra32 source)
-        {
-            this.PackedValue = ColorNumerics.Get16BitBT709Luminance(
+        public void FromBgra32(Bgra32 source) => this.PackedValue = ColorNumerics.Get16BitBT709Luminance(
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.R),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.G),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.B));
-        }
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -122,23 +113,17 @@ public void FromBgra32(Bgra32 source)
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void FromRgb24(Rgb24 source)
-        {
-            this.PackedValue = ColorNumerics.Get16BitBT709Luminance(
+        public void FromRgb24(Rgb24 source) => this.PackedValue = ColorNumerics.Get16BitBT709Luminance(
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.R),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.G),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.B));
-        }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void FromRgba32(Rgba32 source)
-        {
-            this.PackedValue = ColorNumerics.Get16BitBT709Luminance(
+        public void FromRgba32(Rgba32 source) => this.PackedValue = ColorNumerics.Get16BitBT709Luminance(
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.R),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.G),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(source.B));
-        }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/L8.cs b/src/ImageSharp/PixelFormats/PixelImplementations/L8.cs
index c570c33a19..ffff60be52 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/L8.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/L8.cs
@@ -14,8 +14,8 @@ namespace SixLabors.ImageSharp.PixelFormats
     /// </summary>
     public partial struct L8 : IPixel<L8>, IPackedVector<byte>
     {
-        private static readonly Vector4 MaxBytes = new Vector4(255F);
-        private static readonly Vector4 Half = new Vector4(0.5F);
+        private static readonly Vector4 MaxBytes = new(255F);
+        private static readonly Vector4 Half = new(0.5F);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="L8"/> struct.
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/La16.cs b/src/ImageSharp/PixelFormats/PixelImplementations/La16.cs
index 5a69431a1d..877aaed81c 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/La16.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/La16.cs
@@ -16,8 +16,8 @@ namespace SixLabors.ImageSharp.PixelFormats
     [StructLayout(LayoutKind.Explicit)]
     public partial struct La16 : IPixel<La16>, IPackedVector<ushort>
     {
-        private static readonly Vector4 MaxBytes = new Vector4(255F);
-        private static readonly Vector4 Half = new Vector4(0.5F);
+        private static readonly Vector4 MaxBytes = new(255F);
+        private static readonly Vector4 Half = new(0.5F);
 
         /// <summary>
         /// Gets or sets the luminance component.
@@ -35,7 +35,7 @@ public partial struct La16 : IPixel<La16>, IPackedVector<ushort>
         /// Initializes a new instance of the <see cref="La16"/> struct.
         /// </summary>
         /// <param name="l">The luminance component.</param>
-        /// <param name="a">The alpha componant.</param>
+        /// <param name="a">The alpha component.</param>
         public La16(byte l, byte a)
         {
             this.L = l;
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/La32.cs b/src/ImageSharp/PixelFormats/PixelImplementations/La32.cs
index 66d0e38c79..f19f228136 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/La32.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/La32.cs
@@ -35,7 +35,7 @@ public partial struct La32 : IPixel<La32>, IPackedVector<uint>
         /// Initializes a new instance of the <see cref="La32"/> struct.
         /// </summary>
         /// <param name="l">The luminance component.</param>
-        /// <param name="a">The alpha componant.</param>
+        /// <param name="a">The alpha component.</param>
         public La32(ushort l, ushort a)
         {
             this.L = l;
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte2.cs b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte2.cs
index 720a1eef65..62eaf949d1 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte2.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte2.cs
@@ -17,8 +17,8 @@ public partial struct NormalizedByte2 : IPixel<NormalizedByte2>, IPackedVector<u
     {
         private const float MaxPos = 127F;
 
-        private static readonly Vector2 Half = new Vector2(MaxPos);
-        private static readonly Vector2 MinusOne = new Vector2(-1F);
+        private static readonly Vector2 Half = new(MaxPos);
+        private static readonly Vector2 MinusOne = new(-1F);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="NormalizedByte2"/> struct.
@@ -93,7 +93,7 @@ public void FromVector4(Vector4 vector)
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4() => new Vector4(this.ToVector2(), 0F, 1F);
+        public readonly Vector4 ToVector4() => new(this.ToVector2(), 0F, 1F);
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -153,12 +153,9 @@ public void FromVector4(Vector4 vector)
         /// </summary>
         /// <returns>The <see cref="Vector2"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector2 ToVector2()
-        {
-            return new Vector2(
+        public readonly Vector2 ToVector2() => new(
                 (sbyte)((this.PackedValue >> 0) & 0xFF) / MaxPos,
                 (sbyte)((this.PackedValue >> 8) & 0xFF) / MaxPos);
-        }
 
         /// <inheritdoc />
         public override readonly bool Equals(object obj) => obj is NormalizedByte2 other && this.Equals(other);
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte4.cs b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte4.cs
index d1b4b73f2b..2e81b3e2dc 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte4.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedByte4.cs
@@ -17,8 +17,8 @@ public partial struct NormalizedByte4 : IPixel<NormalizedByte4>, IPackedVector<u
     {
         private const float MaxPos = 127F;
 
-        private static readonly Vector4 Half = new Vector4(MaxPos);
-        private static readonly Vector4 MinusOne = new Vector4(-1F);
+        private static readonly Vector4 Half = new(MaxPos);
+        private static readonly Vector4 MinusOne = new(-1F);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="NormalizedByte4"/> struct.
@@ -91,14 +91,11 @@ public readonly Vector4 ToScaledVector4()
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4()
-        {
-            return new Vector4(
+        public readonly Vector4 ToVector4() => new(
                 (sbyte)((this.PackedValue >> 0) & 0xFF) / MaxPos,
                 (sbyte)((this.PackedValue >> 8) & 0xFF) / MaxPos,
                 (sbyte)((this.PackedValue >> 16) & 0xFF) / MaxPos,
                 (sbyte)((this.PackedValue >> 24) & 0xFF) / MaxPos);
-        }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -142,10 +139,7 @@ public readonly Vector4 ToVector4()
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void ToRgba32(ref Rgba32 dest)
-        {
-            dest.FromScaledVector4(this.ToScaledVector4());
-        }
+        public void ToRgba32(ref Rgba32 dest) => dest.FromScaledVector4(this.ToScaledVector4());
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort2.cs b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort2.cs
index d08a546031..b97aaacec8 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort2.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort2.cs
@@ -18,7 +18,7 @@ public partial struct NormalizedShort2 : IPixel<NormalizedShort2>, IPackedVector
         // Largest two byte positive number 0xFFFF >> 1;
         private const float MaxPos = 0x7FFF;
 
-        private static readonly Vector2 Max = new Vector2(MaxPos);
+        private static readonly Vector2 Max = new(MaxPos);
         private static readonly Vector2 Min = Vector2.Negate(Max);
 
         /// <summary>
@@ -138,10 +138,7 @@ public void FromVector4(Vector4 vector)
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void ToRgba32(ref Rgba32 dest)
-        {
-            dest.FromScaledVector4(this.ToScaledVector4());
-        }
+        public void ToRgba32(ref Rgba32 dest) => dest.FromScaledVector4(this.ToScaledVector4());
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -157,12 +154,9 @@ public void ToRgba32(ref Rgba32 dest)
         /// </summary>
         /// <returns>The <see cref="Vector2"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector2 ToVector2()
-        {
-            return new Vector2(
+        public readonly Vector2 ToVector2() => new(
                 (short)(this.PackedValue & 0xFFFF) / MaxPos,
                 (short)(this.PackedValue >> 0x10) / MaxPos);
-        }
 
         /// <inheritdoc />
         public override readonly bool Equals(object obj) => obj is NormalizedShort2 other && this.Equals(other);
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort4.cs b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort4.cs
index 158b6eb4b0..f2e8aedd8f 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort4.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/NormalizedShort4.cs
@@ -18,7 +18,7 @@ public partial struct NormalizedShort4 : IPixel<NormalizedShort4>, IPackedVector
         // Largest two byte positive number 0xFFFF >> 1;
         private const float MaxPos = 0x7FFF;
 
-        private static readonly Vector4 Max = new Vector4(MaxPos);
+        private static readonly Vector4 Max = new(MaxPos);
         private static readonly Vector4 Min = Vector4.Negate(Max);
 
         /// <summary>
@@ -92,14 +92,11 @@ public readonly Vector4 ToScaledVector4()
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4()
-        {
-            return new Vector4(
+        public readonly Vector4 ToVector4() => new(
                          (short)((this.PackedValue >> 0x00) & 0xFFFF) / MaxPos,
                          (short)((this.PackedValue >> 0x10) & 0xFFFF) / MaxPos,
                          (short)((this.PackedValue >> 0x20) & 0xFFFF) / MaxPos,
                          (short)((this.PackedValue >> 0x30) & 0xFFFF) / MaxPos);
-        }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -143,10 +140,7 @@ public readonly Vector4 ToVector4()
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void ToRgba32(ref Rgba32 dest)
-        {
-            dest.FromScaledVector4(this.ToScaledVector4());
-        }
+        public void ToRgba32(ref Rgba32 dest) => dest.FromScaledVector4(this.ToScaledVector4());
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Rg32.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Rg32.cs
index d7e6f53cf2..12b6e153f9 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Rg32.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Rg32.cs
@@ -15,7 +15,7 @@ namespace SixLabors.ImageSharp.PixelFormats
     /// </summary>
     public partial struct Rg32 : IPixel<Rg32>, IPackedVector<uint>
     {
-        private static readonly Vector2 Max = new Vector2(ushort.MaxValue);
+        private static readonly Vector2 Max = new(ushort.MaxValue);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Rg32"/> struct.
@@ -79,7 +79,7 @@ public void FromVector4(Vector4 vector)
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4() => new Vector4(this.ToVector2(), 0F, 1F);
+        public readonly Vector4 ToVector4() => new(this.ToVector2(), 0F, 1F);
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -123,10 +123,7 @@ public void FromVector4(Vector4 vector)
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void ToRgba32(ref Rgba32 dest)
-        {
-            dest.FromScaledVector4(this.ToScaledVector4());
-        }
+        public void ToRgba32(ref Rgba32 dest) => dest.FromScaledVector4(this.ToScaledVector4());
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Rgb24.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Rgb24.cs
index 7fd63c6766..3b5bdb3d5a 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Rgb24.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Rgb24.cs
@@ -36,8 +36,8 @@ public partial struct Rgb24 : IPixel<Rgb24>
         [FieldOffset(2)]
         public byte B;
 
-        private static readonly Vector4 MaxBytes = new Vector4(byte.MaxValue);
-        private static readonly Vector4 Half = new Vector4(0.5F);
+        private static readonly Vector4 MaxBytes = new(byte.MaxValue);
+        private static readonly Vector4 Half = new(0.5F);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Rgb24"/> struct.
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Rgb48.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Rgb48.cs
index e3738b70c1..d16b7db7ac 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Rgb48.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Rgb48.cs
@@ -93,7 +93,7 @@ public void FromVector4(Vector4 vector)
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4() => new Vector4(this.R / Max, this.G / Max, this.B / Max, 1F);
+        public readonly Vector4 ToVector4() => new(this.R / Max, this.G / Max, this.B / Max, 1F);
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Rgba1010102.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Rgba1010102.cs
index dee2f9fcb6..e687260187 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Rgba1010102.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Rgba1010102.cs
@@ -16,7 +16,7 @@ namespace SixLabors.ImageSharp.PixelFormats
     /// </summary>
     public partial struct Rgba1010102 : IPixel<Rgba1010102>, IPackedVector<uint>
     {
-        private static readonly Vector4 Multiplier = new Vector4(1023F, 1023F, 1023F, 3F);
+        private static readonly Vector4 Multiplier = new(1023F, 1023F, 1023F, 3F);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Rgba1010102"/> struct.
@@ -78,14 +78,11 @@ public Rgba1010102(float x, float y, float z, float w)
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4()
-        {
-            return new Vector4(
+        public readonly Vector4 ToVector4() => new Vector4(
                 (this.PackedValue >> 0) & 0x03FF,
                 (this.PackedValue >> 10) & 0x03FF,
                 (this.PackedValue >> 20) & 0x03FF,
                 (this.PackedValue >> 30) & 0x03) / Multiplier;
-        }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -129,10 +126,7 @@ public readonly Vector4 ToVector4()
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void ToRgba32(ref Rgba32 dest)
-        {
-            dest.FromScaledVector4(this.ToScaledVector4());
-        }
+        public void ToRgba32(ref Rgba32 dest) => dest.FromScaledVector4(this.ToScaledVector4());
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Rgba32.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Rgba32.cs
index 868165e9c4..3dc6490f1b 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Rgba32.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Rgba32.cs
@@ -44,8 +44,8 @@ public partial struct Rgba32 : IPixel<Rgba32>, IPackedVector<uint>
         /// </summary>
         public byte A;
 
-        private static readonly Vector4 MaxBytes = new Vector4(byte.MaxValue);
-        private static readonly Vector4 Half = new Vector4(0.5F);
+        private static readonly Vector4 MaxBytes = new(byte.MaxValue);
+        private static readonly Vector4 Half = new(0.5F);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Rgba32"/> struct.
@@ -137,7 +137,7 @@ public uint Rgba
         public Rgb24 Rgb
         {
             [MethodImpl(InliningOptions.ShortMethod)]
-            readonly get => new Rgb24(this.R, this.G, this.B);
+            readonly get => new(this.R, this.G, this.B);
 
             [MethodImpl(InliningOptions.ShortMethod)]
             set
@@ -154,7 +154,7 @@ public Rgb24 Rgb
         public Bgr24 Bgr
         {
             [MethodImpl(InliningOptions.ShortMethod)]
-            readonly get => new Bgr24(this.R, this.G, this.B);
+            readonly get => new(this.R, this.G, this.B);
 
             [MethodImpl(InliningOptions.ShortMethod)]
             set
@@ -181,7 +181,7 @@ public uint PackedValue
         /// <param name="source">The <see cref="Rgba32"/>.</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static implicit operator Color(Rgba32 source) => new Color(source);
+        public static implicit operator Color(Rgba32 source) => new(source);
 
         /// <summary>
         /// Converts a <see cref="Color"/> to <see cref="Rgba32"/>.
@@ -393,10 +393,7 @@ public void FromRgb24(Rgb24 source)
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void ToRgba32(ref Rgba32 dest)
-        {
-            dest = this;
-        }
+        public void ToRgba32(ref Rgba32 dest) => dest = this;
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -424,7 +421,7 @@ public void FromRgba64(Rgba64 source)
         /// <returns>A hexadecimal string representation of the value.</returns>
         public readonly string ToHex()
         {
-            uint hexOrder = (uint)(this.A << 0 | this.B << 8 | this.G << 16 | this.R << 24);
+            uint hexOrder = (uint)((this.A << 0) | (this.B << 8) | (this.G << 16) | (this.R << 24));
             return hexOrder.ToString("X8");
         }
 
@@ -523,7 +520,7 @@ private static string ToRgbaHex(string hex)
                 return hex + "FF";
             }
 
-            if (hex.Length < 3 || hex.Length > 4)
+            if (hex.Length is < 3 or > 4)
             {
                 return null;
             }
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Rgba64.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Rgba64.cs
index 9add3d7180..4cfa0bf974 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Rgba64.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Rgba64.cs
@@ -162,7 +162,7 @@ public ulong PackedValue
         /// <param name="source">The <see cref="Rgba64"/>.</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static implicit operator Color(Rgba64 source) => new Color(source);
+        public static implicit operator Color(Rgba64 source) => new(source);
 
         /// <summary>
         /// Converts a <see cref="Color"/> to <see cref="Rgba64"/>.
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/RgbaVector.cs b/src/ImageSharp/PixelFormats/PixelImplementations/RgbaVector.cs
index 97e103d0f2..cd6f53c4ed 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/RgbaVector.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/RgbaVector.cs
@@ -43,8 +43,8 @@ public partial struct RgbaVector : IPixel<RgbaVector>
         public float A;
 
         private const float MaxBytes = byte.MaxValue;
-        private static readonly Vector4 Max = new Vector4(MaxBytes);
-        private static readonly Vector4 Half = new Vector4(0.5F);
+        private static readonly Vector4 Max = new(MaxBytes);
+        private static readonly Vector4 Half = new(0.5F);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="RgbaVector"/> struct.
@@ -120,7 +120,7 @@ public void FromVector4(Vector4 vector)
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4() => new Vector4(this.R, this.G, this.B, this.A);
+        public readonly Vector4 ToVector4() => new(this.R, this.G, this.B, this.A);
 
         /// <inheritdoc/>
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -183,7 +183,7 @@ public readonly string ToHex()
             // Hex is RRGGBBAA
             Vector4 vector = this.ToVector4() * Max;
             vector += Half;
-            uint hexOrder = (uint)((byte)vector.W | (byte)vector.Z << 8 | (byte)vector.Y << 16 | (byte)vector.X << 24);
+            uint hexOrder = (uint)((byte)vector.W | ((byte)vector.Z << 8) | ((byte)vector.Y << 16) | ((byte)vector.X << 24));
             return hexOrder.ToString("X8");
         }
 
@@ -199,10 +199,7 @@ public readonly bool Equals(RgbaVector other) =>
             && this.A.Equals(other.A);
 
         /// <inheritdoc/>
-        public override readonly string ToString()
-        {
-            return FormattableString.Invariant($"RgbaVector({this.R:#0.##}, {this.G:#0.##}, {this.B:#0.##}, {this.A:#0.##})");
-        }
+        public override readonly string ToString() => FormattableString.Invariant($"RgbaVector({this.R:#0.##}, {this.G:#0.##}, {this.B:#0.##}, {this.A:#0.##})");
 
         /// <inheritdoc/>
         public override readonly int GetHashCode() => HashCode.Combine(this.R, this.G, this.B, this.A);
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Short2.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Short2.cs
index 101027a78e..24f6b4d1d4 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Short2.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Short2.cs
@@ -21,8 +21,8 @@ public partial struct Short2 : IPixel<Short2>, IPackedVector<uint>
         // Two's complement
         private const float MinNeg = ~(int)MaxPos;
 
-        private static readonly Vector2 Max = new Vector2(MaxPos);
-        private static readonly Vector2 Min = new Vector2(MinNeg);
+        private static readonly Vector2 Max = new(MaxPos);
+        private static readonly Vector2 Min = new(MinNeg);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Short2"/> struct.
@@ -97,7 +97,7 @@ public void FromVector4(Vector4 vector)
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector4 ToVector4() => new Vector4((short)(this.PackedValue & 0xFFFF), (short)(this.PackedValue >> 0x10), 0, 1);
+        public readonly Vector4 ToVector4() => new((short)(this.PackedValue & 0xFFFF), (short)(this.PackedValue >> 0x10), 0, 1);
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -157,7 +157,7 @@ public void FromVector4(Vector4 vector)
         /// </summary>
         /// <returns>The <see cref="Vector2"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public readonly Vector2 ToVector2() => new Vector2((short)(this.PackedValue & 0xFFFF), (short)(this.PackedValue >> 0x10));
+        public readonly Vector2 ToVector2() => new((short)(this.PackedValue & 0xFFFF), (short)(this.PackedValue >> 0x10));
 
         /// <inheritdoc />
         public override readonly bool Equals(object obj) => obj is Short2 other && this.Equals(other);
diff --git a/tests/ImageSharp.Tests/PixelFormats/Bgr24Tests.cs b/tests/ImageSharp.Tests/PixelFormats/Bgr24Tests.cs
index f6a6d44bb4..36cdd157d9 100644
--- a/tests/ImageSharp.Tests/PixelFormats/Bgr24Tests.cs
+++ b/tests/ImageSharp.Tests/PixelFormats/Bgr24Tests.cs
@@ -28,8 +28,7 @@ public void AreNotEqual()
             Assert.NotEqual(color1, color2);
         }
 
-        public static readonly TheoryData<byte, byte, byte> ColorData =
-            new TheoryData<byte, byte, byte> { { 1, 2, 3 }, { 4, 5, 6 }, { 0, 255, 42 } };
+        public static readonly TheoryData<byte, byte, byte> ColorData = new() { { 1, 2, 3 }, { 4, 5, 6 }, { 0, 255, 42 } };
 
         [Theory]
         [MemberData(nameof(ColorData))]
diff --git a/tests/ImageSharp.Tests/PixelFormats/Bgra32Tests.cs b/tests/ImageSharp.Tests/PixelFormats/Bgra32Tests.cs
index b7fbdde714..4b8f4c2eaf 100644
--- a/tests/ImageSharp.Tests/PixelFormats/Bgra32Tests.cs
+++ b/tests/ImageSharp.Tests/PixelFormats/Bgra32Tests.cs
@@ -35,10 +35,13 @@ public void AreNotEqual()
         }
 
         public static readonly TheoryData<byte, byte, byte, byte> ColorData =
-            new TheoryData<byte, byte, byte, byte>
-                {
-                    { 1, 2, 3, 4 }, { 4, 5, 6, 7 }, { 0, 255, 42, 0 }, { 1, 2, 3, 255 }
-                };
+            new()
+            {
+                { 1, 2, 3, 4 },
+                { 4, 5, 6, 7 },
+                { 0, 255, 42, 0 },
+                { 1, 2, 3, 255 }
+            };
 
         [Theory]
         [MemberData(nameof(ColorData))]
diff --git a/tests/ImageSharp.Tests/PixelFormats/L8Tests.cs b/tests/ImageSharp.Tests/PixelFormats/L8Tests.cs
index d877283c1d..fc91590d22 100644
--- a/tests/ImageSharp.Tests/PixelFormats/L8Tests.cs
+++ b/tests/ImageSharp.Tests/PixelFormats/L8Tests.cs
@@ -12,29 +12,7 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats
     public class L8Tests
     {
         public static readonly TheoryData<byte> LuminanceData
-            = new TheoryData<byte>
-            {
-                0,
-                1,
-                2,
-                3,
-                5,
-                13,
-                31,
-                71,
-                73,
-                79,
-                83,
-                109,
-                127,
-                128,
-                131,
-                199,
-                250,
-                251,
-                254,
-                255
-            };
+            = new() { 0, 1, 2, 3, 5, 13, 31, 71, 73, 79, 83, 109, 127, 128, 131, 199, 250, 251, 254, 255 };
 
         [Theory]
         [InlineData(0)]
diff --git a/tests/ImageSharp.Tests/PixelFormats/La16Tests.cs b/tests/ImageSharp.Tests/PixelFormats/La16Tests.cs
index 2c9a27028d..7e082147eb 100644
--- a/tests/ImageSharp.Tests/PixelFormats/La16Tests.cs
+++ b/tests/ImageSharp.Tests/PixelFormats/La16Tests.cs
@@ -12,29 +12,7 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats
     public class La16Tests
     {
         public static readonly TheoryData<byte> LuminanceData
-            = new TheoryData<byte>
-            {
-                0,
-                1,
-                2,
-                3,
-                5,
-                13,
-                31,
-                71,
-                73,
-                79,
-                83,
-                109,
-                127,
-                128,
-                131,
-                199,
-                250,
-                251,
-                254,
-                255
-            };
+            = new() { 0, 1, 2, 3, 5, 13, 31, 71, 73, 79, 83, 109, 127, 128, 131, 199, 250, 251, 254, 255 };
 
         [Theory]
         [InlineData(0, 0)]
diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelBlenderTests.cs b/tests/ImageSharp.Tests/PixelFormats/PixelBlenderTests.cs
index 7954f1aff1..5988cc851a 100644
--- a/tests/ImageSharp.Tests/PixelFormats/PixelBlenderTests.cs
+++ b/tests/ImageSharp.Tests/PixelFormats/PixelBlenderTests.cs
@@ -12,7 +12,7 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats
     [Trait("Category", "PixelFormats")]
     public class PixelBlenderTests
     {
-        public static TheoryData<object, Type, PixelColorBlendingMode> BlenderMappings = new TheoryData<object, Type, PixelColorBlendingMode>
+        public static TheoryData<object, Type, PixelColorBlendingMode> BlenderMappings = new()
         {
             { new TestPixel<Rgba32>(), typeof(DefaultPixelBlenders<Rgba32>.NormalSrcOver), PixelColorBlendingMode.Normal },
             { new TestPixel<Rgba32>(), typeof(DefaultPixelBlenders<Rgba32>.ScreenSrcOver), PixelColorBlendingMode.Screen },
@@ -43,7 +43,7 @@ public void ReturnsCorrectBlender<TPixel>(TestPixel<TPixel> pixel, Type type, Pi
             Assert.IsType(type, blender);
         }
 
-        public static TheoryData<Rgba32, Rgba32, float, PixelColorBlendingMode, Rgba32> ColorBlendingExpectedResults = new TheoryData<Rgba32, Rgba32, float, PixelColorBlendingMode, Rgba32>
+        public static TheoryData<Rgba32, Rgba32, float, PixelColorBlendingMode, Rgba32> ColorBlendingExpectedResults = new()
         {
             { Color.MistyRose, Color.MidnightBlue, 1, PixelColorBlendingMode.Normal, Color.MidnightBlue },
             { Color.MistyRose, Color.MidnightBlue, 1, PixelColorBlendingMode.Screen, new Rgba32(0xFFEEE7FF) },
@@ -67,7 +67,7 @@ public void TestColorBlendingModes(Rgba32 backdrop, Rgba32 source, float opacity
             Assert.Equal(actualResult.ToVector4(), expectedResult.ToVector4());
         }
 
-        public static TheoryData<Rgba32, Rgba32, float, PixelAlphaCompositionMode, Rgba32> AlphaCompositionExpectedResults = new TheoryData<Rgba32, Rgba32, float, PixelAlphaCompositionMode, Rgba32>
+        public static TheoryData<Rgba32, Rgba32, float, PixelAlphaCompositionMode, Rgba32> AlphaCompositionExpectedResults = new()
         {
             { Color.MistyRose, Color.MidnightBlue, 1, PixelAlphaCompositionMode.Clear, new Rgba32(0) },
             { Color.MistyRose, Color.MidnightBlue, 1, PixelAlphaCompositionMode.Xor, new Rgba32(0) },
diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelConverterTests.cs b/tests/ImageSharp.Tests/PixelFormats/PixelConverterTests.cs
index ec53629a80..315f9f7761 100644
--- a/tests/ImageSharp.Tests/PixelFormats/PixelConverterTests.cs
+++ b/tests/ImageSharp.Tests/PixelFormats/PixelConverterTests.cs
@@ -11,21 +11,21 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats
     public abstract partial class PixelConverterTests
     {
         public static readonly TheoryData<byte, byte, byte, byte> RgbaData =
-            new TheoryData<byte, byte, byte, byte>
-                {
-                    { 0, 0, 0, 0 },
-                    { 0, 0, 0, 255 },
-                    { 0, 0, 255, 0 },
-                    { 0, 255, 0, 0 },
-                    { 255, 0, 0, 0 },
-                    { 255, 255, 255, 255 },
-                    { 0, 0, 0, 1 },
-                    { 0, 0, 1, 0 },
-                    { 0, 1, 0, 0 },
-                    { 1, 0, 0, 0 },
-                    { 3, 5, 7, 11 },
-                    { 67, 71, 101, 109 }
-                };
+            new()
+            {
+                { 0, 0, 0, 0 },
+                { 0, 0, 0, 255 },
+                { 0, 0, 255, 0 },
+                { 0, 255, 0, 0 },
+                { 255, 0, 0, 0 },
+                { 255, 255, 255, 255 },
+                { 0, 0, 0, 1 },
+                { 0, 0, 1, 0 },
+                { 0, 1, 0, 0 },
+                { 1, 0, 0, 0 },
+                { 3, 5, 7, 11 },
+                { 67, 71, 101, 109 }
+            };
 
         public class FromRgba32 : PixelConverterTests
         {
@@ -34,7 +34,7 @@ public class FromRgba32 : PixelConverterTests
             public void ToArgb32(byte r, byte g, byte b, byte a)
             {
                 byte[] source = ReferenceImplementations.MakeRgba32ByteArray(r, g, b, a);
-                var actual = new byte[source.Length];
+                byte[] actual = new byte[source.Length];
 
                 PixelConverter.FromRgba32.ToArgb32(source, actual);
 
@@ -48,7 +48,7 @@ public void ToArgb32(byte r, byte g, byte b, byte a)
             public void ToBgra32(byte r, byte g, byte b, byte a)
             {
                 byte[] source = ReferenceImplementations.MakeRgba32ByteArray(r, g, b, a);
-                var actual = new byte[source.Length];
+                byte[] actual = new byte[source.Length];
 
                 PixelConverter.FromRgba32.ToBgra32(source, actual);
 
@@ -65,7 +65,7 @@ public class FromArgb32 : PixelConverterTests
             public void ToRgba32(byte r, byte g, byte b, byte a)
             {
                 byte[] source = ReferenceImplementations.MakeArgb32ByteArray(r, g, b, a);
-                var actual = new byte[source.Length];
+                byte[] actual = new byte[source.Length];
 
                 PixelConverter.FromArgb32.ToRgba32(source, actual);
 
@@ -79,7 +79,7 @@ public void ToRgba32(byte r, byte g, byte b, byte a)
             public void ToBgra32(byte r, byte g, byte b, byte a)
             {
                 byte[] source = ReferenceImplementations.MakeArgb32ByteArray(r, g, b, a);
-                var actual = new byte[source.Length];
+                byte[] actual = new byte[source.Length];
 
                 PixelConverter.FromArgb32.ToBgra32(source, actual);
 
@@ -96,7 +96,7 @@ public class FromBgra32 : PixelConverterTests
             public void ToArgb32(byte r, byte g, byte b, byte a)
             {
                 byte[] source = ReferenceImplementations.MakeBgra32ByteArray(r, g, b, a);
-                var actual = new byte[source.Length];
+                byte[] actual = new byte[source.Length];
 
                 PixelConverter.FromBgra32.ToArgb32(source, actual);
 
@@ -110,7 +110,7 @@ public void ToArgb32(byte r, byte g, byte b, byte a)
             public void ToRgba32(byte r, byte g, byte b, byte a)
             {
                 byte[] source = ReferenceImplementations.MakeBgra32ByteArray(r, g, b, a);
-                var actual = new byte[source.Length];
+                byte[] actual = new byte[source.Length];
 
                 PixelConverter.FromBgra32.ToRgba32(source, actual);
 
diff --git a/tests/ImageSharp.Tests/PixelFormats/Rgb24Tests.cs b/tests/ImageSharp.Tests/PixelFormats/Rgb24Tests.cs
index 4d4f8c9fb9..6c98e623fd 100644
--- a/tests/ImageSharp.Tests/PixelFormats/Rgb24Tests.cs
+++ b/tests/ImageSharp.Tests/PixelFormats/Rgb24Tests.cs
@@ -11,7 +11,7 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats
     public class Rgb24Tests
     {
         public static readonly TheoryData<byte, byte, byte> ColorData =
-            new TheoryData<byte, byte, byte>
+            new()
             {
                 { 1, 2, 3 },
                 { 4, 5, 6 },
@@ -76,7 +76,7 @@ public void FromRgba32()
             Assert.Equal(3, rgb.B);
         }
 
-        private static Vector4 Vec(byte r, byte g, byte b, byte a = 255) => new Vector4(
+        private static Vector4 Vec(byte r, byte g, byte b, byte a = 255) => new(
             r / 255f,
             g / 255f,
             b / 255f,
diff --git a/tests/ImageSharp.Tests/PixelFormats/UnPackedPixelTests.cs b/tests/ImageSharp.Tests/PixelFormats/UnPackedPixelTests.cs
index 9492fef90b..20484b073c 100644
--- a/tests/ImageSharp.Tests/PixelFormats/UnPackedPixelTests.cs
+++ b/tests/ImageSharp.Tests/PixelFormats/UnPackedPixelTests.cs
@@ -5,7 +5,7 @@
 using SixLabors.ImageSharp.PixelFormats;
 using Xunit;
 
-namespace SixLabors.ImageSharp.Tests.Colors
+namespace SixLabors.ImageSharp.Tests.PixelFormats
 {
     [Trait("Category", "PixelFormats")]
     public class UnPackedPixelTests

From e168ae6a2c8bb4774c871f1372bab4d7f8051b3d Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 26 Oct 2021 16:42:23 +0200
Subject: [PATCH 03/85] Use Span in GetHTreeGroupForPos to avoid allocations

---
 .../Formats/Webp/Lossless/WebpLosslessDecoder.cs       | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs b/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs
index 9604160091..768365e44e 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs
@@ -218,7 +218,7 @@ public void DecodeImageData(Vp8LDecoder decoder, Span<uint> pixelData)
             ColorCache colorCache = decoder.Metadata.ColorCache;
             int colorCacheLimit = lenCodeLimit + colorCacheSize;
             int mask = decoder.Metadata.HuffmanMask;
-            HTreeGroup[] hTreeGroup = GetHTreeGroupForPos(decoder.Metadata, col, row);
+            Span<HTreeGroup> hTreeGroup = GetHTreeGroupForPos(decoder.Metadata, col, row);
 
             int totalPixels = width * height;
             int decodedPixels = 0;
@@ -731,7 +731,7 @@ public void DecodeAlphaData(AlphaDecoder dec)
             int lastRow = height;
             const int lenCodeLimit = WebpConstants.NumLiteralCodes + WebpConstants.NumLengthCodes;
             int mask = hdr.HuffmanMask;
-            HTreeGroup[] htreeGroup = pos < last ? GetHTreeGroupForPos(hdr, col, row) : null;
+            Span<HTreeGroup> htreeGroup = pos < last ? GetHTreeGroupForPos(hdr, col, row) : null;
             while (!this.bitReader.Eos && pos < last)
             {
                 // Only update when changing tile.
@@ -815,7 +815,7 @@ private void UpdateDecoder(Vp8LDecoder decoder, int width, int height)
             decoder.Metadata.HuffmanMask = numBits == 0 ? ~0 : (1 << numBits) - 1;
         }
 
-        private uint ReadPackedSymbols(HTreeGroup[] group, Span<uint> pixelData, int decodedPixels)
+        private uint ReadPackedSymbols(Span<HTreeGroup> group, Span<uint> pixelData, int decodedPixels)
         {
             uint val = (uint)(this.bitReader.PrefetchBits() & (HuffmanUtils.HuffmanPackedTableSize - 1));
             HuffmanCode code = group[0].PackedTable[val];
@@ -895,10 +895,10 @@ private int GetCopyDistance(int distanceSymbol)
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        private static HTreeGroup[] GetHTreeGroupForPos(Vp8LMetadata metadata, int x, int y)
+        private static Span<HTreeGroup> GetHTreeGroupForPos(Vp8LMetadata metadata, int x, int y)
         {
             uint metaIndex = GetMetaIndex(metadata.HuffmanImage, metadata.HuffmanXSize, metadata.HuffmanSubSampleBits, x, y);
-            return metadata.HTreeGroups.AsSpan((int)metaIndex).ToArray();
+            return metadata.HTreeGroups.AsSpan((int)metaIndex);
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]

From b50f146fe2a163b4f4818745a55eec08992a8cd8 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 27 Oct 2021 13:17:45 -0700
Subject: [PATCH 04/85] Support running on arm64

---
 tests/ImageSharp.Benchmarks/Config.cs                    | 4 ++--
 tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Config.cs b/tests/ImageSharp.Benchmarks/Config.cs
index 9221bb7fda..2997848211 100644
--- a/tests/ImageSharp.Benchmarks/Config.cs
+++ b/tests/ImageSharp.Benchmarks/Config.cs
@@ -34,7 +34,7 @@ public class MultiFramework : Config
             public MultiFramework() => this.AddJob(
                     Job.Default.WithRuntime(ClrRuntime.Net472),
                     Job.Default.WithRuntime(CoreRuntime.Core31),
-                    Job.Default.WithRuntime(CoreRuntime.Core50));
+                    Job.Default.WithRuntime(CoreRuntime.Core50).With(new Argument[] { new MsBuildArgument("/p:DebugType=portable") }));
         }
 
         public class ShortMultiFramework : Config
@@ -42,7 +42,7 @@ public class ShortMultiFramework : Config
             public ShortMultiFramework() => this.AddJob(
                     Job.Default.WithRuntime(ClrRuntime.Net472).WithLaunchCount(1).WithWarmupCount(3).WithIterationCount(3),
                     Job.Default.WithRuntime(CoreRuntime.Core31).WithLaunchCount(1).WithWarmupCount(3).WithIterationCount(3),
-                    Job.Default.WithRuntime(CoreRuntime.Core50).WithLaunchCount(1).WithWarmupCount(3).WithIterationCount(3));
+                    Job.Default.WithRuntime(CoreRuntime.Core50).WithLaunchCount(1).WithWarmupCount(3).WithIterationCount(3).With(new Argument[] { new MsBuildArgument("/p:DebugType=portable") }));
         }
 
         public class ShortCore31 : Config
diff --git a/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj b/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj
index b9ab31972f..8f0b4a86f2 100644
--- a/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj
+++ b/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj
@@ -6,6 +6,7 @@
     <OutputType>Exe</OutputType>
     <RootNamespace>SixLabors.ImageSharp.Benchmarks</RootNamespace>
     <GenerateProgramFile>false</GenerateProgramFile>
+    <DebugType>portable</DebugType>
     <!--Used to hide test project from dotnet test-->
     <IsTestProject>false</IsTestProject>
     <Configurations>Debug;Release;Debug-InnerLoop;Release-InnerLoop</Configurations>

From 257ff1929e341e5b1af94d9adf557e5296ece957 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Fri, 29 Oct 2021 23:32:13 +1100
Subject: [PATCH 05/85] Use RgbaVector for color backing

---
 src/ImageSharp/Color/Color.Conversions.cs     | 87 ++++++++++++++++---
 src/ImageSharp/Color/Color.cs                 | 74 ++++++++--------
 .../Color/ColorTests.CastFrom.cs              | 17 +++-
 .../Color/ColorTests.ConstructFrom.cs         |  4 +-
 4 files changed, 125 insertions(+), 57 deletions(-)

diff --git a/src/ImageSharp/Color/Color.Conversions.cs b/src/ImageSharp/Color/Color.Conversions.cs
index 0455fd26a4..abcb54b807 100644
--- a/src/ImageSharp/Color/Color.Conversions.cs
+++ b/src/ImageSharp/Color/Color.Conversions.cs
@@ -17,56 +17,90 @@ public readonly partial struct Color
         /// </summary>
         /// <param name="pixel">The <see cref="Rgba64"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgba64 pixel) => this.data = pixel;
+        public Color(Rgba64 pixel)
+        {
+            RgbaVector vector = default;
+            vector.FromRgba64(pixel);
+            this.data = vector;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Rgba32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgba32 pixel) => this.data = new Rgba64(pixel);
+        public Color(Rgba32 pixel)
+        {
+            RgbaVector vector = default;
+            vector.FromRgba32(pixel);
+            this.data = vector;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Argb32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Argb32 pixel) => this.data = new Rgba64(pixel);
+        public Color(Argb32 pixel)
+        {
+            RgbaVector vector = default;
+            vector.FromArgb32(pixel);
+            this.data = vector;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Bgra32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Bgra32 pixel) => this.data = new Rgba64(pixel);
+        public Color(Bgra32 pixel)
+        {
+            RgbaVector vector = default;
+            vector.FromBgra32(pixel);
+            this.data = vector;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Rgb24"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgb24 pixel) => this.data = new Rgba64(pixel);
+        public Color(Rgb24 pixel)
+        {
+            RgbaVector vector = default;
+            vector.FromRgb24(pixel);
+            this.data = vector;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Bgr24"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Bgr24 pixel) => this.data = new Rgba64(pixel);
+        public Color(Bgr24 pixel)
+        {
+            RgbaVector vector = default;
+            vector.FromBgr24(pixel);
+            this.data = vector;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="vector">The <see cref="Vector4"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Vector4 vector) => this.data = new Rgba64(vector);
+        public Color(Vector4 vector)
+        {
+            vector = Numerics.Clamp(vector, Vector4.Zero, Vector4.One);
+            this.data = new RgbaVector(vector.X, vector.Y, vector.Z, vector.W);
+        }
 
         /// <summary>
         /// Converts a <see cref="Color"/> to <see cref="Vector4"/>.
         /// </summary>
         /// <param name="color">The <see cref="Color"/>.</param>
         /// <returns>The <see cref="Vector4"/>.</returns>
-        public static explicit operator Vector4(Color color) => color.data.ToVector4();
+        public static explicit operator Vector4(Color color) => color.data.ToScaledVector4();
 
         /// <summary>
         /// Converts an <see cref="Vector4"/> to <see cref="Color"/>.
@@ -74,22 +108,47 @@ public readonly partial struct Color
         /// <param name="source">The <see cref="Vector4"/>.</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static explicit operator Color(Vector4 source) => new Color(source);
+        public static explicit operator Color(Vector4 source) => new(source);
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Rgba32 ToRgba32() => this.data.ToRgba32();
+        internal Rgba32 ToRgba32()
+        {
+            Rgba32 result = default;
+            result.FromScaledVector4(this.data.ToScaledVector4());
+            return result;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Bgra32 ToBgra32() => this.data.ToBgra32();
+        internal Bgra32 ToBgra32()
+        {
+            Bgra32 result = default;
+            result.FromScaledVector4(this.data.ToScaledVector4());
+            return result;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Argb32 ToArgb32() => this.data.ToArgb32();
+        internal Argb32 ToArgb32()
+        {
+            Argb32 result = default;
+            result.FromScaledVector4(this.data.ToScaledVector4());
+            return result;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Rgb24 ToRgb24() => this.data.ToRgb24();
+        internal Rgb24 ToRgb24()
+        {
+            Rgb24 result = default;
+            result.FromScaledVector4(this.data.ToScaledVector4());
+            return result;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Bgr24 ToBgr24() => this.data.ToBgr24();
+        internal Bgr24 ToBgr24()
+        {
+            Bgr24 result = default;
+            result.FromScaledVector4(this.data.ToScaledVector4());
+            return result;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
         internal Vector4 ToVector4() => this.data.ToVector4();
diff --git a/src/ImageSharp/Color/Color.cs b/src/ImageSharp/Color/Color.cs
index d5eedc160b..9a4df4e629 100644
--- a/src/ImageSharp/Color/Color.cs
+++ b/src/ImageSharp/Color/Color.cs
@@ -20,26 +20,22 @@ namespace SixLabors.ImageSharp
     /// </remarks>
     public readonly partial struct Color : IEquatable<Color>
     {
-        private readonly Rgba64 data;
+        private readonly RgbaVector data;
 
         [MethodImpl(InliningOptions.ShortMethod)]
         private Color(byte r, byte g, byte b, byte a)
         {
-            this.data = new Rgba64(
-                ColorNumerics.UpscaleFrom8BitTo16Bit(r),
-                ColorNumerics.UpscaleFrom8BitTo16Bit(g),
-                ColorNumerics.UpscaleFrom8BitTo16Bit(b),
-                ColorNumerics.UpscaleFrom8BitTo16Bit(a));
+            RgbaVector vector = default;
+            vector.FromRgba32(new(r, g, b, a));
+            this.data = vector;
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
         private Color(byte r, byte g, byte b)
         {
-            this.data = new Rgba64(
-                ColorNumerics.UpscaleFrom8BitTo16Bit(r),
-                ColorNumerics.UpscaleFrom8BitTo16Bit(g),
-                ColorNumerics.UpscaleFrom8BitTo16Bit(b),
-                ushort.MaxValue);
+            RgbaVector vector = default;
+            vector.FromRgba32(new(r, g, b));
+            this.data = vector;
         }
 
         /// <summary>
@@ -52,10 +48,7 @@ private Color(byte r, byte g, byte b)
         /// otherwise, false.
         /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static bool operator ==(Color left, Color right)
-        {
-            return left.Equals(right);
-        }
+        public static bool operator ==(Color left, Color right) => left.Equals(right);
 
         /// <summary>
         /// Checks whether two <see cref="Color"/> structures are equal.
@@ -67,10 +60,7 @@ private Color(byte r, byte g, byte b)
         /// otherwise, false.
         /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static bool operator !=(Color left, Color right)
-        {
-            return !left.Equals(right);
-        }
+        public static bool operator !=(Color left, Color right) => !left.Equals(right);
 
         /// <summary>
         /// Creates a <see cref="Color"/> from RGBA bytes.
@@ -81,7 +71,7 @@ private Color(byte r, byte g, byte b)
         /// <param name="a">The alpha component (0-255).</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromRgba(byte r, byte g, byte b, byte a) => new Color(r, g, b, a);
+        public static Color FromRgba(byte r, byte g, byte b, byte a) => new(r, g, b, a);
 
         /// <summary>
         /// Creates a <see cref="Color"/> from RGB bytes.
@@ -91,7 +81,17 @@ private Color(byte r, byte g, byte b)
         /// <param name="b">The blue component (0-255).</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromRgb(byte r, byte g, byte b) => new Color(r, g, b);
+        public static Color FromRgb(byte r, byte g, byte b) => new(r, g, b);
+
+        /// <summary>
+        /// Creates a <see cref="Color"/> from the given <typeparamref name="TPixel"/>.
+        /// </summary>
+        /// <param name="pixel">The pixel to convert from.</param>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <returns>The <see cref="Color"/>.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static Color FromPixel<TPixel>(TPixel pixel)
+            where TPixel : unmanaged, IPixel<TPixel> => new(pixel.ToScaledVector4());
 
         /// <summary>
         /// Creates a new instance of the <see cref="Color"/> struct
@@ -207,13 +207,18 @@ public Color WithAlpha(float alpha)
         /// </summary>
         /// <returns>A hexadecimal string representation of the value.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public string ToHex() => this.data.ToRgba32().ToHex();
+        public string ToHex()
+        {
+            Rgba32 rgba = default;
+            this.data.ToRgba32(ref rgba);
+            return rgba.ToHex();
+        }
 
         /// <inheritdoc />
         public override string ToString() => this.ToHex();
 
         /// <summary>
-        /// Converts the color instance to a specified <see cref="IPixel{TSelf}"/> type.
+        /// Converts the color instance to a specified <typeparamref name="TPixel"/> type.
         /// </summary>
         /// <typeparam name="TPixel">The pixel type to convert to.</typeparam>
         /// <returns>The pixel value.</returns>
@@ -222,12 +227,12 @@ public TPixel ToPixel<TPixel>()
             where TPixel : unmanaged, IPixel<TPixel>
         {
             TPixel pixel = default;
-            pixel.FromRgba64(this.data);
+            pixel.FromScaledVector4(this.data.ToScaledVector4());
             return pixel;
         }
 
         /// <summary>
-        /// Bulk converts a span of <see cref="Color"/> to a span of a specified <see cref="IPixel{TSelf}"/> type.
+        /// Bulk converts a span of <see cref="Color"/> to a span of a specified <typeparamref name="TPixel"/> type.
         /// </summary>
         /// <typeparam name="TPixel">The pixel type to convert to.</typeparam>
         /// <param name="configuration">The configuration.</param>
@@ -240,28 +245,19 @@ public static void ToPixel<TPixel>(
             Span<TPixel> destination)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            ReadOnlySpan<Rgba64> rgba64Span = MemoryMarshal.Cast<Color, Rgba64>(source);
-            PixelOperations<TPixel>.Instance.FromRgba64(configuration, rgba64Span, destination);
+            ReadOnlySpan<RgbaVector> rgbaSpan = MemoryMarshal.Cast<Color, RgbaVector>(source);
+            PixelOperations<TPixel>.Instance.From(configuration, rgbaSpan, destination);
         }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public bool Equals(Color other)
-        {
-            return this.data.PackedValue == other.data.PackedValue;
-        }
+        public bool Equals(Color other) => this.data.Equals(other.data);
 
         /// <inheritdoc />
-        public override bool Equals(object obj)
-        {
-            return obj is Color other && this.Equals(other);
-        }
+        public override bool Equals(object obj) => obj is Color other && this.Equals(other);
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public override int GetHashCode()
-        {
-            return this.data.PackedValue.GetHashCode();
-        }
+        public override int GetHashCode() => this.data.GetHashCode();
     }
 }
diff --git a/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs b/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs
index 38b94f486c..356ef7351e 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs
@@ -66,7 +66,7 @@ public void Bgra32()
             [Fact]
             public void Rgb24()
             {
-                var source = new Rgb24(1, 22,  231);
+                var source = new Rgb24(1, 22, 231);
 
                 // Act:
                 Color color = source;
@@ -79,7 +79,7 @@ public void Rgb24()
             [Fact]
             public void Bgr24()
             {
-                var source = new Bgr24(1, 22,  231);
+                var source = new Bgr24(1, 22, 231);
 
                 // Act:
                 Color color = source;
@@ -88,6 +88,19 @@ public void Bgr24()
                 Bgr24 data = color.ToPixel<Bgr24>();
                 Assert.Equal(source, data);
             }
+
+            [Fact]
+            public void TPixel()
+            {
+                var source = new RgbaVector(1, .1F, .133F, .864F);
+
+                // Act:
+                var color = Color.FromPixel(source);
+
+                // Assert:
+                RgbaVector data = color.ToPixel<RgbaVector>();
+                Assert.Equal(source, data);
+            }
         }
     }
 }
diff --git a/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs b/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs
index 89276014b0..dd51f3a6c2 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs
@@ -66,7 +66,7 @@ public void Bgra32()
             [Fact]
             public void Rgb24()
             {
-                var source = new Rgb24(1, 22,  231);
+                var source = new Rgb24(1, 22, 231);
 
                 // Act:
                 var color = new Color(source);
@@ -79,7 +79,7 @@ public void Rgb24()
             [Fact]
             public void Bgr24()
             {
-                var source = new Bgr24(1, 22,  231);
+                var source = new Bgr24(1, 22, 231);
 
                 // Act:
                 var color = new Color(source);

From c68ef21613e237dc4220ecfe80347693527b192b Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 29 Oct 2021 17:29:56 +0200
Subject: [PATCH 06/85] Write exif profile with padding if needed

---
 .../Formats/Webp/BitWriter/BitWriterBase.cs   | 49 +++++++++++++++----
 .../Formats/Webp/BitWriter/Vp8BitWriter.cs    |  4 +-
 .../Formats/Webp/BitWriter/Vp8LBitWriter.cs   |  9 ++--
 .../Formats/Webp/WebpEncoderCore.cs           |  2 -
 .../Formats/WebP/WebpMetaDataTests.cs         | 25 ++++++++++
 5 files changed, 70 insertions(+), 19 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs b/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
index 41623f2878..31e636b6bc 100644
--- a/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
+++ b/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
@@ -10,11 +10,22 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
 {
     internal abstract class BitWriterBase
     {
+        private const uint MaxDimension = 16777215;
+
+        private const ulong MaxCanvasPixels = 4294967295ul;
+
+        protected const uint ExtendedFileChunkSize = WebpConstants.ChunkHeaderSize + WebpConstants.Vp8XChunkSize;
+
         /// <summary>
         /// Buffer to write to.
         /// </summary>
         private byte[] buffer;
 
+        /// <summary>
+        /// A scratch buffer to reduce allocations.
+        /// </summary>
+        private readonly byte[] scratchBuffer = new byte[4];
+
         /// <summary>
         /// Initializes a new instance of the <see cref="BitWriterBase"/> class.
         /// </summary>
@@ -81,13 +92,25 @@ protected void ResizeBuffer(int maxBytes, int sizeRequired)
         /// <param name="riffSize">The block length.</param>
         protected void WriteRiffHeader(Stream stream, uint riffSize)
         {
-            Span<byte> buf = stackalloc byte[4];
             stream.Write(WebpConstants.RiffFourCc);
-            BinaryPrimitives.WriteUInt32LittleEndian(buf, riffSize);
-            stream.Write(buf);
+            BinaryPrimitives.WriteUInt32LittleEndian(this.scratchBuffer, riffSize);
+            stream.Write(this.scratchBuffer.AsSpan(0, 4));
             stream.Write(WebpConstants.WebpHeader);
         }
 
+        /// <summary>
+        /// Calculates the size of the exif chunk: the chunk header plus the exif payload, plus one padding byte when the payload length is odd.
+        /// </summary>
+        /// <param name="exifBytes">The exif profile bytes.</param>
+        /// <returns>The exif chunk size in bytes, including the chunk header and any padding byte.</returns>
+        protected uint ExifChunkSize(byte[] exifBytes)
+        {
+            uint exifSize = (uint)exifBytes.Length;
+            uint exifChunkSize = WebpConstants.ChunkHeaderSize + exifSize + (exifSize & 1);
+
+            return exifChunkSize;
+        }
+
         /// <summary>
         /// Writes the Exif profile to the stream.
         /// </summary>
@@ -97,12 +120,19 @@ protected void WriteExifProfile(Stream stream, byte[] exifBytes)
         {
             DebugGuard.NotNull(exifBytes, nameof(exifBytes));
 
-            Span<byte> buf = stackalloc byte[4];
+            uint size = (uint)exifBytes.Length;
+            Span<byte> buf = this.scratchBuffer.AsSpan(0, 4);
             BinaryPrimitives.WriteUInt32BigEndian(buf, (uint)WebpChunkType.Exif);
             stream.Write(buf);
-            BinaryPrimitives.WriteUInt32LittleEndian(buf, (uint)exifBytes.Length);
+            BinaryPrimitives.WriteUInt32LittleEndian(buf, size);
             stream.Write(buf);
             stream.Write(exifBytes);
+
+            // An odd-length payload is followed by one zero byte, matching the size computed by ExifChunkSize.
+            if ((size & 1) == 1)
+            {
+                stream.WriteByte(0);
+            }
         }
 
         /// <summary>
@@ -114,14 +144,13 @@ protected void WriteExifProfile(Stream stream, byte[] exifBytes)
         /// <param name="height">The height of the image.</param>
         protected void WriteVp8XHeader(Stream stream, ExifProfile exifProfile, uint width, uint height)
         {
-            int maxDimension = 16777215;
-            if (width > maxDimension || height > maxDimension)
+            if (width > MaxDimension || height > MaxDimension)
             {
-                WebpThrowHelper.ThrowInvalidImageDimensions($"Image width or height exceeds maximum allowed dimension of {maxDimension}");
+                WebpThrowHelper.ThrowInvalidImageDimensions($"Image width or height exceeds maximum allowed dimension of {MaxDimension}");
             }
 
             // The spec states that the product of Canvas Width and Canvas Height MUST be at most 2^32 - 1.
-            if (width * height > 4294967295ul)
+            if ((ulong)width * height > MaxCanvasPixels)
             {
                 WebpThrowHelper.ThrowInvalidImageDimensions("The product of image width and height MUST be at most 2^32 - 1");
             }
@@ -133,7 +162,7 @@ protected void WriteVp8XHeader(Stream stream, ExifProfile exifProfile, uint widt
                 flags |= 8;
             }
 
-            Span<byte> buf = stackalloc byte[4];
+            Span<byte> buf = this.scratchBuffer.AsSpan(0, 4);
             stream.Write(WebpConstants.Vp8XMagicBytes);
             BinaryPrimitives.WriteUInt32LittleEndian(buf, WebpConstants.Vp8XChunkSize);
             stream.Write(buf);
diff --git a/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs b/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
index 7628247fd6..2c943f64f0 100644
--- a/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
+++ b/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
@@ -408,9 +408,9 @@ public override void WriteEncodedImageToStream(Stream stream, ExifProfile exifPr
             if (exifProfile != null)
             {
                 isVp8X = true;
-                riffSize += WebpConstants.ChunkHeaderSize + WebpConstants.Vp8XChunkSize;
+                riffSize += ExtendedFileChunkSize;
                 exifBytes = exifProfile.ToByteArray();
-                riffSize += WebpConstants.ChunkHeaderSize + (uint)exifBytes.Length;
+                riffSize += this.ExifChunkSize(exifBytes);
             }
 
             this.Finish();
diff --git a/src/ImageSharp/Formats/Webp/BitWriter/Vp8LBitWriter.cs b/src/ImageSharp/Formats/Webp/BitWriter/Vp8LBitWriter.cs
index 2f942231fb..2ce2f5550c 100644
--- a/src/ImageSharp/Formats/Webp/BitWriter/Vp8LBitWriter.cs
+++ b/src/ImageSharp/Formats/Webp/BitWriter/Vp8LBitWriter.cs
@@ -130,16 +130,15 @@ public override void Finish()
         /// <inheritdoc/>
         public override void WriteEncodedImageToStream(Stream stream, ExifProfile exifProfile, uint width, uint height)
         {
-            Span<byte> buffer = stackalloc byte[4];
             bool isVp8X = false;
             byte[] exifBytes = null;
             uint riffSize = 0;
             if (exifProfile != null)
             {
                 isVp8X = true;
-                riffSize += WebpConstants.ChunkHeaderSize + WebpConstants.Vp8XChunkSize;
+                riffSize += ExtendedFileChunkSize;
                 exifBytes = exifProfile.ToByteArray();
-                riffSize += WebpConstants.ChunkHeaderSize + (uint)exifBytes.Length;
+                riffSize += this.ExifChunkSize(exifBytes);
             }
 
             this.Finish();
@@ -161,8 +160,8 @@ public override void WriteEncodedImageToStream(Stream stream, ExifProfile exifPr
             stream.Write(WebpConstants.Vp8LMagicBytes);
 
             // Write Vp8 Header.
-            BinaryPrimitives.WriteUInt32LittleEndian(buffer, size);
-            stream.Write(buffer);
+            BinaryPrimitives.WriteUInt32LittleEndian(this.scratchBuffer, size);
+            stream.Write(this.scratchBuffer.AsSpan(0, 4));
             stream.WriteByte(WebpConstants.Vp8LHeaderMagicByte);
 
             // Write the encoded bytes of the image to the stream.
diff --git a/src/ImageSharp/Formats/Webp/WebpEncoderCore.cs b/src/ImageSharp/Formats/Webp/WebpEncoderCore.cs
index a61fc72530..8640261b17 100644
--- a/src/ImageSharp/Formats/Webp/WebpEncoderCore.cs
+++ b/src/ImageSharp/Formats/Webp/WebpEncoderCore.cs
@@ -4,11 +4,9 @@
 using System.IO;
 using System.Threading;
 using SixLabors.ImageSharp.Advanced;
-using SixLabors.ImageSharp.Formats.Bmp;
 using SixLabors.ImageSharp.Formats.Webp.Lossless;
 using SixLabors.ImageSharp.Formats.Webp.Lossy;
 using SixLabors.ImageSharp.Memory;
-using SixLabors.ImageSharp.Metadata;
 using SixLabors.ImageSharp.PixelFormats;
 
 namespace SixLabors.ImageSharp.Formats.Webp
diff --git a/tests/ImageSharp.Tests/Formats/WebP/WebpMetaDataTests.cs b/tests/ImageSharp.Tests/Formats/WebP/WebpMetaDataTests.cs
index 81067a41f5..a051de1c01 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/WebpMetaDataTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/WebpMetaDataTests.cs
@@ -63,6 +63,31 @@ public void IgnoreMetadata_ControlsWhetherIccpIsParsed<TPixel>(TestImageProvider
             }
         }
 
+        [Theory]
+        [InlineData(WebpFileFormatType.Lossy)]
+        [InlineData(WebpFileFormatType.Lossless)]
+        public void Encode_WritesExifWithPadding(WebpFileFormatType fileFormatType)
+        {
+            // arrange
+            using var input = new Image<Rgba32>(25, 25);
+            using var memoryStream = new MemoryStream();
+            var expectedExif = new ExifProfile();
+            string expectedSoftware = "ImageSharp";
+            expectedExif.SetValue(ExifTag.Software, expectedSoftware);
+            input.Metadata.ExifProfile = expectedExif;
+
+            // act
+            input.Save(memoryStream, new WebpEncoder() { FileFormat = fileFormatType });
+            memoryStream.Position = 0;
+
+            // assert
+            using var image = Image.Load<Rgba32>(memoryStream);
+            ExifProfile actualExif = image.Metadata.ExifProfile;
+            Assert.NotNull(actualExif);
+            Assert.Equal(expectedExif.Values.Count, actualExif.Values.Count);
+            Assert.Equal(expectedSoftware, actualExif.GetValue(ExifTag.Software).Value);
+        }
+
         [Theory]
         [WithFile(TestImages.Webp.Lossy.WithExif, PixelTypes.Rgba32)]
         public void EncodeLossyWebp_PreservesExif<TPixel>(TestImageProvider<TPixel> provider)

From 7f3c8ffbd0ed8c41e801a361113ee05c40d3c38c Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 29 Oct 2021 19:45:46 +0200
Subject: [PATCH 07/85] Make sure the alpha flag in VP8X and VP8L are the same

---
 .../Formats/Webp/BitWriter/BitWriterBase.cs    | 18 ++++++++----------
 .../Formats/Webp/BitWriter/Vp8BitWriter.cs     | 17 ++++++++++++-----
 .../Formats/Webp/BitWriter/Vp8LBitWriter.cs    | 13 ++++++++++---
 .../Formats/Webp/Lossless/Vp8LEncoder.cs       |  2 +-
 .../Formats/Webp/Lossy/Vp8Encoder.cs           |  4 +++-
 5 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs b/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
index 31e636b6bc..9208881360 100644
--- a/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
+++ b/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
@@ -63,15 +63,6 @@ internal abstract class BitWriterBase
         /// </summary>
         public abstract void Finish();
 
-        /// <summary>
-        /// Writes the encoded image to the stream.
-        /// </summary>
-        /// <param name="stream">The stream to write to.</param>
-        /// <param name="exifProfile">The exif profile.</param>
-        /// <param name="width">The width of the image.</param>
-        /// <param name="height">The height of the image.</param>
-        public abstract void WriteEncodedImageToStream(Stream stream, ExifProfile exifProfile, uint width, uint height);
-
         protected void ResizeBuffer(int maxBytes, int sizeRequired)
         {
             int newSize = (3 * maxBytes) >> 1;
@@ -142,7 +133,8 @@ protected void WriteExifProfile(Stream stream, byte[] exifBytes)
         /// <param name="exifProfile">A exif profile or null, if it does not exist.</param>
         /// <param name="width">The width of the image.</param>
         /// <param name="height">The height of the image.</param>
-        protected void WriteVp8XHeader(Stream stream, ExifProfile exifProfile, uint width, uint height)
+        /// <param name="hasAlpha">Flag indicating whether an alpha channel is present.</param>
+        protected void WriteVp8XHeader(Stream stream, ExifProfile exifProfile, uint width, uint height, bool hasAlpha)
         {
             if (width > MaxDimension || height > MaxDimension)
             {
@@ -162,6 +154,12 @@ protected void WriteVp8XHeader(Stream stream, ExifProfile exifProfile, uint widt
                 flags |= 8;
             }
 
+            if (hasAlpha)
+            {
+                // Set alpha bit.
+                flags |= 16;
+            }
+
             Span<byte> buf = this.scratchBuffer.AsSpan(0, 4);
             stream.Write(WebpConstants.Vp8XMagicBytes);
             BinaryPrimitives.WriteUInt32LittleEndian(buf, WebpConstants.Vp8XChunkSize);
diff --git a/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs b/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
index 2c943f64f0..3b2f943db5 100644
--- a/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
+++ b/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
@@ -399,8 +399,15 @@ private void Flush()
             }
         }
 
-        /// <inheritdoc/>
-        public override void WriteEncodedImageToStream(Stream stream, ExifProfile exifProfile, uint width, uint height)
+        /// <summary>
+        /// Writes the encoded image to the stream.
+        /// </summary>
+        /// <param name="stream">The stream to write to.</param>
+        /// <param name="exifProfile">The exif profile.</param>
+        /// <param name="width">The width of the image.</param>
+        /// <param name="height">The height of the image.</param>
+        /// <param name="hasAlpha">Flag indicating whether an alpha channel is present.</param>
+        public void WriteEncodedImageToStream(Stream stream, ExifProfile exifProfile, uint width, uint height, bool hasAlpha)
         {
             bool isVp8X = false;
             byte[] exifBytes = null;
@@ -433,7 +440,7 @@ public override void WriteEncodedImageToStream(Stream stream, ExifProfile exifPr
             riffSize += WebpConstants.TagSize + WebpConstants.ChunkHeaderSize + vp8Size;
 
             // Emit headers and partition #0
-            this.WriteWebpHeaders(stream, size0, vp8Size, riffSize, isVp8X, width, height, exifProfile);
+            this.WriteWebpHeaders(stream, size0, vp8Size, riffSize, isVp8X, width, height, exifProfile, hasAlpha);
             bitWriterPartZero.WriteToStream(stream);
 
             // Write the encoded image to the stream.
@@ -616,14 +623,14 @@ private void CodeIntraModes(Vp8BitWriter bitWriter)
             while (it.Next());
         }
 
-        private void WriteWebpHeaders(Stream stream, uint size0, uint vp8Size, uint riffSize, bool isVp8X, uint width, uint height, ExifProfile exifProfile)
+        private void WriteWebpHeaders(Stream stream, uint size0, uint vp8Size, uint riffSize, bool isVp8X, uint width, uint height, ExifProfile exifProfile, bool hasAlpha)
         {
             this.WriteRiffHeader(stream, riffSize);
 
             // Write VP8X, header if necessary.
             if (isVp8X)
             {
-                this.WriteVp8XHeader(stream, exifProfile, width, height);
+                this.WriteVp8XHeader(stream, exifProfile, width, height, hasAlpha);
             }
 
             this.WriteVp8Header(stream, vp8Size);
diff --git a/src/ImageSharp/Formats/Webp/BitWriter/Vp8LBitWriter.cs b/src/ImageSharp/Formats/Webp/BitWriter/Vp8LBitWriter.cs
index 2ce2f5550c..b83865aa36 100644
--- a/src/ImageSharp/Formats/Webp/BitWriter/Vp8LBitWriter.cs
+++ b/src/ImageSharp/Formats/Webp/BitWriter/Vp8LBitWriter.cs
@@ -127,8 +127,15 @@ public override void Finish()
             this.used = 0;
         }
 
-        /// <inheritdoc/>
-        public override void WriteEncodedImageToStream(Stream stream, ExifProfile exifProfile, uint width, uint height)
+        /// <summary>
+        /// Writes the encoded image to the stream.
+        /// </summary>
+        /// <param name="stream">The stream to write to.</param>
+        /// <param name="exifProfile">The exif profile.</param>
+        /// <param name="width">The width of the image.</param>
+        /// <param name="height">The height of the image.</param>
+        /// <param name="hasAlpha">Flag indicating whether an alpha channel is present.</param>
+        public void WriteEncodedImageToStream(Stream stream, ExifProfile exifProfile, uint width, uint height, bool hasAlpha)
         {
             bool isVp8X = false;
             byte[] exifBytes = null;
@@ -153,7 +160,7 @@ public override void WriteEncodedImageToStream(Stream stream, ExifProfile exifPr
             // Write VP8X, header if necessary.
             if (isVp8X)
             {
-                this.WriteVp8XHeader(stream, exifProfile, width, height);
+                this.WriteVp8XHeader(stream, exifProfile, width, height, hasAlpha);
             }
 
             // Write magic bytes indicating its a lossless webp.
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
index 693585637c..2fb3fbc6aa 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
@@ -234,7 +234,7 @@ public void Encode<TPixel>(Image<TPixel> image, Stream stream)
             this.EncodeStream(image);
 
             // Write bytes from the bitwriter buffer to the stream.
-            this.bitWriter.WriteEncodedImageToStream(stream, image.Metadata.ExifProfile, (uint)width, (uint)height);
+            this.bitWriter.WriteEncodedImageToStream(stream, image.Metadata.ExifProfile, (uint)width, (uint)height, hasAlpha);
         }
 
         /// <summary>
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
index 37808d56c2..d41da790b3 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
@@ -317,6 +317,8 @@ public void Encode<TPixel>(Image<TPixel> image, Stream stream)
             this.bitWriter = new Vp8BitWriter(expectedSize, this);
 
             // TODO: EncodeAlpha();
+            bool hasAlpha = false;
+
             // Stats-collection loop.
             this.StatLoop(width, height, yStride, uvStride);
             it.Init();
@@ -348,7 +350,7 @@ public void Encode<TPixel>(Image<TPixel> image, Stream stream)
 
             // Write bytes from the bitwriter buffer to the stream.
             image.Metadata.SyncProfiles();
-            this.bitWriter.WriteEncodedImageToStream(stream, image.Metadata.ExifProfile, (uint)width, (uint)height);
+            this.bitWriter.WriteEncodedImageToStream(stream, image.Metadata.ExifProfile, (uint)width, (uint)height, hasAlpha);
         }
 
         /// <inheritdoc/>

From 70c99d3d02369d4584d18e64393e239a5f86e30b Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 31 Oct 2021 13:17:32 +0100
Subject: [PATCH 08/85] Reduce allocations

---
 .../Webp/Lossless/BackwardReferenceEncoder.cs |  10 +-
 .../Formats/Webp/Lossless/HistogramEncoder.cs |  41 ++++--
 .../Formats/Webp/Lossless/HuffmanTree.cs      |   9 +-
 .../Formats/Webp/Lossless/LosslessUtils.cs    |   2 +-
 .../Formats/Webp/Lossless/PixOrCopy.cs        |   6 +-
 .../Formats/Webp/Lossless/PredictorEncoder.cs | 123 +++++++++++++-----
 .../Formats/Webp/Lossless/Vp8LEncoder.cs      |  29 ++++-
 .../Formats/Webp/Lossless/Vp8LHistogram.cs    |  57 ++++----
 .../Formats/Webp/Lossless/Vp8LStreaks.cs      |   9 ++
 .../Webp/Lossless/WebpLosslessDecoder.cs      |   3 +-
 .../Formats/Webp/Lossy/LossyUtils.cs          |  51 ++++----
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs |  86 +++++++-----
 .../Formats/Webp/Lossy/Vp8EncIterator.cs      |  27 ++--
 .../Formats/Webp/Lossy/Vp8Encoder.cs          |  18 ++-
 .../Formats/Webp/Lossy/Vp8Encoding.cs         |  54 ++++----
 .../Formats/Webp/Lossy/Vp8Histogram.cs        |  23 ++--
 .../Formats/Webp/Lossy/Vp8ModeScore.cs        |  18 +++
 .../Formats/Webp/Lossy/Vp8Residual.cs         |   5 +-
 .../Formats/Webp/Lossy/WebpLossyDecoder.cs    |  30 +++--
 .../Formats/WebP/PredictorEncoderTests.cs     |   6 +-
 20 files changed, 390 insertions(+), 217 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs
index 70c4efb990..dc546f8ac2 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs
@@ -49,6 +49,8 @@ public static Vp8LBackwardRefs GetBackwardReferences(
             double bitCostBest = -1;
             int cacheBitsInitial = cacheBits;
             Vp8LHashChain hashChainBox = null;
+            var stats = new Vp8LStreaks();
+            var bitsEntropy = new Vp8LBitEntropy();
             for (int lz77Type = 1; lz77TypesToTry > 0; lz77TypesToTry &= ~lz77Type, lz77Type <<= 1)
             {
                 int cacheBitsTmp = cacheBitsInitial;
@@ -81,7 +83,7 @@ public static Vp8LBackwardRefs GetBackwardReferences(
 
                 // Keep the best backward references.
                 var histo = new Vp8LHistogram(worst, cacheBitsTmp);
-                double bitCost = histo.EstimateBits();
+                double bitCost = histo.EstimateBits(stats, bitsEntropy);
 
                 if (lz77TypeBest == 0 || bitCost < bitCostBest)
                 {
@@ -100,7 +102,7 @@ public static Vp8LBackwardRefs GetBackwardReferences(
                 Vp8LHashChain hashChainTmp = lz77TypeBest == (int)Vp8LLz77Type.Lz77Standard ? hashChain : hashChainBox;
                 BackwardReferencesTraceBackwards(width, height, bgra, cacheBits, hashChainTmp, best, worst);
                 var histo = new Vp8LHistogram(worst, cacheBits);
-                double bitCostTrace = histo.EstimateBits();
+                double bitCostTrace = histo.EstimateBits(stats, bitsEntropy);
                 if (bitCostTrace < bitCostBest)
                 {
                     best = worst;
@@ -214,9 +216,11 @@ private static int CalculateBestCacheSize(ReadOnlySpan<uint> bgra, int quality,
                 }
             }
 
+            var stats = new Vp8LStreaks();
+            var bitsEntropy = new Vp8LBitEntropy();
             for (int i = 0; i <= cacheBitsMax; i++)
             {
-                double entropy = histos[i].EstimateBits();
+                double entropy = histos[i].EstimateBits(stats, bitsEntropy);
                 if (i == 0 || entropy < entropyMin)
                 {
                     entropyMin = entropy;
diff --git a/src/ImageSharp/Formats/Webp/Lossless/HistogramEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/HistogramEncoder.cs
index f2d4fb189f..5d407d73c1 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/HistogramEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/HistogramEncoder.cs
@@ -152,10 +152,12 @@ private static void HistogramAnalyzeEntropyBin(List<Vp8LHistogram> histograms, u
 
         private static int HistogramCopyAndAnalyze(List<Vp8LHistogram> origHistograms, List<Vp8LHistogram> histograms, ushort[] histogramSymbols)
         {
+            var stats = new Vp8LStreaks();
+            var bitsEntropy = new Vp8LBitEntropy();
             for (int clusterId = 0, i = 0; i < origHistograms.Count; i++)
             {
                 Vp8LHistogram origHistogram = origHistograms[i];
-                origHistogram.UpdateHistogramCost();
+                origHistogram.UpdateHistogramCost(stats, bitsEntropy);
 
                 // Skip the histogram if it is completely empty, which can happen for tiles with no information (when they are skipped because of LZ77).
                 if (!origHistogram.IsUsed[0] && !origHistogram.IsUsed[1] && !origHistogram.IsUsed[2] && !origHistogram.IsUsed[3] && !origHistogram.IsUsed[4])
@@ -175,7 +177,14 @@ private static int HistogramCopyAndAnalyze(List<Vp8LHistogram> origHistograms, L
             return numUsed;
         }
 
-        private static void HistogramCombineEntropyBin(List<Vp8LHistogram> histograms, ushort[] clusters, ushort[] clusterMappings, Vp8LHistogram curCombo, ushort[] binMap, int numBins, double combineCostFactor)
+        private static void HistogramCombineEntropyBin(
+            List<Vp8LHistogram> histograms,
+            ushort[] clusters,
+            ushort[] clusterMappings,
+            Vp8LHistogram curCombo,
+            ushort[] binMap,
+            int numBins,
+            double combineCostFactor)
         {
             var binInfo = new HistogramBinInfo[BinSize];
             for (int idx = 0; idx < numBins; idx++)
@@ -191,6 +200,8 @@ private static void HistogramCombineEntropyBin(List<Vp8LHistogram> histograms, u
             }
 
             var indicesToRemove = new List<int>();
+            var stats = new Vp8LStreaks();
+            var bitsEntropy = new Vp8LBitEntropy();
             for (int idx = 0; idx < histograms.Count; idx++)
             {
                 if (histograms[idx] == null)
@@ -209,7 +220,7 @@ private static void HistogramCombineEntropyBin(List<Vp8LHistogram> histograms, u
                     // Try to merge #idx into #first (both share the same binId)
                     double bitCost = histograms[idx].BitCost;
                     double bitCostThresh = -bitCost * combineCostFactor;
-                    double currCostDiff = histograms[first].AddEval(histograms[idx], bitCostThresh, curCombo);
+                    double currCostDiff = histograms[first].AddEval(histograms[idx], stats, bitsEntropy, bitCostThresh, curCombo);
 
                     if (currCostDiff < bitCostThresh)
                     {
@@ -308,6 +319,8 @@ private static bool HistogramCombineStochastic(List<Vp8LHistogram> histograms, i
             int numUsed = histograms.Count(h => h != null);
             int outerIters = numUsed;
             int numTriesNoSuccess = outerIters / 2;
+            var stats = new Vp8LStreaks();
+            var bitsEntropy = new Vp8LBitEntropy();
 
             if (numUsed < minClusterSize)
             {
@@ -354,7 +367,7 @@ private static bool HistogramCombineStochastic(List<Vp8LHistogram> histograms, i
                     idx2 = mappings[idx2];
 
                     // Calculate cost reduction on combination.
-                    double currCost = HistoPriorityListPush(histoPriorityList, maxSize, histograms, idx1, idx2, bestCost);
+                    double currCost = HistoPriorityListPush(histoPriorityList, maxSize, histograms, idx1, idx2, bestCost, stats, bitsEntropy);
 
                     // Found a better pair?
                     if (currCost < 0)
@@ -428,7 +441,7 @@ private static bool HistogramCombineStochastic(List<Vp8LHistogram> histograms, i
                     if (doEval)
                     {
                         // Re-evaluate the cost of an updated pair.
-                        HistoListUpdatePair(histograms[p.Idx1], histograms[p.Idx2], 0.0d, p);
+                        HistoListUpdatePair(histograms[p.Idx1], histograms[p.Idx2], stats, bitsEntropy, 0.0d, p);
                         if (p.CostDiff >= 0.0d)
                         {
                             histoPriorityList[j] = histoPriorityList[histoPriorityList.Count - 1];
@@ -456,6 +469,8 @@ private static void HistogramCombineGreedy(List<Vp8LHistogram> histograms)
             // Priority list of histogram pairs.
             var histoPriorityList = new List<HistogramPair>();
             int maxSize = histoSize * histoSize;
+            var stats = new Vp8LStreaks();
+            var bitsEntropy = new Vp8LBitEntropy();
 
             for (int i = 0; i < histoSize; i++)
             {
@@ -471,7 +486,7 @@ private static void HistogramCombineGreedy(List<Vp8LHistogram> histograms)
                         continue;
                     }
 
-                    HistoPriorityListPush(histoPriorityList, maxSize, histograms, i, j, 0.0d);
+                    HistoPriorityListPush(histoPriorityList, maxSize, histograms, i, j, 0.0d, stats, bitsEntropy);
                 }
             }
 
@@ -510,7 +525,7 @@ private static void HistogramCombineGreedy(List<Vp8LHistogram> histograms)
                         continue;
                     }
 
-                    HistoPriorityListPush(histoPriorityList, maxSize, histograms, idx1, i, 0.0d);
+                    HistoPriorityListPush(histoPriorityList, maxSize, histograms, idx1, i, 0.0d, stats, bitsEntropy);
                 }
             }
         }
@@ -519,6 +534,8 @@ private static void HistogramRemap(List<Vp8LHistogram> input, List<Vp8LHistogram
         {
             int inSize = input.Count;
             int outSize = output.Count;
+            var stats = new Vp8LStreaks();
+            var bitsEntropy = new Vp8LBitEntropy();
             if (outSize > 1)
             {
                 for (int i = 0; i < inSize; i++)
@@ -534,7 +551,7 @@ private static void HistogramRemap(List<Vp8LHistogram> input, List<Vp8LHistogram
                     double bestBits = double.MaxValue;
                     for (int k = 0; k < outSize; k++)
                     {
-                        double curBits = output[k].AddThresh(input[i], bestBits);
+                        double curBits = output[k].AddThresh(input[i], stats, bitsEntropy, bestBits);
                         if (k == 0 || curBits < bestBits)
                         {
                             bestBits = curBits;
@@ -577,7 +594,7 @@ private static void HistogramRemap(List<Vp8LHistogram> input, List<Vp8LHistogram
         /// Create a pair from indices "idx1" and "idx2" provided its cost is inferior to "threshold", a negative entropy.
         /// </summary>
         /// <returns>The cost of the pair, or 0 if it superior to threshold.</returns>
-        private static double HistoPriorityListPush(List<HistogramPair> histoList, int maxSize, List<Vp8LHistogram> histograms, int idx1, int idx2, double threshold)
+        private static double HistoPriorityListPush(List<HistogramPair> histoList, int maxSize, List<Vp8LHistogram> histograms, int idx1, int idx2, double threshold, Vp8LStreaks stats, Vp8LBitEntropy bitsEntropy)
         {
             var pair = new HistogramPair();
 
@@ -598,7 +615,7 @@ private static double HistoPriorityListPush(List<HistogramPair> histoList, int m
             Vp8LHistogram h1 = histograms[idx1];
             Vp8LHistogram h2 = histograms[idx2];
 
-            HistoListUpdatePair(h1, h2, threshold, pair);
+            HistoListUpdatePair(h1, h2, stats, bitsEntropy, threshold, pair);
 
             // Do not even consider the pair if it does not improve the entropy.
             if (pair.CostDiff >= threshold)
@@ -616,11 +633,11 @@ private static double HistoPriorityListPush(List<HistogramPair> histoList, int m
         /// <summary>
         /// Update the cost diff and combo of a pair of histograms. This needs to be called when the the histograms have been merged with a third one.
         /// </summary>
-        private static void HistoListUpdatePair(Vp8LHistogram h1, Vp8LHistogram h2, double threshold, HistogramPair pair)
+        private static void HistoListUpdatePair(Vp8LHistogram h1, Vp8LHistogram h2, Vp8LStreaks stats, Vp8LBitEntropy bitsEntropy, double threshold, HistogramPair pair)
         {
             double sumCost = h1.BitCost + h2.BitCost;
             pair.CostCombo = 0.0d;
-            h1.GetCombinedHistogramEntropy(h2, sumCost + threshold, costInitial: pair.CostCombo, out double cost);
+            h1.GetCombinedHistogramEntropy(h2, stats, bitsEntropy, sumCost + threshold, costInitial: pair.CostCombo, out double cost);
             pair.CostCombo = cost;
             pair.CostDiff = pair.CostCombo - sumCost;
         }
diff --git a/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs b/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs
index cd8be9aac3..0376311ed9 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs
@@ -49,14 +49,13 @@ public static int Compare(HuffmanTree t1, HuffmanTree t2)
             {
                 return -1;
             }
-            else if (t1.TotalCount < t2.TotalCount)
+
+            if (t1.TotalCount < t2.TotalCount)
             {
                 return 1;
             }
-            else
-            {
-                return t1.Value < t2.Value ? -1 : 1;
-            }
+
+            return t1.Value < t2.Value ? -1 : 1;
         }
 
         public IDeepCloneable DeepClone() => new HuffmanTree(this);
diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index b7f94415be..06204ae913 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -704,7 +704,7 @@ public static void BundleColorMap(Span<byte> row, int width, int xBits, Span<uin
         /// Compute the combined Shanon's entropy for distribution {X} and {X+Y}.
         /// </summary>
         /// <returns>Shanon entropy.</returns>
-        public static float CombinedShannonEntropy(int[] x, int[] y)
+        public static float CombinedShannonEntropy(Span<int> x, Span<int> y)
         {
             double retVal = 0.0d;
             uint sumX = 0, sumXY = 0;
diff --git a/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs b/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs
index 2d71a7af64..6cd109121d 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs
@@ -15,7 +15,7 @@ internal class PixOrCopy
         public uint BgraOrDistance { get; set; }
 
         public static PixOrCopy CreateCacheIdx(int idx) =>
-            new PixOrCopy()
+            new()
             {
                 Mode = PixOrCopyMode.CacheIdx,
                 BgraOrDistance = (uint)idx,
@@ -23,14 +23,14 @@ public static PixOrCopy CreateCacheIdx(int idx) =>
             };
 
         public static PixOrCopy CreateLiteral(uint bgra) =>
-            new PixOrCopy()
+            new()
             {
                 Mode = PixOrCopyMode.Literal,
                 BgraOrDistance = bgra,
                 Len = 1
             };
 
-        public static PixOrCopy CreateCopy(uint distance, ushort len) => new PixOrCopy()
+        public static PixOrCopy CreateCopy(uint distance, ushort len) => new()
         {
             Mode = PixOrCopyMode.Copy,
             BgraOrDistance = distance,
diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
index 671e9a043e..713fc79194 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
@@ -17,6 +17,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
     /// </summary>
     internal static unsafe class PredictorEncoder
     {
+        private static readonly sbyte[] DeltaLut = { 16, 16, 8, 4, 2, 2, 2 };
+
+        private static readonly sbyte[][] Offset =
+        {
+            new sbyte[] { 0, -1 }, new sbyte[] { 0, 1 }, new sbyte[] { -1, 0 }, new sbyte[] { 1, 0 }, new sbyte[] { -1, -1 }, new sbyte[] { -1, 1 }, new sbyte[] { 1, -1 }, new sbyte[] { 1, 1 }
+        };
+
         private const int GreenRedToBlueNumAxis = 8;
 
         private const int GreenRedToBlueMaxIters = 7;
@@ -41,6 +48,8 @@ public static void ResidualImage(
             Span<uint> bgra,
             Span<uint> bgraScratch,
             Span<uint> image,
+            int[][] histoArgb,
+            int[][] bestHisto,
             bool nearLossless,
             int nearLosslessQuality,
             WebpTransparentColorMode transparentColorMode,
@@ -80,6 +89,8 @@ public static void ResidualImage(
                             histo,
                             bgraScratch,
                             bgra,
+                            histoArgb,
+                            bestHisto,
                             maxQuantization,
                             transparentColorMode,
                             usedSubtractGreen,
@@ -105,7 +116,7 @@ public static void ResidualImage(
                 lowEffort);
         }
 
-        public static void ColorSpaceTransform(int width, int height, int bits, int quality, Span<uint> bgra, Span<uint> image)
+        public static void ColorSpaceTransform(int width, int height, int bits, int quality, Span<uint> bgra, Span<uint> image, Span<int> scratch)
         {
             int maxTileSize = 1 << bits;
             int tileXSize = LosslessUtils.SubSampleSize(width, bits);
@@ -139,7 +150,8 @@ public static void ColorSpaceTransform(int width, int height, int bits, int qual
                         height,
                         accumulatedRedHisto,
                         accumulatedBlueHisto,
-                        bgra);
+                        bgra,
+                        scratch);
 
                     image[offset] = MultipliersToColorCode(prevX);
                     CopyTileWithColorTransform(width, height, tileXOffset, tileYOffset, maxTileSize, prevX, bgra);
@@ -188,6 +200,8 @@ private static int GetBestPredictorForTile(
             int[][] accumulated,
             Span<uint> argbScratch,
             Span<uint> argb,
+            int[][] histoArgb,
+            int[][] bestHisto,
             int maxQuantization,
             WebpTransparentColorMode transparentColorMode,
             bool usedSubtractGreen,
@@ -222,21 +236,14 @@ private static int GetBestPredictorForTile(
             float bestDiff = MaxDiffCost;
             int bestMode = 0;
             uint[] residuals = new uint[1 << WebpConstants.MaxTransformBits];
-            int[][] histoArgb = new int[4][];
-            int[][] bestHisto = new int[4][];
             for (int i = 0; i < 4; i++)
             {
-                histoArgb[i] = new int[256];
-                bestHisto[i] = new int[256];
+                histoArgb[i].AsSpan().Clear();
+                bestHisto[i].AsSpan().Clear();
             }
 
             for (int mode = 0; mode < numPredModes; mode++)
             {
-                for (int i = 0; i < 4; i++)
-                {
-                    histoArgb[i].AsSpan().Fill(0);
-                }
-
                 if (startY > 0)
                 {
                     // Read the row above the tile which will become the first upper_row.
@@ -300,6 +307,11 @@ private static int GetBestPredictorForTile(
                     bestDiff = curDiff;
                     bestMode = mode;
                 }
+
+                for (int i = 0; i < 4; i++)
+                {
+                    histoArgb[i].AsSpan().Clear();
+                }
             }
 
             for (int i = 0; i < 4; i++)
@@ -819,7 +831,19 @@ private static void CopyTileWithColorTransform(int xSize, int ySize, int tileX,
             }
         }
 
-        private static Vp8LMultipliers GetBestColorTransformForTile(int tileX, int tileY, int bits, Vp8LMultipliers prevX, Vp8LMultipliers prevY, int quality, int xSize, int ySize, int[] accumulatedRedHisto, int[] accumulatedBlueHisto, Span<uint> argb)
+        private static Vp8LMultipliers GetBestColorTransformForTile(
+            int tileX,
+            int tileY,
+            int bits,
+            Vp8LMultipliers prevX,
+            Vp8LMultipliers prevY,
+            int quality,
+            int xSize,
+            int ySize,
+            int[] accumulatedRedHisto,
+            int[] accumulatedBlueHisto,
+            Span<uint> argb,
+            Span<int> scratch)
         {
             int maxTileSize = 1 << bits;
             int tileYOffset = tileY * maxTileSize;
@@ -832,18 +856,28 @@ private static Vp8LMultipliers GetBestColorTransformForTile(int tileX, int tileY
 
             var bestTx = default(Vp8LMultipliers);
 
-            GetBestGreenToRed(tileArgb, xSize, tileWidth, tileHeight, prevX, prevY, quality, accumulatedRedHisto, ref bestTx);
+            GetBestGreenToRed(tileArgb, xSize, scratch, tileWidth, tileHeight, prevX, prevY, quality, accumulatedRedHisto, ref bestTx);
 
-            GetBestGreenRedToBlue(tileArgb, xSize, tileWidth, tileHeight, prevX, prevY, quality, accumulatedBlueHisto, ref bestTx);
+            GetBestGreenRedToBlue(tileArgb, xSize, scratch, tileWidth, tileHeight, prevX, prevY, quality, accumulatedBlueHisto, ref bestTx);
 
             return bestTx;
         }
 
-        private static void GetBestGreenToRed(Span<uint> argb, int stride, int tileWidth, int tileHeight, Vp8LMultipliers prevX, Vp8LMultipliers prevY, int quality, int[] accumulatedRedHisto, ref Vp8LMultipliers bestTx)
+        private static void GetBestGreenToRed(
+            Span<uint> argb,
+            int stride,
+            Span<int> scratch,
+            int tileWidth,
+            int tileHeight,
+            Vp8LMultipliers prevX,
+            Vp8LMultipliers prevY,
+            int quality,
+            int[] accumulatedRedHisto,
+            ref Vp8LMultipliers bestTx)
         {
             int maxIters = 4 + ((7 * quality) >> 8);  // in range [4..6]
             int greenToRedBest = 0;
-            double bestDiff = GetPredictionCostCrossColorRed(argb, stride, tileWidth, tileHeight, prevX, prevY, greenToRedBest, accumulatedRedHisto);
+            double bestDiff = GetPredictionCostCrossColorRed(argb, stride, scratch, tileWidth, tileHeight, prevX, prevY, greenToRedBest, accumulatedRedHisto);
             for (int iter = 0; iter < maxIters; iter++)
             {
                 // ColorTransformDelta is a 3.5 bit fixed point, so 32 is equal to
@@ -855,7 +889,7 @@ private static void GetBestGreenToRed(Span<uint> argb, int stride, int tileWidth
                 for (int offset = -delta; offset <= delta; offset += 2 * delta)
                 {
                     int greenToRedCur = offset + greenToRedBest;
-                    double curDiff = GetPredictionCostCrossColorRed(argb, stride, tileWidth, tileHeight, prevX, prevY, greenToRedCur, accumulatedRedHisto);
+                    double curDiff = GetPredictionCostCrossColorRed(argb, stride, scratch, tileWidth, tileHeight, prevX, prevY, greenToRedCur, accumulatedRedHisto);
                     if (curDiff < bestDiff)
                     {
                         bestDiff = curDiff;
@@ -867,24 +901,22 @@ private static void GetBestGreenToRed(Span<uint> argb, int stride, int tileWidth
             bestTx.GreenToRed = (byte)(greenToRedBest & 0xff);
         }
 
-        private static void GetBestGreenRedToBlue(Span<uint> argb, int stride, int tileWidth, int tileHeight, Vp8LMultipliers prevX, Vp8LMultipliers prevY, int quality, int[] accumulatedBlueHisto, ref Vp8LMultipliers bestTx)
+        private static void GetBestGreenRedToBlue(Span<uint> argb, int stride, Span<int> scratch, int tileWidth, int tileHeight, Vp8LMultipliers prevX, Vp8LMultipliers prevY, int quality, int[] accumulatedBlueHisto, ref Vp8LMultipliers bestTx)
         {
             int iters = (quality < 25) ? 1 : (quality > 50) ? GreenRedToBlueMaxIters : 4;
             int greenToBlueBest = 0;
             int redToBlueBest = 0;
-            sbyte[][] offset = { new sbyte[] { 0, -1 }, new sbyte[] { 0, 1 }, new sbyte[] { -1, 0 }, new sbyte[] { 1, 0 }, new sbyte[] { -1, -1 }, new sbyte[] { -1, 1 }, new sbyte[] { 1, -1 }, new sbyte[] { 1, 1 } };
-            sbyte[] deltaLut = { 16, 16, 8, 4, 2, 2, 2 };
 
             // Initial value at origin:
-            double bestDiff = GetPredictionCostCrossColorBlue(argb, stride, tileWidth, tileHeight, prevX, prevY, greenToBlueBest, redToBlueBest, accumulatedBlueHisto);
+            double bestDiff = GetPredictionCostCrossColorBlue(argb, stride, scratch, tileWidth, tileHeight, prevX, prevY, greenToBlueBest, redToBlueBest, accumulatedBlueHisto);
             for (int iter = 0; iter < iters; iter++)
             {
-                int delta = deltaLut[iter];
+                int delta = DeltaLut[iter];
                 for (int axis = 0; axis < GreenRedToBlueNumAxis; axis++)
                 {
-                    int greenToBlueCur = (offset[axis][0] * delta) + greenToBlueBest;
-                    int redToBlueCur = (offset[axis][1] * delta) + redToBlueBest;
-                    double curDiff = GetPredictionCostCrossColorBlue(argb, stride, tileWidth, tileHeight, prevX, prevY, greenToBlueCur, redToBlueCur, accumulatedBlueHisto);
+                    int greenToBlueCur = (Offset[axis][0] * delta) + greenToBlueBest;
+                    int redToBlueCur = (Offset[axis][1] * delta) + redToBlueBest;
+                    double curDiff = GetPredictionCostCrossColorBlue(argb, stride, scratch, tileWidth, tileHeight, prevX, prevY, greenToBlueCur, redToBlueCur, accumulatedBlueHisto);
                     if (curDiff < bestDiff)
                     {
                         bestDiff = curDiff;
@@ -910,9 +942,19 @@ private static void GetBestGreenRedToBlue(Span<uint> argb, int stride, int tileW
             bestTx.RedToBlue = (byte)(redToBlueBest & 0xff);
         }
 
-        private static double GetPredictionCostCrossColorRed(Span<uint> argb, int stride, int tileWidth, int tileHeight, Vp8LMultipliers prevX, Vp8LMultipliers prevY, int greenToRed, int[] accumulatedRedHisto)
+        private static double GetPredictionCostCrossColorRed(
+            Span<uint> argb,
+            int stride,
+            Span<int> scratch,
+            int tileWidth,
+            int tileHeight,
+            Vp8LMultipliers prevX,
+            Vp8LMultipliers prevY,
+            int greenToRed,
+            int[] accumulatedRedHisto)
         {
-            int[] histo = new int[256];
+            Span<int> histo = scratch.Slice(0, 256);
+            histo.Clear();
 
             CollectColorRedTransforms(argb, stride, tileWidth, tileHeight, greenToRed, histo);
             double curDiff = PredictionCostCrossColor(accumulatedRedHisto, histo);
@@ -937,9 +979,20 @@ private static double GetPredictionCostCrossColorRed(Span<uint> argb, int stride
             return curDiff;
         }
 
-        private static double GetPredictionCostCrossColorBlue(Span<uint> argb, int stride, int tileWidth, int tileHeight, Vp8LMultipliers prevX, Vp8LMultipliers prevY, int greenToBlue, int redToBlue, int[] accumulatedBlueHisto)
+        private static double GetPredictionCostCrossColorBlue(
+            Span<uint> argb,
+            int stride,
+            Span<int> scratch,
+            int tileWidth,
+            int tileHeight,
+            Vp8LMultipliers prevX,
+            Vp8LMultipliers prevY,
+            int greenToBlue,
+            int redToBlue,
+            int[] accumulatedBlueHisto)
         {
-            int[] histo = new int[256];
+            Span<int> histo = scratch.Slice(0, 256);
+            histo.Clear();
 
             CollectColorBlueTransforms(argb, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
             double curDiff = PredictionCostCrossColor(accumulatedBlueHisto, histo);
@@ -980,7 +1033,7 @@ private static double GetPredictionCostCrossColorBlue(Span<uint> argb, int strid
             return curDiff;
         }
 
-        private static void CollectColorRedTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, int[] histo)
+        private static void CollectColorRedTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse41.IsSupported)
@@ -1036,7 +1089,7 @@ private static void CollectColorRedTransforms(Span<uint> bgra, int stride, int t
             }
         }
 
-        private static void CollectColorRedTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, int[] histo)
+        private static void CollectColorRedTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
         {
             int pos = 0;
             while (tileHeight-- > 0)
@@ -1051,7 +1104,7 @@ private static void CollectColorRedTransformsNoneVectorized(Span<uint> bgra, int
             }
         }
 
-        private static void CollectColorBlueTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, int[] histo)
+        private static void CollectColorBlueTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse41.IsSupported)
@@ -1114,7 +1167,7 @@ private static void CollectColorBlueTransforms(Span<uint> bgra, int stride, int
             }
         }
 
-        private static void CollectColorBlueTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, int[] histo)
+        private static void CollectColorBlueTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
         {
             int pos = 0;
             while (tileHeight-- > 0)
@@ -1143,7 +1196,7 @@ private static float PredictionCostSpatialHistogram(int[][] accumulated, int[][]
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        private static double PredictionCostCrossColor(int[] accumulated, int[] counts)
+        private static double PredictionCostCrossColor(int[] accumulated, Span<int> counts)
         {
             // Favor low entropy, locally and globally.
             // Favor small absolute values for PredictionCostSpatial.
@@ -1152,7 +1205,7 @@ private static double PredictionCostCrossColor(int[] accumulated, int[] counts)
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        private static float PredictionCostSpatial(int[] counts, int weight0, double expVal)
+        private static float PredictionCostSpatial(Span<int> counts, int weight0, double expVal)
         {
             int significantSymbols = 256 >> 4;
             double expDecayFactor = 0.6;
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
index 693585637c..818488696e 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
@@ -19,6 +19,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
     /// </summary>
     internal class Vp8LEncoder : IDisposable
     {
+        /// <summary>
+        /// Scratch buffer to reduce allocations.
+        /// </summary>
+        private readonly int[] scratch = new int[256];
+
+        private int[][] histoArgb = { new int[256], new int[256], new int[256], new int[256] };
+
+        private int[][] bestHisto = { new int[256], new int[256], new int[256], new int[256] };
+
         /// <summary>
         /// The <see cref="MemoryAllocator"/> to use for buffer allocations.
         /// </summary>
@@ -76,6 +85,8 @@ internal class Vp8LEncoder : IDisposable
 
         private const int PaletteInvSize = 1 << PaletteInvSizeBits;
 
+        private static readonly byte[] Order = { 1, 2, 0, 3 };
+
         /// <summary>
         /// Initializes a new instance of the <see cref="Vp8LEncoder"/> class.
         /// </summary>
@@ -675,6 +686,8 @@ private void ApplyPredictFilter(int width, int height, bool lowEffort)
                 this.EncodedData.GetSpan(),
                 this.BgraScratch.GetSpan(),
                 this.TransformData.GetSpan(),
+                this.histoArgb,
+                this.bestHisto,
                 this.nearLossless,
                 nearLosslessStrength,
                 this.transparentColorMode,
@@ -694,7 +707,7 @@ private void ApplyCrossColorFilter(int width, int height, bool lowEffort)
             int transformWidth = LosslessUtils.SubSampleSize(width, colorTransformBits);
             int transformHeight = LosslessUtils.SubSampleSize(height, colorTransformBits);
 
-            PredictorEncoder.ColorSpaceTransform(width, height, colorTransformBits, this.quality, this.EncodedData.GetSpan(), this.TransformData.GetSpan());
+            PredictorEncoder.ColorSpaceTransform(width, height, colorTransformBits, this.quality, this.EncodedData.GetSpan(), this.TransformData.GetSpan(), this.scratch);
 
             this.bitWriter.PutBits(WebpConstants.TransformPresent, 1);
             this.bitWriter.PutBits((uint)Vp8LTransformType.CrossColorTransform, 2);
@@ -736,7 +749,7 @@ private void EncodeImageNoHuffman(Span<uint> bgra, Vp8LHashChain hashChain, Vp8L
 
             var histogramImage = new List<Vp8LHistogram>()
             {
-                new Vp8LHistogram(cacheBits)
+                new(cacheBits)
             };
 
             // Build histogram image and symbols from backward references.
@@ -780,7 +793,8 @@ private void EncodeImageNoHuffman(Span<uint> bgra, Vp8LHashChain hashChain, Vp8L
         private void StoreHuffmanCode(HuffmanTree[] huffTree, HuffmanTreeToken[] tokens, HuffmanTreeCode huffmanCode)
         {
             int count = 0;
-            int[] symbols = { 0, 0 };
+            Span<int> symbols = this.scratch.AsSpan(0, 2);
+            symbols.Clear();
             int maxBits = 8;
             int maxSymbol = 1 << maxBits;
 
@@ -973,10 +987,9 @@ private void StoreImageToBitMask(int width, int histoBits, Vp8LBackwardRefs back
 
                 if (v.IsLiteral())
                 {
-                    byte[] order = { 1, 2, 0, 3 };
                     for (int k = 0; k < 4; k++)
                     {
-                        int code = (int)v.Literal(order[k]);
+                        int code = (int)v.Literal(Order[k]);
                         this.bitWriter.WriteHuffmanCode(codes[k], code);
                     }
                 }
@@ -1092,9 +1105,10 @@ private EntropyIx AnalyzeEntropy(ReadOnlySpan<uint> bgra, int width, int height,
             histo[(int)HistoIx.HistoBluePred * 256]++;
             histo[(int)HistoIx.HistoAlphaPred * 256]++;
 
+            var bitEntropy = new Vp8LBitEntropy();
             for (int j = 0; j < (int)HistoIx.HistoTotal; j++)
             {
-                var bitEntropy = new Vp8LBitEntropy();
+                bitEntropy.Init();
                 Span<uint> curHisto = histo.Slice(j * 256, 256);
                 bitEntropy.BitsEntropyUnrefined(curHisto, 256);
                 entropyComp[j] = bitEntropy.BitsEntropyRefine();
@@ -1447,7 +1461,8 @@ private static int SearchColorNoIdx(uint[] sorted, uint color, int hi)
                 {
                     return mid;
                 }
-                else if (sorted[mid] < color)
+
+                if (sorted[mid] < color)
                 {
                     low = mid;
                 }
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs
index 42260e2b25..8b02015687 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs
@@ -157,29 +157,30 @@ public void AddSinglePixOrCopy(PixOrCopy v, bool useDistanceModifier, int xSize
         /// Estimate how many bits the combined entropy of literals and distance approximately maps to.
         /// </summary>
         /// <returns>Estimated bits.</returns>
-        public double EstimateBits()
+        public double EstimateBits(Vp8LStreaks stats, Vp8LBitEntropy bitsEntropy)
         {
             uint notUsed = 0;
             return
-                PopulationCost(this.Literal, this.NumCodes(), ref notUsed, ref this.IsUsed[0])
-                + PopulationCost(this.Red, WebpConstants.NumLiteralCodes, ref notUsed, ref this.IsUsed[1])
-                + PopulationCost(this.Blue, WebpConstants.NumLiteralCodes, ref notUsed, ref this.IsUsed[2])
-                + PopulationCost(this.Alpha, WebpConstants.NumLiteralCodes, ref notUsed, ref this.IsUsed[3])
-                + PopulationCost(this.Distance, WebpConstants.NumDistanceCodes, ref notUsed, ref this.IsUsed[4])
+                PopulationCost(this.Literal, this.NumCodes(), ref notUsed, ref this.IsUsed[0], stats, bitsEntropy)
+                + PopulationCost(this.Red, WebpConstants.NumLiteralCodes, ref notUsed, ref this.IsUsed[1], stats, bitsEntropy)
+                + PopulationCost(this.Blue, WebpConstants.NumLiteralCodes, ref notUsed, ref this.IsUsed[2], stats, bitsEntropy)
+                + PopulationCost(this.Alpha, WebpConstants.NumLiteralCodes, ref notUsed, ref this.IsUsed[3], stats, bitsEntropy)
+                + PopulationCost(this.Distance, WebpConstants.NumDistanceCodes, ref notUsed, ref this.IsUsed[4], stats, bitsEntropy)
                 + ExtraCost(this.Literal.AsSpan(WebpConstants.NumLiteralCodes), WebpConstants.NumLengthCodes)
                 + ExtraCost(this.Distance, WebpConstants.NumDistanceCodes);
         }
 
-        public void UpdateHistogramCost()
+        public void UpdateHistogramCost(Vp8LStreaks stats, Vp8LBitEntropy bitsEntropy)
         {
             uint alphaSym = 0, redSym = 0, blueSym = 0;
             uint notUsed = 0;
-            double alphaCost = PopulationCost(this.Alpha, WebpConstants.NumLiteralCodes, ref alphaSym, ref this.IsUsed[3]);
-            double distanceCost = PopulationCost(this.Distance, WebpConstants.NumDistanceCodes, ref notUsed, ref this.IsUsed[4]) + ExtraCost(this.Distance, WebpConstants.NumDistanceCodes);
+
+            double alphaCost = PopulationCost(this.Alpha, WebpConstants.NumLiteralCodes, ref alphaSym, ref this.IsUsed[3], stats, bitsEntropy);
+            double distanceCost = PopulationCost(this.Distance, WebpConstants.NumDistanceCodes, ref notUsed, ref this.IsUsed[4], stats, bitsEntropy) + ExtraCost(this.Distance, WebpConstants.NumDistanceCodes);
             int numCodes = this.NumCodes();
-            this.LiteralCost = PopulationCost(this.Literal, numCodes, ref notUsed, ref this.IsUsed[0]) + ExtraCost(this.Literal.AsSpan(WebpConstants.NumLiteralCodes), WebpConstants.NumLengthCodes);
-            this.RedCost = PopulationCost(this.Red, WebpConstants.NumLiteralCodes, ref redSym, ref this.IsUsed[1]);
-            this.BlueCost = PopulationCost(this.Blue, WebpConstants.NumLiteralCodes, ref blueSym, ref this.IsUsed[2]);
+            this.LiteralCost = PopulationCost(this.Literal, numCodes, ref notUsed, ref this.IsUsed[0], stats, bitsEntropy) + ExtraCost(this.Literal.AsSpan(WebpConstants.NumLiteralCodes), WebpConstants.NumLengthCodes);
+            this.RedCost = PopulationCost(this.Red, WebpConstants.NumLiteralCodes, ref redSym, ref this.IsUsed[1], stats, bitsEntropy);
+            this.BlueCost = PopulationCost(this.Blue, WebpConstants.NumLiteralCodes, ref blueSym, ref this.IsUsed[2], stats, bitsEntropy);
             this.BitCost = this.LiteralCost + this.RedCost + this.BlueCost + alphaCost + distanceCost;
             if ((alphaSym | redSym | blueSym) == NonTrivialSym)
             {
@@ -198,11 +199,11 @@ public void UpdateHistogramCost()
         /// Since the previous score passed is 'costThreshold', we only need to compare
         /// the partial cost against 'costThreshold + C(a) + C(b)' to possibly bail-out early.
         /// </summary>
-        public double AddEval(Vp8LHistogram b, double costThreshold, Vp8LHistogram output)
+        public double AddEval(Vp8LHistogram b, Vp8LStreaks stats, Vp8LBitEntropy bitsEntropy, double costThreshold, Vp8LHistogram output)
         {
             double sumCost = this.BitCost + b.BitCost;
             costThreshold += sumCost;
-            if (this.GetCombinedHistogramEntropy(b, costThreshold, costInitial: 0, out double cost))
+            if (this.GetCombinedHistogramEntropy(b, stats, bitsEntropy, costThreshold, costInitial: 0, out double cost))
             {
                 this.Add(b, output);
                 output.BitCost = cost;
@@ -212,10 +213,10 @@ public double AddEval(Vp8LHistogram b, double costThreshold, Vp8LHistogram outpu
             return cost - sumCost;
         }
 
-        public double AddThresh(Vp8LHistogram b, double costThreshold)
+        public double AddThresh(Vp8LHistogram b, Vp8LStreaks stats, Vp8LBitEntropy bitsEntropy, double costThreshold)
         {
             double costInitial = -this.BitCost;
-            this.GetCombinedHistogramEntropy(b, costThreshold, costInitial, out double cost);
+            this.GetCombinedHistogramEntropy(b, stats, bitsEntropy, costThreshold, costInitial, out double cost);
             return cost;
         }
 
@@ -239,12 +240,12 @@ public void Add(Vp8LHistogram b, Vp8LHistogram output)
                 : NonTrivialSym;
         }
 
-        public bool GetCombinedHistogramEntropy(Vp8LHistogram b, double costThreshold, double costInitial, out double cost)
+        public bool GetCombinedHistogramEntropy(Vp8LHistogram b, Vp8LStreaks stats, Vp8LBitEntropy bitEntropy, double costThreshold, double costInitial, out double cost)
         {
             bool trivialAtEnd = false;
             cost = costInitial;
 
-            cost += GetCombinedEntropy(this.Literal, b.Literal, this.NumCodes(), this.IsUsed[0], b.IsUsed[0], false);
+            cost += GetCombinedEntropy(this.Literal, b.Literal, this.NumCodes(), this.IsUsed[0], b.IsUsed[0], false, stats, bitEntropy);
 
             cost += ExtraCostCombined(this.Literal.AsSpan(WebpConstants.NumLiteralCodes), b.Literal.AsSpan(WebpConstants.NumLiteralCodes), WebpConstants.NumLengthCodes);
 
@@ -267,25 +268,25 @@ public bool GetCombinedHistogramEntropy(Vp8LHistogram b, double costThreshold, d
                 }
             }
 
-            cost += GetCombinedEntropy(this.Red, b.Red, WebpConstants.NumLiteralCodes, this.IsUsed[1], b.IsUsed[1], trivialAtEnd);
+            cost += GetCombinedEntropy(this.Red, b.Red, WebpConstants.NumLiteralCodes, this.IsUsed[1], b.IsUsed[1], trivialAtEnd, stats, bitEntropy);
             if (cost > costThreshold)
             {
                 return false;
             }
 
-            cost += GetCombinedEntropy(this.Blue, b.Blue, WebpConstants.NumLiteralCodes, this.IsUsed[2], b.IsUsed[2], trivialAtEnd);
+            cost += GetCombinedEntropy(this.Blue, b.Blue, WebpConstants.NumLiteralCodes, this.IsUsed[2], b.IsUsed[2], trivialAtEnd, stats, bitEntropy);
             if (cost > costThreshold)
             {
                 return false;
             }
 
-            cost += GetCombinedEntropy(this.Alpha, b.Alpha, WebpConstants.NumLiteralCodes, this.IsUsed[3], b.IsUsed[3], trivialAtEnd);
+            cost += GetCombinedEntropy(this.Alpha, b.Alpha, WebpConstants.NumLiteralCodes, this.IsUsed[3], b.IsUsed[3], trivialAtEnd, stats, bitEntropy);
             if (cost > costThreshold)
             {
                 return false;
             }
 
-            cost += GetCombinedEntropy(this.Distance, b.Distance, WebpConstants.NumDistanceCodes, this.IsUsed[4], b.IsUsed[4], false);
+            cost += GetCombinedEntropy(this.Distance, b.Distance, WebpConstants.NumDistanceCodes, this.IsUsed[4], b.IsUsed[4], false, stats, bitEntropy);
             if (cost > costThreshold)
             {
                 return false;
@@ -415,9 +416,10 @@ private void AddDistance(Vp8LHistogram b, Vp8LHistogram output, int size)
             }
         }
 
-        private static double GetCombinedEntropy(uint[] x, uint[] y, int length, bool isXUsed, bool isYUsed, bool trivialAtEnd)
+        private static double GetCombinedEntropy(uint[] x, uint[] y, int length, bool isXUsed, bool isYUsed, bool trivialAtEnd, Vp8LStreaks stats, Vp8LBitEntropy bitEntropy)
         {
-            var stats = new Vp8LStreaks();
+            stats.Clear();
+            bitEntropy.Init();
             if (trivialAtEnd)
             {
                 // This configuration is due to palettization that transforms an indexed
@@ -435,7 +437,6 @@ private static double GetCombinedEntropy(uint[] x, uint[] y, int length, bool is
                 return stats.FinalHuffmanCost();
             }
 
-            var bitEntropy = new Vp8LBitEntropy();
             if (isXUsed)
             {
                 if (isYUsed)
@@ -479,10 +480,10 @@ private static double ExtraCostCombined(Span<uint> x, Span<uint> y, int length)
         /// <summary>
         /// Get the symbol entropy for the distribution 'population'.
         /// </summary>
-        private static double PopulationCost(uint[] population, int length, ref uint trivialSym, ref bool isUsed)
+        private static double PopulationCost(uint[] population, int length, ref uint trivialSym, ref bool isUsed, Vp8LStreaks stats, Vp8LBitEntropy bitEntropy)
         {
-            var bitEntropy = new Vp8LBitEntropy();
-            var stats = new Vp8LStreaks();
+            bitEntropy.Init();
+            stats.Clear();
             bitEntropy.BitsEntropyUnrefined(population, length, stats);
 
             trivialSym = (bitEntropy.NoneZeros == 1) ? bitEntropy.NoneZeroCode : NonTrivialSym;
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LStreaks.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LStreaks.cs
index 27ddcfd434..df9f064426 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LStreaks.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LStreaks.cs
@@ -1,6 +1,8 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
+using System;
+
 namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 {
     internal class Vp8LStreaks
@@ -28,6 +30,13 @@ public Vp8LStreaks()
         /// </summary>
         public int[][] Streaks { get; }
 
+        public void Clear()
+        {
+            this.Counts.AsSpan().Clear();
+            this.Streaks[0].AsSpan().Clear();
+            this.Streaks[1].AsSpan().Clear();
+        }
+
         public double FinalHuffmanCost()
         {
             // The constants in this function are experimental and got rounded from
diff --git a/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs b/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs
index 768365e44e..4f7a4eb3d8 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs
@@ -418,6 +418,7 @@ private void ReadHuffmanCodes(Vp8LDecoder decoder, int xSize, int ySize, int col
             var huffmanTables = new HuffmanCode[numHTreeGroups * tableSize];
             var hTreeGroups = new HTreeGroup[numHTreeGroups];
             Span<HuffmanCode> huffmanTable = huffmanTables.AsSpan();
+            int[] codeLengths = new int[maxAlphabetSize];
             for (int i = 0; i < numHTreeGroupsMax; i++)
             {
                 hTreeGroups[i] = new HTreeGroup(HuffmanUtils.HuffmanPackedTableSize);
@@ -425,7 +426,7 @@ private void ReadHuffmanCodes(Vp8LDecoder decoder, int xSize, int ySize, int col
                 int totalSize = 0;
                 bool isTrivialLiteral = true;
                 int maxBits = 0;
-                int[] codeLengths = new int[maxAlphabetSize];
+                codeLengths.AsSpan().Clear();
                 for (int j = 0; j < WebpConstants.HuffmanCodesPerMetaCode; j++)
                 {
                     int alphabetSize = WebpConstants.AlphabetSize[j];
diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 1584237b0c..d31857d53b 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -58,14 +58,14 @@ public static void Copy(Span<byte> src, Span<byte> dst, int w, int h)
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static int Vp8Disto16X16(Span<byte> a, Span<byte> b, Span<ushort> w)
+        public static int Vp8Disto16X16(Span<byte> a, Span<byte> b, Span<ushort> w, Span<int> scratch)
         {
             int d = 0;
             for (int y = 0; y < 16 * WebpConstants.Bps; y += 4 * WebpConstants.Bps)
             {
                 for (int x = 0; x < 16; x += 4)
                 {
-                    d += Vp8Disto4X4(a.Slice(x + y), b.Slice(x + y), w);
+                    d += Vp8Disto4X4(a.Slice(x + y), b.Slice(x + y), w, scratch);
                 }
             }
 
@@ -73,10 +73,10 @@ public static int Vp8Disto16X16(Span<byte> a, Span<byte> b, Span<ushort> w)
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static int Vp8Disto4X4(Span<byte> a, Span<byte> b, Span<ushort> w)
+        public static int Vp8Disto4X4(Span<byte> a, Span<byte> b, Span<ushort> w, Span<int> scratch)
         {
-            int sum1 = TTransform(a, w);
-            int sum2 = TTransform(b, w);
+            int sum1 = TTransform(a, w, scratch);
+            int sum2 = TTransform(b, w, scratch);
             return Math.Abs(sum2 - sum1) >> 5;
         }
 
@@ -252,18 +252,14 @@ public static void DC4(Span<byte> dst, Span<byte> yuv, int offset)
         [MethodImpl(InliningOptions.ShortMethod)]
         public static void TM4(Span<byte> dst, Span<byte> yuv, int offset) => TrueMotion(dst, yuv, offset, 4);
 
-        public static void VE4(Span<byte> dst, Span<byte> yuv, int offset)
+        public static void VE4(Span<byte> dst, Span<byte> yuv, int offset, Span<byte> vals)
         {
             // vertical
             int topOffset = offset - WebpConstants.Bps;
-            byte[] vals =
-            {
-                Avg3(yuv[topOffset - 1], yuv[topOffset], yuv[topOffset + 1]),
-                Avg3(yuv[topOffset], yuv[topOffset + 1], yuv[topOffset + 2]),
-                Avg3(yuv[topOffset + 1], yuv[topOffset + 2], yuv[topOffset + 3]),
-                Avg3(yuv[topOffset + 2], yuv[topOffset + 3], yuv[topOffset + 4])
-            };
-
+            vals[0] = Avg3(yuv[topOffset - 1], yuv[topOffset], yuv[topOffset + 1]);
+            vals[1] = Avg3(yuv[topOffset], yuv[topOffset + 1], yuv[topOffset + 2]);
+            vals[2] = Avg3(yuv[topOffset + 1], yuv[topOffset + 2], yuv[topOffset + 3]);
+            vals[3] = Avg3(yuv[topOffset + 2], yuv[topOffset + 3], yuv[topOffset + 4]);
             int endIdx = 4 * WebpConstants.Bps;
             for (int i = 0; i < endIdx; i += WebpConstants.Bps)
             {
@@ -504,9 +500,10 @@ public static void HU4(Span<byte> dst, Span<byte> yuv, int offset)
         /// <summary>
         /// Paragraph 14.3: Implementation of the Walsh-Hadamard transform inversion.
         /// </summary>
-        public static void TransformWht(Span<short> input, Span<short> output)
+        public static void TransformWht(Span<short> input, Span<short> output, Span<int> scratch)
         {
-            int[] tmp = new int[16];
+            Span<int> tmp = scratch.Slice(0, 16);
+            tmp.Clear();
             for (int i = 0; i < 4; i++)
             {
                 int iPlus4 = 4 + i;
@@ -544,10 +541,11 @@ public static void TransformWht(Span<short> input, Span<short> output)
         /// Returns the weighted sum of the absolute value of transformed coefficients.
         /// w[] contains a row-major 4 by 4 symmetric matrix.
         /// </summary>
-        public static int TTransform(Span<byte> input, Span<ushort> w)
+        public static int TTransform(Span<byte> input, Span<ushort> w, Span<int> scratch)
         {
             int sum = 0;
-            int[] tmp = new int[16];
+            Span<int> tmp = scratch.Slice(0, 16);
+            tmp.Clear();
 
             // horizontal pass.
             int inputOffset = 0;
@@ -591,15 +589,16 @@ public static int TTransform(Span<byte> input, Span<ushort> w)
             return sum;
         }
 
-        public static void TransformTwo(Span<short> src, Span<byte> dst)
+        public static void TransformTwo(Span<short> src, Span<byte> dst, Span<int> scratch)
         {
-            TransformOne(src, dst);
-            TransformOne(src.Slice(16), dst.Slice(4));
+            TransformOne(src, dst, scratch);
+            TransformOne(src.Slice(16), dst.Slice(4), scratch);
         }
 
-        public static void TransformOne(Span<short> src, Span<byte> dst)
+        public static void TransformOne(Span<short> src, Span<byte> dst, Span<int> scratch)
         {
-            Span<int> tmp = stackalloc int[4 * 4];
+            Span<int> tmp = scratch.Slice(0, 16);
+            tmp.Clear();
             int tmpOffset = 0;
             for (int srcOffset = 0; srcOffset < 4; srcOffset++)
             {
@@ -671,10 +670,10 @@ public static void TransformAc3(Span<short> src, Span<byte> dst)
             Store2(dst, 3, a - d4, d1, c1);
         }
 
-        public static void TransformUv(Span<short> src, Span<byte> dst)
+        public static void TransformUv(Span<short> src, Span<byte> dst, Span<int> scratch)
         {
-            TransformTwo(src.Slice(0 * 16), dst);
-            TransformTwo(src.Slice(2 * 16), dst.Slice(4 * WebpConstants.Bps));
+            TransformTwo(src.Slice(0 * 16), dst, scratch);
+            TransformTwo(src.Slice(2 * 16), dst.Slice(4 * WebpConstants.Bps), scratch);
         }
 
         public static void TransformDcuv(Span<short> src, Span<byte> dst)
diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 2ed4381660..18d7494f0f 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -31,7 +31,9 @@ public static void PickBestIntra16(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Se
             int lambda = dqm.LambdaI16;
             int tlambda = dqm.TLambda;
             Span<byte> src = it.YuvIn.AsSpan(Vp8EncIterator.YOffEnc);
+            Span<int> scratch = it.Scratch3;
             var rdTmp = new Vp8ModeScore();
+            var res = new Vp8Residual();
             Vp8ModeScore rdCur = rdTmp;
             Vp8ModeScore rdBest = rd;
             int mode;
@@ -39,7 +41,7 @@ public static void PickBestIntra16(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Se
             rd.ModeI16 = -1;
             for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
             {
-                // scratch buffer.
+                // Scratch buffer.
                 Span<byte> tmpDst = it.YuvOut2.AsSpan(Vp8EncIterator.YOffEnc);
                 rdCur.ModeI16 = mode;
 
@@ -48,9 +50,9 @@ public static void PickBestIntra16(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Se
 
                 // Measure RD-score.
                 rdCur.D = LossyUtils.Vp8Sse16X16(src, tmpDst);
-                rdCur.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto16X16(src, tmpDst, WeightY)) : 0;
+                rdCur.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto16X16(src, tmpDst, WeightY, scratch)) : 0;
                 rdCur.H = WebpConstants.Vp8FixedCostsI16[mode];
-                rdCur.R = it.GetCostLuma16(rdCur, proba);
+                rdCur.R = it.GetCostLuma16(rdCur, proba, res);
 
                 if (isFlat)
                 {
@@ -101,6 +103,7 @@ public static bool PickBestIntra4(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Seg
             int tlambda = dqm.TLambda;
             Span<byte> src0 = it.YuvIn.AsSpan(Vp8EncIterator.YOffEnc);
             Span<byte> bestBlocks = it.YuvOut2.AsSpan(Vp8EncIterator.YOffEnc);
+            Span<int> scratch = it.Scratch3;
             int totalHeaderBits = 0;
             var rdBest = new Vp8ModeScore();
 
@@ -113,31 +116,35 @@ public static bool PickBestIntra4(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Seg
             rdBest.H = 211;  // '211' is the value of VP8BitCost(0, 145)
             rdBest.SetRdScore(dqm.LambdaMode);
             it.StartI4();
+            var rdi4 = new Vp8ModeScore();
+            var rdTmp = new Vp8ModeScore();
+            var res = new Vp8Residual();
+            Span<short> tmpLevels = new short[16];
             do
             {
                 int numBlocks = 1;
-                var rdi4 = new Vp8ModeScore();
+                rdi4.Clear();
                 int mode;
                 int bestMode = -1;
                 Span<byte> src = src0.Slice(WebpLookupTables.Vp8Scan[it.I4]);
                 short[] modeCosts = it.GetCostModeI4(rd.ModesI4);
                 Span<byte> bestBlock = bestBlocks.Slice(WebpLookupTables.Vp8Scan[it.I4]);
                 Span<byte> tmpDst = it.Scratch.AsSpan();
-                tmpDst.Fill(0);
+                tmpDst.Clear();
 
                 rdi4.InitScore();
                 it.MakeIntra4Preds();
                 for (mode = 0; mode < WebpConstants.NumBModes; ++mode)
                 {
-                    var rdTmp = new Vp8ModeScore();
-                    short[] tmpLevels = new short[16];
+                    rdTmp.Clear();
+                    tmpLevels.Clear();
 
                     // Reconstruct.
                     rdTmp.Nz = (uint)ReconstructIntra4(it, dqm, tmpLevels, src, tmpDst, mode);
 
                     // Compute RD-score.
                     rdTmp.D = LossyUtils.Vp8Sse4X4(src, tmpDst);
-                    rdTmp.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto4X4(src, tmpDst, WeightY)) : 0;
+                    rdTmp.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto4X4(src, tmpDst, WeightY, scratch)) : 0;
                     rdTmp.H = modeCosts[mode];
 
                     // Add flatness penalty, to avoid flat area to be mispredicted by a complex mode.
@@ -150,15 +157,15 @@ public static bool PickBestIntra4(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Seg
                         rdTmp.R = 0;
                     }
 
-                    // early-out check.
+                    // Early-out check.
                     rdTmp.SetRdScore(lambda);
                     if (bestMode >= 0 && rdTmp.Score >= rdi4.Score)
                     {
                         continue;
                     }
 
-                    // finish computing score.
-                    rdTmp.R += it.GetCostLuma4(tmpLevels, proba);
+                    // Finish computing score.
+                    rdTmp.R += it.GetCostLuma4(tmpLevels, proba, res);
                     rdTmp.SetRdScore(lambda);
 
                     if (bestMode < 0 || rdTmp.Score < rdi4.Score)
@@ -213,13 +220,15 @@ public static void PickBestUv(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Segment
             Span<byte> dst0 = it.YuvOut.AsSpan(Vp8EncIterator.UOffEnc);
             Span<byte> dst = dst0;
             var rdBest = new Vp8ModeScore();
+            var rdUv = new Vp8ModeScore();
+            var res = new Vp8Residual();
             int mode;
 
             rd.ModeUv = -1;
             rdBest.InitScore();
             for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
             {
-                var rdUv = new Vp8ModeScore();
+                rdUv.Clear();
 
                 // Reconstruct
                 rdUv.Nz = (uint)ReconstructUv(it, dqm, rdUv, tmpDst, mode);
@@ -228,7 +237,7 @@ public static void PickBestUv(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Segment
                 rdUv.D = LossyUtils.Vp8Sse16X8(src, tmpDst);
                 rdUv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
                 rdUv.H = WebpConstants.Vp8FixedCostsUv[mode];
-                rdUv.R = it.GetCostUv(rdUv, proba);
+                rdUv.R = it.GetCostUv(rdUv, proba, res);
                 if (mode > 0 && IsFlat(rdUv.UvLevels, numBlocks, WebpConstants.FlatnessLimitIUv))
                 {
                     rdUv.R += WebpConstants.FlatnessPenality * numBlocks;
@@ -271,16 +280,24 @@ public static int ReconstructIntra16(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8M
             Span<byte> src = it.YuvIn.AsSpan(Vp8EncIterator.YOffEnc);
             int nz = 0;
             int n;
-            short[] dcTmp = new short[16];
-            short[] tmp = new short[16 * 16];
-            Span<short> tmpSpan = tmp.AsSpan();
+            Span<short> shortScratchSpan = it.Scratch2.AsSpan();
+            Span<int> scratch = it.Scratch3.AsSpan(0, 16);
+            shortScratchSpan.Clear();
+            scratch.Clear();
+            Span<short> dcTmp = shortScratchSpan.Slice(0, 16);
+            Span<short> tmp = shortScratchSpan.Slice(16, 16 * 16);
 
             for (n = 0; n < 16; n += 2)
             {
-                Vp8Encoding.FTransform2(src.Slice(WebpLookupTables.Vp8Scan[n]), reference.Slice(WebpLookupTables.Vp8Scan[n]), tmpSpan.Slice(n * 16, 16), tmpSpan.Slice((n + 1) * 16, 16));
+                Vp8Encoding.FTransform2(
+                    src.Slice(WebpLookupTables.Vp8Scan[n]),
+                    reference.Slice(WebpLookupTables.Vp8Scan[n]),
+                    tmp.Slice(n * 16, 16),
+                    tmp.Slice((n + 1) * 16, 16),
+                    scratch);
             }
 
-            Vp8Encoding.FTransformWht(tmp, dcTmp);
+            Vp8Encoding.FTransformWht(tmp, dcTmp, scratch);
             nz |= QuantizeBlock(dcTmp, rd.YDcLevels, dqm.Y2) << 24;
 
             for (n = 0; n < 16; n += 2)
@@ -288,14 +305,14 @@ public static int ReconstructIntra16(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8M
                 // Zero-out the first coeff, so that: a) nz is correct below, and
                 // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
                 tmp[n * 16] = tmp[(n + 1) * 16] = 0;
-                nz |= Quantize2Blocks(tmpSpan.Slice(n * 16, 32), rd.YAcLevels.AsSpan(n * 16, 32), dqm.Y1) << n;
+                nz |= Quantize2Blocks(tmp.Slice(n * 16, 32), rd.YAcLevels.AsSpan(n * 16, 32), dqm.Y1) << n;
             }
 
             // Transform back.
-            LossyUtils.TransformWht(dcTmp, tmpSpan);
+            LossyUtils.TransformWht(dcTmp, tmp, scratch);
             for (n = 0; n < 16; n += 2)
             {
-                Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmpSpan.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), true);
+                Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), true, scratch);
             }
 
             return nz;
@@ -304,10 +321,13 @@ public static int ReconstructIntra16(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8M
         public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span<short> levels, Span<byte> src, Span<byte> yuvOut, int mode)
         {
             Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
-            short[] tmp = new short[16];
-            Vp8Encoding.FTransform(src, reference, tmp);
+            Span<short> tmp = it.Scratch2.AsSpan(0, 16);
+            Span<int> scratch = it.Scratch3.AsSpan(0, 16);
+            tmp.Clear();
+            scratch.Clear();
+            Vp8Encoding.FTransform(src, reference, tmp, scratch);
             int nz = QuantizeBlock(tmp, levels, dqm.Y1);
-            Vp8Encoding.ITransform(reference, tmp, yuvOut, false);
+            Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
 
             return nz;
         }
@@ -318,27 +338,31 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc
             Span<byte> src = it.YuvIn.AsSpan(Vp8EncIterator.UOffEnc);
             int nz = 0;
             int n;
-            short[] tmp = new short[8 * 16];
+            Span<short> tmp = it.Scratch2.AsSpan(0, 8 * 16);
+            Span<int> scratch = it.Scratch3.AsSpan(0, 16);
+            tmp.Clear();
+            scratch.Clear();
 
             for (n = 0; n < 8; n += 2)
             {
                 Vp8Encoding.FTransform2(
                     src.Slice(WebpLookupTables.Vp8ScanUv[n]),
                     reference.Slice(WebpLookupTables.Vp8ScanUv[n]),
-                    tmp.AsSpan(n * 16, 16),
-                    tmp.AsSpan((n + 1) * 16, 16));
+                    tmp.Slice(n * 16, 16),
+                    tmp.Slice((n + 1) * 16, 16),
+                    scratch);
             }
 
             CorrectDcValues(it, dqm.Uv, tmp, rd);
 
             for (n = 0; n < 8; n += 2)
             {
-                nz |= Quantize2Blocks(tmp.AsSpan(n * 16, 32), rd.UvLevels.AsSpan(n * 16, 32), dqm.Uv) << n;
+                nz |= Quantize2Blocks(tmp.Slice(n * 16, 32), rd.UvLevels.AsSpan(n * 16, 32), dqm.Uv) << n;
             }
 
             for (n = 0; n < 8; n += 2)
             {
-                Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.AsSpan(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), true);
+                Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), true, scratch);
             }
 
             return nz << 16;
@@ -556,7 +580,7 @@ public static int QuantizeSingle(Span<short> v, Vp8Matrix mtx)
             return (sign ? -v0 : v0) >> DSCALE;
         }
 
-        public static void CorrectDcValues(Vp8EncIterator it, Vp8Matrix mtx, short[] tmp, Vp8ModeScore rd)
+        public static void CorrectDcValues(Vp8EncIterator it, Vp8Matrix mtx, Span<short> tmp, Vp8ModeScore rd)
         {
 #pragma warning disable SA1005 // Single line comments should begin with single space
             //         | top[0] | top[1]
@@ -571,7 +595,7 @@ public static void CorrectDcValues(Vp8EncIterator it, Vp8Matrix mtx, short[] tmp
             {
                 Span<sbyte> top = it.TopDerr.AsSpan((it.X * 4) + ch, 2);
                 Span<sbyte> left = it.LeftDerr.AsSpan(ch, 2);
-                Span<short> c = tmp.AsSpan(ch * 4 * 16, 4 * 16);
+                Span<short> c = tmp.Slice(ch * 4 * 16, 4 * 16);
                 c[0] += (short)(((C1 * top[0]) + (C2 * left[0])) >> (DSHIFT - DSCALE));
                 int err0 = QuantizeSingle(c, mtx);
                 c[1 * 16] += (short)(((C1 * top[1]) + (C2 * err0)) >> (DSHIFT - DSCALE));
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
index ca3f8481e2..79fd8d8543 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
@@ -81,6 +81,8 @@ public Vp8EncIterator(byte[] yTop, byte[] uvTop, uint[] nz, Vp8MacroBlockInfo[]
             this.I4Boundary = new byte[37];
             this.BitCount = new long[4, 3];
             this.Scratch = new byte[WebpConstants.Bps * 16];
+            this.Scratch2 = new short[17 * 16];
+            this.Scratch3 = new int[16];
 
             // To match the C initial values of the reference implementation, initialize all with 204.
             byte defaultInitVal = 204;
@@ -216,10 +218,20 @@ public Vp8EncIterator(byte[] yTop, byte[] uvTop, uint[] nz, Vp8MacroBlockInfo[]
         public int CountDown { get; set; }
 
         /// <summary>
-        /// Gets the scratch buffer.
+        /// Gets the byte scratch buffer.
         /// </summary>
         public byte[] Scratch { get; }
 
+        /// <summary>
+        /// Gets the short scratch buffer.
+        /// </summary>
+        public short[] Scratch2 { get; }
+
+        /// <summary>
+        /// Gets the int scratch buffer.
+        /// </summary>
+        public int[] Scratch3 { get; }
+
         public Vp8MacroBlockInfo CurrentMacroBlockInfo => this.Mb[this.currentMbIdx];
 
         private Vp8MacroBlockInfo[] Mb { get; }
@@ -380,7 +392,7 @@ public int MbAnalyzeBestIntra16Mode()
             int bestMode = 0;
 
             this.MakeLuma16Preds();
-            for (mode = 0; mode < maxMode; ++mode)
+            for (mode = 0; mode < maxMode; mode++)
             {
                 var histo = new Vp8Histogram();
                 histo.CollectHistogram(this.YuvIn.AsSpan(YOffEnc), this.YuvP.AsSpan(Vp8Encoding.Vp8I16ModeOffsets[mode]), 0, 16);
@@ -499,9 +511,8 @@ public void SetIntra4Mode(byte[] modes)
             this.CurrentMacroBlockInfo.MacroBlockType = Vp8MacroBlockType.I4X4;
         }
 
-        public int GetCostLuma16(Vp8ModeScore rd, Vp8EncProba proba)
+        public int GetCostLuma16(Vp8ModeScore rd, Vp8EncProba proba, Vp8Residual res)
         {
-            var res = new Vp8Residual();
             int r = 0;
 
             // re-import the non-zero context.
@@ -539,11 +550,10 @@ public short[] GetCostModeI4(byte[] modes)
             return WebpLookupTables.Vp8FixedCostsI4[top, left];
         }
 
-        public int GetCostLuma4(short[] levels, Vp8EncProba proba)
+        public int GetCostLuma4(Span<short> levels, Vp8EncProba proba, Vp8Residual res)
         {
             int x = this.I4 & 3;
             int y = this.I4 >> 2;
-            var res = new Vp8Residual();
             int r = 0;
 
             res.Init(0, 3, proba);
@@ -553,9 +563,8 @@ public int GetCostLuma4(short[] levels, Vp8EncProba proba)
             return r;
         }
 
-        public int GetCostUv(Vp8ModeScore rd, Vp8EncProba proba)
+        public int GetCostUv(Vp8ModeScore rd, Vp8EncProba proba, Vp8Residual res)
         {
-            var res = new Vp8Residual();
             int r = 0;
 
             // re-import the non-zero context.
@@ -741,7 +750,7 @@ public void MakeChroma8Preds()
             Vp8Encoding.EncPredChroma8(this.YuvP, left, top);
         }
 
-        public void MakeIntra4Preds() => Vp8Encoding.EncPredLuma4(this.YuvP, this.I4Boundary, this.I4BoundaryIdx);
+        public void MakeIntra4Preds() => Vp8Encoding.EncPredLuma4(this.YuvP, this.I4Boundary, this.I4BoundaryIdx, this.Scratch.AsSpan(0, 4));
 
         public void SwapOut()
         {
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
index 37808d56c2..1a9d3a6e34 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
@@ -70,6 +70,11 @@ internal class Vp8Encoder : IDisposable
         /// </summary>
         private int uvAlpha;
 
+        /// <summary>
+        /// Scratch buffer to reduce allocations.
+        /// </summary>
+        private readonly int[] scratch = new int[16];
+
         private readonly byte[] averageBytesPerMb = { 50, 24, 16, 9, 7, 5, 3, 2 };
 
         private const int NumMbSegments = 4;
@@ -321,18 +326,19 @@ public void Encode<TPixel>(Image<TPixel> image, Stream stream)
             this.StatLoop(width, height, yStride, uvStride);
             it.Init();
             it.InitFilter();
+            var info = new Vp8ModeScore();
+            var residual = new Vp8Residual();
             do
             {
                 bool dontUseSkip = !this.Proba.UseSkipProba;
-
-                var info = new Vp8ModeScore();
+                info.Clear();
                 it.Import(y, u, v, yStride, uvStride, width, height, false);
 
                 // Warning! order is important: first call VP8Decimate() and
                 // *then* decide how to code the skip decision if there's one.
                 if (!this.Decimate(it, ref info, this.rdOptLevel) || dontUseSkip)
                 {
-                    this.CodeResiduals(it, info);
+                    this.CodeResiduals(it, info, residual);
                 }
                 else
                 {
@@ -447,9 +453,10 @@ private long OneStatPass(int width, int height, int yStride, int uvStride, Vp8Rd
 
             it.Init();
             this.SetLoopParams(stats.Q);
+            var info = new Vp8ModeScore();
             do
             {
-                var info = new Vp8ModeScore();
+                info.Clear();
                 it.Import(y, u, v, yStride, uvStride, width, height, false);
                 if (this.Decimate(it, ref info, rdOpt))
                 {
@@ -930,10 +937,9 @@ private bool Decimate(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8RdLevel rdOpt)
             return isSkipped;
         }
 
-        private void CodeResiduals(Vp8EncIterator it, Vp8ModeScore rd)
+        private void CodeResiduals(Vp8EncIterator it, Vp8ModeScore rd, Vp8Residual residual)
         {
             int x, y, ch;
-            var residual = new Vp8Residual();
             bool i16 = it.CurrentMacroBlockInfo.MacroBlockType == Vp8MacroBlockType.I16X16;
             int segment = it.CurrentMacroBlockInfo.Segment;
 
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index f8b4853e2a..0567a0f27d 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -68,22 +68,20 @@ static Vp8Encoding()
             }
         }
 
-        public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, bool doTwo)
+        public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, bool doTwo, Span<int> scratch)
         {
-            ITransformOne(reference, input, dst);
+            ITransformOne(reference, input, dst, scratch);
             if (doTwo)
             {
-                ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4));
+                ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch);
             }
         }
 
-        public static void ITransformOne(Span<byte> reference, Span<short> input, Span<byte> dst)
+        public static void ITransformOne(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
         {
             int i;
-#pragma warning disable SA1312 // Variable names should begin with lower-case letter
-            int[] C = new int[4 * 4];
-#pragma warning restore SA1312 // Variable names should begin with lower-case letter
-            Span<int> tmp = C.AsSpan();
+            Span<int> tmp = scratch.Slice(0, 16);
+            tmp.Clear();
             for (i = 0; i < 4; i++)
             {
                 // vertical pass.
@@ -99,7 +97,7 @@ public static void ITransformOne(Span<byte> reference, Span<short> input, Span<b
                 input = input.Slice(1);
             }
 
-            tmp = C.AsSpan();
+            tmp = scratch;
             for (i = 0; i < 4; i++)
             {
                 // horizontal pass.
@@ -116,16 +114,18 @@ public static void ITransformOne(Span<byte> reference, Span<short> input, Span<b
             }
         }
 
-        public static void FTransform2(Span<byte> src, Span<byte> reference, Span<short> output, Span<short> output2)
+        public static void FTransform2(Span<byte> src, Span<byte> reference, Span<short> output, Span<short> output2, Span<int> scratch)
         {
-            FTransform(src, reference, output);
-            FTransform(src.Slice(4), reference.Slice(4), output2);
+            FTransform(src, reference, output, scratch);
+            FTransform(src.Slice(4), reference.Slice(4), output2, scratch);
         }
 
-        public static void FTransform(Span<byte> src, Span<byte> reference, Span<short> output)
+        public static void FTransform(Span<byte> src, Span<byte> reference, Span<short> output, Span<int> scratch)
         {
             int i;
-            int[] tmp = new int[16];
+            Span<int> tmp = scratch.Slice(0, 16);
+            tmp.Clear();
+
             int srcIdx = 0;
             int refIdx = 0;
             for (i = 0; i < 4; i++)
@@ -160,9 +160,11 @@ public static void FTransform(Span<byte> src, Span<byte> reference, Span<short>
             }
         }
 
-        public static void FTransformWht(Span<short> input, Span<short> output)
+        public static void FTransformWht(Span<short> input, Span<short> output, Span<int> scratch)
         {
-            int[] tmp = new int[16];
+            Span<int> tmp = scratch.Slice(0, 16);
+            tmp.Clear();
+
             int i;
             int inputIdx = 0;
             for (i = 0; i < 4; i++)
@@ -234,11 +236,11 @@ public static void EncPredChroma8(Span<byte> dst, Span<byte> left, Span<byte> to
 
         // Left samples are top[-5 .. -2], top_left is top[-1], top are
         // located at top[0..3], and top right is top[4..7]
-        public static void EncPredLuma4(Span<byte> dst, Span<byte> top, int topOffset)
+        public static void EncPredLuma4(Span<byte> dst, Span<byte> top, int topOffset, Span<byte> vals)
         {
             Dc4(dst.Slice(I4DC4), top, topOffset);
             Tm4(dst.Slice(I4TM4), top, topOffset);
-            Ve4(dst.Slice(I4VE4), top, topOffset);
+            Ve4(dst.Slice(I4VE4), top, topOffset, vals);
             He4(dst.Slice(I4HE4), top, topOffset);
             Rd4(dst.Slice(I4RD4), top, topOffset);
             Vr4(dst.Slice(I4VR4), top, topOffset);
@@ -395,20 +397,16 @@ private static void Tm4(Span<byte> dst, Span<byte> top, int topOffset)
             }
         }
 
-        private static void Ve4(Span<byte> dst, Span<byte> top, int topOffset)
+        private static void Ve4(Span<byte> dst, Span<byte> top, int topOffset, Span<byte> vals)
         {
             // vertical
-            byte[] vals =
-            {
-                LossyUtils.Avg3(top[topOffset - 1], top[topOffset], top[topOffset + 1]),
-                LossyUtils.Avg3(top[topOffset], top[topOffset + 1], top[topOffset + 2]),
-                LossyUtils.Avg3(top[topOffset + 1], top[topOffset + 2], top[topOffset + 3]),
-                LossyUtils.Avg3(top[topOffset + 2], top[topOffset + 3], top[topOffset + 4])
-            };
-
+            vals[0] = LossyUtils.Avg3(top[topOffset - 1], top[topOffset], top[topOffset + 1]);
+            vals[1] = LossyUtils.Avg3(top[topOffset], top[topOffset + 1], top[topOffset + 2]);
+            vals[2] = LossyUtils.Avg3(top[topOffset + 1], top[topOffset + 2], top[topOffset + 3]);
+            vals[3] = LossyUtils.Avg3(top[topOffset + 2], top[topOffset + 3], top[topOffset + 4]);
             for (int i = 0; i < 4; i++)
             {
-                vals.AsSpan().CopyTo(dst.Slice(i * WebpConstants.Bps));
+                vals.CopyTo(dst.Slice(i * WebpConstants.Bps));
             }
         }
 
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
index 5d048514ea..7192fa2d05 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
@@ -8,6 +8,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
     internal class Vp8Histogram
     {
+        private readonly int[] scratch = new int[16];
+
+        private readonly short[] output = new short[16];
+
+        private readonly int[] distribution = new int[MaxCoeffThresh + 1];
+
         /// <summary>
         /// Size of histogram used by CollectHistogram.
         /// </summary>
@@ -40,23 +46,22 @@ public int GetAlpha()
         public void CollectHistogram(Span<byte> reference, Span<byte> pred, int startBlock, int endBlock)
         {
             int j;
-            int[] distribution = new int[MaxCoeffThresh + 1];
+            this.distribution.AsSpan().Clear();
             for (j = startBlock; j < endBlock; j++)
             {
-                short[] output = new short[16];
-
-                this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), output);
+                this.output.AsSpan().Clear();
+                this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output);
 
                 // Convert coefficients to bin.
                 for (int k = 0; k < 16; ++k)
                 {
-                    int v = Math.Abs(output[k]) >> 3;
+                    int v = Math.Abs(this.output[k]) >> 3;
                     int clippedValue = ClipMax(v, MaxCoeffThresh);
-                    ++distribution[clippedValue];
+                    ++this.distribution[clippedValue];
                 }
             }
 
-            this.SetHistogramData(distribution);
+            this.SetHistogramData(this.distribution);
         }
 
         public void Merge(Vp8Histogram other)
@@ -97,7 +102,9 @@ private void SetHistogramData(int[] distribution)
         private void Vp8FTransform(Span<byte> src, Span<byte> reference, Span<short> output)
         {
             int i;
-            int[] tmp = new int[16];
+            Span<int> tmp = this.scratch;
+            tmp.Clear();
+
             for (i = 0; i < 4; i++)
             {
                 int d0 = src[0] - reference[0];   // 9bit dynamic range ([-255,255])
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs
index 7182f60210..1c92a9d2d9 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs
@@ -1,6 +1,8 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
+using System;
+
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
     /// <summary>
@@ -93,6 +95,22 @@ public Vp8ModeScore()
         /// </summary>
         public int[,] Derr { get; }
 
+        public void Clear()
+        {
+            this.YDcLevels.AsSpan().Clear();
+            this.YAcLevels.AsSpan().Clear();
+            this.UvLevels.AsSpan().Clear();
+            this.ModesI4.AsSpan().Clear();
+
+            for (int i = 0; i < 2; i++)
+            {
+                for (int j = 0; j < 3; j++)
+                {
+                    this.Derr[i, j] = 0;
+                }
+            }
+        }
+
         public void InitScore()
         {
             this.D = 0;
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Residual.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Residual.cs
index 93d76e2836..2962ebbabc 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Residual.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Residual.cs
@@ -16,7 +16,7 @@ internal class Vp8Residual
 
         public int CoeffType { get; set; }
 
-        public short[] Coeffs { get; set; }
+        public short[] Coeffs { get; } = new short[16];
 
         public Vp8BandProbas[] Prob { get; set; }
 
@@ -31,6 +31,7 @@ public void Init(int first, int coeffType, Vp8EncProba prob)
             this.Prob = prob.Coeffs[this.CoeffType];
             this.Stats = prob.Stats[this.CoeffType];
             this.Costs = prob.RemappedCosts[this.CoeffType];
+            this.Coeffs.AsSpan().Clear();
         }
 
         public void SetCoeffs(Span<short> coeffs)
@@ -46,7 +47,7 @@ public void SetCoeffs(Span<short> coeffs)
                 }
             }
 
-            this.Coeffs = coeffs.Slice(0, 16).ToArray();
+            coeffs.Slice(0, 16).CopyTo(this.Coeffs);
         }
 
         // Simulate block coding, but only record statistics.
diff --git a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs
index ebb0b0aa4a..4f283f9f53 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs
@@ -34,6 +34,16 @@ internal sealed class WebpLossyDecoder
         /// </summary>
         private readonly Configuration configuration;
 
+        /// <summary>
+        /// Scratch buffer to reduce allocations.
+        /// </summary>
+        private readonly int[] scratch = new int[16];
+
+        /// <summary>
+        /// Another scratch buffer to reduce allocations.
+        /// </summary>
+        private readonly byte[] scratchBytes = new byte[4];
+
         /// <summary>
         /// Initializes a new instance of the <see cref="WebpLossyDecoder"/> class.
         /// </summary>
@@ -395,7 +405,7 @@ private void ReconstructRow(Vp8Decoder dec)
                                 LossyUtils.TM4(dst, yuv, offset);
                                 break;
                             case 2:
-                                LossyUtils.VE4(dst, yuv, offset);
+                                LossyUtils.VE4(dst, yuv, offset, this.scratchBytes);
                                 break;
                             case 3:
                                 LossyUtils.HE4(dst, yuv, offset);
@@ -420,7 +430,7 @@ private void ReconstructRow(Vp8Decoder dec)
                                 break;
                         }
 
-                        this.DoTransform(bits, coeffs.AsSpan(n * 16), dst);
+                        this.DoTransform(bits, coeffs.AsSpan(n * 16), dst, this.scratch);
                     }
                 }
                 else
@@ -456,7 +466,7 @@ private void ReconstructRow(Vp8Decoder dec)
                     {
                         for (int n = 0; n < 16; ++n, bits <<= 2)
                         {
-                            this.DoTransform(bits, coeffs.AsSpan(n * 16), yDst.Slice(WebpConstants.Scan[n]));
+                            this.DoTransform(bits, coeffs.AsSpan(n * 16), yDst.Slice(WebpConstants.Scan[n]), this.scratch);
                         }
                     }
                 }
@@ -496,8 +506,8 @@ private void ReconstructRow(Vp8Decoder dec)
                         break;
                 }
 
-                this.DoUVTransform(bitsUv, coeffs.AsSpan(16 * 16), uDst);
-                this.DoUVTransform(bitsUv >> 8, coeffs.AsSpan(20 * 16), vDst);
+                this.DoUVTransform(bitsUv, coeffs.AsSpan(16 * 16), uDst, this.scratch);
+                this.DoUVTransform(bitsUv >> 8, coeffs.AsSpan(20 * 16), vDst, this.scratch);
 
                 // Stash away top samples for next block.
                 if (mby < dec.MbHeight - 1)
@@ -787,12 +797,12 @@ private void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span
             }
         }
 
-        private void DoTransform(uint bits, Span<short> src, Span<byte> dst)
+        private void DoTransform(uint bits, Span<short> src, Span<byte> dst, Span<int> scratch)
         {
             switch (bits >> 30)
             {
                 case 3:
-                    LossyUtils.TransformOne(src, dst);
+                    LossyUtils.TransformOne(src, dst, scratch);
                     break;
                 case 2:
                     LossyUtils.TransformAc3(src, dst);
@@ -803,7 +813,7 @@ private void DoTransform(uint bits, Span<short> src, Span<byte> dst)
             }
         }
 
-        private void DoUVTransform(uint bits, Span<short> src, Span<byte> dst)
+        private void DoUVTransform(uint bits, Span<short> src, Span<byte> dst, Span<int> scratch)
         {
             // any non-zero coeff at all?
             if ((bits & 0xff) > 0)
@@ -811,7 +821,7 @@ private void DoUVTransform(uint bits, Span<short> src, Span<byte> dst)
                 // any non-zero AC coefficient?
                 if ((bits & 0xaa) > 0)
                 {
-                    LossyUtils.TransformUv(src, dst); // note we don't use the AC3 variant for U/V.
+                    LossyUtils.TransformUv(src, dst, scratch); // note we don't use the AC3 variant for U/V.
                 }
                 else
                 {
@@ -884,7 +894,7 @@ private bool ParseResiduals(Vp8Decoder dec, Vp8BitReader br, Vp8MacroBlock mb)
                 if (nz > 1)
                 {
                     // More than just the DC -> perform the full transform.
-                    LossyUtils.TransformWht(dc, dst);
+                    LossyUtils.TransformWht(dc, dst, this.scratch);
                 }
                 else
                 {
diff --git a/tests/ImageSharp.Tests/Formats/WebP/PredictorEncoderTests.cs b/tests/ImageSharp.Tests/Formats/WebP/PredictorEncoderTests.cs
index b480201989..d78f7e2f2a 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/PredictorEncoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/PredictorEncoderTests.cs
@@ -90,9 +90,10 @@ private static void RunColorSpaceTransformTestWithPeakImage()
             int transformWidth = LosslessUtils.SubSampleSize(image.Width, colorTransformBits);
             int transformHeight = LosslessUtils.SubSampleSize(image.Height, colorTransformBits);
             uint[] transformData = new uint[transformWidth * transformHeight];
+            int[] scratch = new int[256];
 
             // act
-            PredictorEncoder.ColorSpaceTransform(image.Width, image.Height, colorTransformBits, 75, bgra, transformData);
+            PredictorEncoder.ColorSpaceTransform(image.Width, image.Height, colorTransformBits, 75, bgra, transformData, scratch);
 
             // assert
             Assert.Equal(expectedData, transformData);
@@ -119,9 +120,10 @@ private static void RunColorSpaceTransformTestWithBikeImage()
             int transformWidth = LosslessUtils.SubSampleSize(image.Width, colorTransformBits);
             int transformHeight = LosslessUtils.SubSampleSize(image.Height, colorTransformBits);
             uint[] transformData = new uint[transformWidth * transformHeight];
+            int[] scratch = new int[256];
 
             // act
-            PredictorEncoder.ColorSpaceTransform(image.Width, image.Height, colorTransformBits, 75, bgra, transformData);
+            PredictorEncoder.ColorSpaceTransform(image.Width, image.Height, colorTransformBits, 75, bgra, transformData, scratch);
 
             // assert
             Assert.Equal(expectedData, transformData);

From ed8d2afcb07d7f56e48f1b59351d229389aaea3a Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 31 Oct 2021 13:26:31 +0100
Subject: [PATCH 09/85] Use Span version of Sort() to reduce allocations

---
 src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs | 5 +++++
 src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs  | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs
index f2321d6813..6320983bab 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs
@@ -202,9 +202,14 @@ public static void GenerateOptimalTree(HuffmanTree[] tree, uint[] histogram, int
                 }
 
                 // Build the Huffman tree.
+#if NET5_0_OR_GREATER
+                Span<HuffmanTree> treeSlice = tree.AsSpan().Slice(0, treeSize);
+                treeSlice.Sort(HuffmanTree.Compare);
+#else
                 HuffmanTree[] treeCopy = tree.AsSpan().Slice(0, treeSize).ToArray();
                 Array.Sort(treeCopy, HuffmanTree.Compare);
                 treeCopy.AsSpan().CopyTo(tree);
+#endif
 
                 if (treeSize > 1)
                 {
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
index 818488696e..29dbde8b03 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
@@ -1204,9 +1204,14 @@ private bool AnalyzeAndCreatePalette(ReadOnlySpan<uint> bgra, int width, int hei
                 return false;
             }
 
+#if NET5_0_OR_GREATER
+            var paletteSlice = palette.Slice(0, this.PaletteSize);
+            paletteSlice.Sort();
+#else
             uint[] paletteArray = palette.Slice(0, this.PaletteSize).ToArray();
             Array.Sort(paletteArray);
             paletteArray.CopyTo(palette);
+#endif
 
             if (PaletteHasNonMonotonousDeltas(palette, this.PaletteSize))
             {

From 15a10126d29f5e6b9c42544bc0cb4388cf32bdfe Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 31 Oct 2021 14:10:21 +0100
Subject: [PATCH 10/85] Define SSE and AVX masks as static readonly

---
 .../Formats/Webp/Lossless/LosslessUtils.cs    | 65 +++++++++++--------
 .../Formats/Webp/Lossless/PredictorEncoder.cs | 43 +++++++-----
 .../Formats/Webp/WebpCommonUtils.cs           | 56 ++++++++--------
 3 files changed, 93 insertions(+), 71 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 06204ae913..c195eb0fe1 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -27,6 +27,30 @@ internal static unsafe class LosslessUtils
 
         private const double Log2Reciprocal = 1.44269504088896338700465094007086;
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector256<byte> AddGreenToBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
+
+        private static readonly Vector128<byte> AddGreenToBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
+
+        private static readonly byte AddGreenToBlueAndRedShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
+
+        private static readonly Vector256<byte> SubtractGreenFromBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
+
+        private static readonly Vector128<byte> SubtractGreenFromBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
+
+        private static readonly byte SubtractGreenFromBlueAndRedShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
+
+        private static readonly Vector128<byte> TransformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+        private static readonly Vector128<byte> TransformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
+
+        private static readonly byte TransformColorShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
+
+        private static readonly Vector128<byte> TransformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+        private static readonly byte TransformColorInverseShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
+#endif
+
         /// <summary>
         /// Returns the exact index where array1 and array2 are different. For an index
         /// inferior or equal to bestLenMatch, the return value just has to be strictly
@@ -97,7 +121,6 @@ public static void AddGreenToBlueAndRed(Span<uint> pixelData)
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Avx2.IsSupported)
             {
-                var mask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
                 int numPixels = pixelData.Length;
                 fixed (uint* p = pixelData)
                 {
@@ -106,7 +129,7 @@ public static void AddGreenToBlueAndRed(Span<uint> pixelData)
                     {
                         uint* idx = p + i;
                         Vector256<byte> input = Avx.LoadVector256((ushort*)idx).AsByte();
-                        Vector256<byte> in0g0g = Avx2.Shuffle(input, mask);
+                        Vector256<byte> in0g0g = Avx2.Shuffle(input, AddGreenToBlueAndRedMaskAvx2);
                         Vector256<byte> output = Avx2.Add(input, in0g0g);
                         Avx.Store((byte*)idx, output);
                     }
@@ -119,7 +142,6 @@ public static void AddGreenToBlueAndRed(Span<uint> pixelData)
             }
             else if (Ssse3.IsSupported)
             {
-                var mask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
                 int numPixels = pixelData.Length;
                 fixed (uint* p = pixelData)
                 {
@@ -128,7 +150,7 @@ public static void AddGreenToBlueAndRed(Span<uint> pixelData)
                     {
                         uint* idx = p + i;
                         Vector128<byte> input = Sse2.LoadVector128((ushort*)idx).AsByte();
-                        Vector128<byte> in0g0g = Ssse3.Shuffle(input, mask);
+                        Vector128<byte> in0g0g = Ssse3.Shuffle(input, AddGreenToBlueAndRedMaskSsse3);
                         Vector128<byte> output = Sse2.Add(input, in0g0g);
                         Sse2.Store((byte*)idx, output.AsByte());
                     }
@@ -141,7 +163,6 @@ public static void AddGreenToBlueAndRed(Span<uint> pixelData)
             }
             else if (Sse2.IsSupported)
             {
-                byte mask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
                 int numPixels = pixelData.Length;
                 fixed (uint* p = pixelData)
                 {
@@ -151,8 +172,8 @@ public static void AddGreenToBlueAndRed(Span<uint> pixelData)
                         uint* idx = p + i;
                         Vector128<ushort> input = Sse2.LoadVector128((ushort*)idx);
                         Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
-                        Vector128<ushort> b = Sse2.ShuffleLow(a, mask);
-                        Vector128<ushort> c = Sse2.ShuffleHigh(b, mask); // 0g0g
+                        Vector128<ushort> b = Sse2.ShuffleLow(a, AddGreenToBlueAndRedShuffleMask);
+                        Vector128<ushort> c = Sse2.ShuffleHigh(b, AddGreenToBlueAndRedShuffleMask); // 0g0g
                         Vector128<byte> output = Sse2.Add(input.AsByte(), c.AsByte());
                         Sse2.Store((byte*)idx, output);
                     }
@@ -189,7 +210,6 @@ public static void SubtractGreenFromBlueAndRed(Span<uint> pixelData)
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Avx2.IsSupported)
             {
-                var mask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
                 int numPixels = pixelData.Length;
                 fixed (uint* p = pixelData)
                 {
@@ -198,7 +218,7 @@ public static void SubtractGreenFromBlueAndRed(Span<uint> pixelData)
                     {
                         uint* idx = p + i;
                         Vector256<byte> input = Avx.LoadVector256((ushort*)idx).AsByte();
-                        Vector256<byte> in0g0g = Avx2.Shuffle(input, mask);
+                        Vector256<byte> in0g0g = Avx2.Shuffle(input, SubtractGreenFromBlueAndRedMaskAvx2);
                         Vector256<byte> output = Avx2.Subtract(input, in0g0g);
                         Avx.Store((byte*)idx, output);
                     }
@@ -211,7 +231,6 @@ public static void SubtractGreenFromBlueAndRed(Span<uint> pixelData)
             }
             else if (Ssse3.IsSupported)
             {
-                var mask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
                 int numPixels = pixelData.Length;
                 fixed (uint* p = pixelData)
                 {
@@ -220,7 +239,7 @@ public static void SubtractGreenFromBlueAndRed(Span<uint> pixelData)
                     {
                         uint* idx = p + i;
                         Vector128<byte> input = Sse2.LoadVector128((ushort*)idx).AsByte();
-                        Vector128<byte> in0g0g = Ssse3.Shuffle(input, mask);
+                        Vector128<byte> in0g0g = Ssse3.Shuffle(input, SubtractGreenFromBlueAndRedMaskSsse3);
                         Vector128<byte> output = Sse2.Subtract(input, in0g0g);
                         Sse2.Store((byte*)idx, output.AsByte());
                     }
@@ -233,7 +252,6 @@ public static void SubtractGreenFromBlueAndRed(Span<uint> pixelData)
             }
             else if (Sse2.IsSupported)
             {
-                byte mask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
                 int numPixels = pixelData.Length;
                 fixed (uint* p = pixelData)
                 {
@@ -243,8 +261,8 @@ public static void SubtractGreenFromBlueAndRed(Span<uint> pixelData)
                         uint* idx = p + i;
                         Vector128<ushort> input = Sse2.LoadVector128((ushort*)idx);
                         Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
-                        Vector128<ushort> b = Sse2.ShuffleLow(a, mask);
-                        Vector128<ushort> c = Sse2.ShuffleHigh(b, mask); // 0g0g
+                        Vector128<ushort> b = Sse2.ShuffleLow(a, SubtractGreenFromBlueAndRedShuffleMask);
+                        Vector128<ushort> c = Sse2.ShuffleHigh(b, SubtractGreenFromBlueAndRedShuffleMask); // 0g0g
                         Vector128<byte> output = Sse2.Subtract(input.AsByte(), c.AsByte());
                         Sse2.Store((byte*)idx, output);
                     }
@@ -394,9 +412,6 @@ public static void TransformColor(Vp8LMultipliers m, Span<uint> data, int numPix
             {
                 Vector128<int> multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
                 Vector128<int> multsb2 = MkCst16(Cst5b(m.RedToBlue), 0);
-                var maskalphagreen = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
-                var maskredblue = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
-                byte shufflemask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
                 fixed (uint* src = data)
                 {
                     int idx;
@@ -404,15 +419,15 @@ public static void TransformColor(Vp8LMultipliers m, Span<uint> data, int numPix
                     {
                         uint* pos = src + idx;
                         Vector128<uint> input = Sse2.LoadVector128(pos);
-                        Vector128<byte> a = Sse2.And(input.AsByte(), maskalphagreen);
-                        Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), shufflemask);
-                        Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), shufflemask);
+                        Vector128<byte> a = Sse2.And(input.AsByte(), TransformColorAlphaGreenMask);
+                        Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask);
+                        Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask);
                         Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
                         Vector128<short> e = Sse2.ShiftLeftLogical(input.AsInt16(), 8);
                         Vector128<short> f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
                         Vector128<int> g = Sse2.ShiftRightLogical(f.AsInt32(), 16);
                         Vector128<byte> h = Sse2.Add(g.AsByte(), d.AsByte());
-                        Vector128<byte> i = Sse2.And(h, maskredblue);
+                        Vector128<byte> i = Sse2.And(h, TransformColorRedBlueMask);
                         Vector128<byte> output = Sse2.Subtract(input.AsByte(), i);
                         Sse2.Store((byte*)pos, output);
                     }
@@ -460,8 +475,6 @@ public static void TransformColorInverse(Vp8LMultipliers m, Span<uint> pixelData
             {
                 Vector128<int> multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
                 Vector128<int> multsb2 = MkCst16(Cst5b(m.RedToBlue), 0);
-                var maskalphagreen = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
-                byte shufflemask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
                 fixed (uint* src = pixelData)
                 {
                     int idx;
@@ -469,9 +482,9 @@ public static void TransformColorInverse(Vp8LMultipliers m, Span<uint> pixelData
                     {
                         uint* pos = src + idx;
                         Vector128<uint> input = Sse2.LoadVector128(pos);
-                        Vector128<byte> a = Sse2.And(input.AsByte(), maskalphagreen);
-                        Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), shufflemask);
-                        Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), shufflemask);
+                        Vector128<byte> a = Sse2.And(input.AsByte(), TransformColorInverseAlphaGreenMask);
+                        Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask);
+                        Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask);
                         Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
                         Vector128<byte> e = Sse2.Add(input.AsByte(), d.AsByte());
                         Vector128<short> f = Sse2.ShiftLeftLogical(e.AsInt16(), 8);
diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
index 713fc79194..abb7274472 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
@@ -36,6 +36,22 @@ internal static unsafe class PredictorEncoder
 
         private const int PredLowEffort = 11;
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector128<byte> CollectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte();
+
+        private static readonly Vector128<byte> CollectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte();
+
+        private static readonly Vector128<byte> CollectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+        private static readonly Vector128<byte> CollectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+
+        private static readonly Vector128<byte> CollectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
+
+        private static readonly Vector128<byte> CollectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
+
+        private static readonly Vector128<byte> CollectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
+#endif
+
         /// <summary>
         /// Finds the best predictor for each tile, and converts the image to residuals
         /// with respect to predictions. If nearLosslessQuality &lt; 100, applies
@@ -1039,9 +1055,6 @@ private static void CollectColorRedTransforms(Span<uint> bgra, int stride, int t
             if (Sse41.IsSupported)
             {
                 var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
-                var maskgreen = Vector128.Create(0x00ff00);
-                var mask = Vector128.Create((short)0xff);
-
                 const int span = 8;
                 Span<ushort> values = stackalloc ushort[span];
                 for (int y = 0; y < tileHeight; y++)
@@ -1057,15 +1070,15 @@ private static void CollectColorRedTransforms(Span<uint> bgra, int stride, int t
                             uint* input1Idx = src + x + (span / 2);
                             Vector128<byte> input0 = Sse2.LoadVector128((ushort*)input0Idx).AsByte();
                             Vector128<byte> input1 = Sse2.LoadVector128((ushort*)input1Idx).AsByte();
-                            Vector128<byte> g0 = Sse2.And(input0, maskgreen.AsByte()); // 0 0  | g 0
-                            Vector128<byte> g1 = Sse2.And(input1, maskgreen.AsByte());
+                            Vector128<byte> g0 = Sse2.And(input0, CollectColorRedTransformsGreenMask); // 0 0  | g 0
+                            Vector128<byte> g1 = Sse2.And(input1, CollectColorRedTransformsGreenMask);
                             Vector128<ushort> g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
                             Vector128<int> a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0  | x r
                             Vector128<int> a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16);
                             Vector128<ushort> a = Sse41.PackUnsignedSaturate(a0, a1); // x r
                             Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr
                             Vector128<byte> c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r'
-                            Vector128<byte> d = Sse2.And(c, mask.AsByte()); // 0 r'
+                            Vector128<byte> d = Sse2.And(c, CollectColorRedTransformsAndMask); // 0 r'
                             Sse2.Store(dst, d.AsUInt16());
                             for (int i = 0; i < span; i++)
                             {
@@ -1113,12 +1126,6 @@ private static void CollectColorBlueTransforms(Span<uint> bgra, int stride, int
                 Span<ushort> values = stackalloc ushort[span];
                 var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
                 var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
-                var maskgreen = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
-                var maskgreenblue = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
-                var maskblue = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
-                var shufflerLow = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
-                var shufflerHigh = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
-
                 for (int y = 0; y < tileHeight; y++)
                 {
                     Span<uint> srcSpan = bgra.Slice(y * stride);
@@ -1132,18 +1139,18 @@ private static void CollectColorBlueTransforms(Span<uint> bgra, int stride, int
                             uint* input1Idx = src + x + (span / 2);
                             Vector128<byte> input0 = Sse2.LoadVector128((ushort*)input0Idx).AsByte();
                             Vector128<byte> input1 = Sse2.LoadVector128((ushort*)input1Idx).AsByte();
-                            Vector128<byte> r0 = Ssse3.Shuffle(input0, shufflerLow);
-                            Vector128<byte> r1 = Ssse3.Shuffle(input1, shufflerHigh);
+                            Vector128<byte> r0 = Ssse3.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask);
+                            Vector128<byte> r1 = Ssse3.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask);
                             Vector128<byte> r = Sse2.Or(r0, r1);
-                            Vector128<byte> gb0 = Sse2.And(input0, maskgreenblue);
-                            Vector128<byte> gb1 = Sse2.And(input1, maskgreenblue);
+                            Vector128<byte> gb0 = Sse2.And(input0, CollectColorBlueTransformsGreenBlueMask);
+                            Vector128<byte> gb1 = Sse2.And(input1, CollectColorBlueTransformsGreenBlueMask);
                             Vector128<ushort> gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
-                            Vector128<byte> g = Sse2.And(gb.AsByte(), maskgreen);
+                            Vector128<byte> g = Sse2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask);
                             Vector128<short> a = Sse2.MultiplyHigh(r.AsInt16(), multsr);
                             Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg);
                             Vector128<byte> c = Sse2.Subtract(gb.AsByte(), b.AsByte());
                             Vector128<byte> d = Sse2.Subtract(c, a.AsByte());
-                            Vector128<byte> e = Sse2.And(d, maskblue);
+                            Vector128<byte> e = Sse2.And(d, CollectColorBlueTransformsBlueMask);
                             Sse2.Store(dst, e.AsUInt16());
                             for (int i = 0; i < span; i++)
                             {
diff --git a/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs b/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
index d6e8d0a068..4251af7428 100644
--- a/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
+++ b/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
@@ -16,6 +16,16 @@ namespace SixLabors.ImageSharp.Formats.Webp
     /// </summary>
     internal static class WebpCommonUtils
     {
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector256<byte> AlphaMaskVector256 = Vector256.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
+
+        private static readonly Vector256<byte> All0x80Vector256 = Vector256.Create((byte)0x80).AsByte();
+
+        private static readonly Vector128<byte> AlphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
+
+        private static readonly Vector128<byte> All0x80 = Vector128.Create((byte)0x80).AsByte();
+#endif
+
         /// <summary>
         /// Checks if the pixel row is not opaque.
         /// </summary>
@@ -27,11 +37,6 @@ public static unsafe bool CheckNonOpaque(Span<Bgra32> row)
             if (Avx2.IsSupported)
             {
                 ReadOnlySpan<byte> rowBytes = MemoryMarshal.AsBytes(row);
-                var alphaMaskVector256 = Vector256.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
-                Vector256<byte> all0x80Vector256 = Vector256.Create((byte)0x80).AsByte();
-                var alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
-                Vector128<byte> all0x80 = Vector128.Create((byte)0x80).AsByte();
-
                 int i = 0;
                 int length = (row.Length * 4) - 3;
                 fixed (byte* src = rowBytes)
@@ -42,14 +47,14 @@ public static unsafe bool CheckNonOpaque(Span<Bgra32> row)
                         Vector256<byte> a1 = Avx.LoadVector256(src + i + 32).AsByte();
                         Vector256<byte> a2 = Avx.LoadVector256(src + i + 64).AsByte();
                         Vector256<byte> a3 = Avx.LoadVector256(src + i + 96).AsByte();
-                        Vector256<int> b0 = Avx2.And(a0, alphaMaskVector256).AsInt32();
-                        Vector256<int> b1 = Avx2.And(a1, alphaMaskVector256).AsInt32();
-                        Vector256<int> b2 = Avx2.And(a2, alphaMaskVector256).AsInt32();
-                        Vector256<int> b3 = Avx2.And(a3, alphaMaskVector256).AsInt32();
+                        Vector256<int> b0 = Avx2.And(a0, AlphaMaskVector256).AsInt32();
+                        Vector256<int> b1 = Avx2.And(a1, AlphaMaskVector256).AsInt32();
+                        Vector256<int> b2 = Avx2.And(a2, AlphaMaskVector256).AsInt32();
+                        Vector256<int> b3 = Avx2.And(a3, AlphaMaskVector256).AsInt32();
                         Vector256<short> c0 = Avx2.PackSignedSaturate(b0, b1).AsInt16();
                         Vector256<short> c1 = Avx2.PackSignedSaturate(b2, b3).AsInt16();
                         Vector256<byte> d = Avx2.PackSignedSaturate(c0, c1).AsByte();
-                        Vector256<byte> bits = Avx2.CompareEqual(d, all0x80Vector256);
+                        Vector256<byte> bits = Avx2.CompareEqual(d, All0x80Vector256);
                         int mask = Avx2.MoveMask(bits);
                         if (mask != -1)
                         {
@@ -59,7 +64,7 @@ public static unsafe bool CheckNonOpaque(Span<Bgra32> row)
 
                     for (; i + 64 <= length; i += 64)
                     {
-                        if (IsNoneOpaque64Bytes(src, i, alphaMask, all0x80))
+                        if (IsNoneOpaque64Bytes(src, i))
                         {
                             return true;
                         }
@@ -67,7 +72,7 @@ public static unsafe bool CheckNonOpaque(Span<Bgra32> row)
 
                     for (; i + 32 <= length; i += 32)
                     {
-                        if (IsNoneOpaque32Bytes(src, i, alphaMask, all0x80))
+                        if (IsNoneOpaque32Bytes(src, i))
                         {
                             return true;
                         }
@@ -85,16 +90,13 @@ public static unsafe bool CheckNonOpaque(Span<Bgra32> row)
             else if (Sse2.IsSupported)
             {
                 ReadOnlySpan<byte> rowBytes = MemoryMarshal.AsBytes(row);
-                var alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
-                Vector128<byte> all0x80 = Vector128.Create((byte)0x80).AsByte();
-
                 int i = 0;
                 int length = (row.Length * 4) - 3;
                 fixed (byte* src = rowBytes)
                 {
                     for (; i + 64 <= length; i += 64)
                     {
-                        if (IsNoneOpaque64Bytes(src, i, alphaMask, all0x80))
+                        if (IsNoneOpaque64Bytes(src, i))
                         {
                             return true;
                         }
@@ -102,7 +104,7 @@ public static unsafe bool CheckNonOpaque(Span<Bgra32> row)
 
                     for (; i + 32 <= length; i += 32)
                     {
-                        if (IsNoneOpaque32Bytes(src, i, alphaMask, all0x80))
+                        if (IsNoneOpaque32Bytes(src, i))
                         {
                             return true;
                         }
@@ -133,20 +135,20 @@ public static unsafe bool CheckNonOpaque(Span<Bgra32> row)
         }
 
 #if SUPPORTS_RUNTIME_INTRINSICS
-        private static unsafe bool IsNoneOpaque64Bytes(byte* src, int i, Vector128<byte> alphaMask, Vector128<byte> all0x80)
+        private static unsafe bool IsNoneOpaque64Bytes(byte* src, int i)
         {
             Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
             Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
             Vector128<byte> a2 = Sse2.LoadVector128(src + i + 32).AsByte();
             Vector128<byte> a3 = Sse2.LoadVector128(src + i + 48).AsByte();
-            Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
-            Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
-            Vector128<int> b2 = Sse2.And(a2, alphaMask).AsInt32();
-            Vector128<int> b3 = Sse2.And(a3, alphaMask).AsInt32();
+            Vector128<int> b0 = Sse2.And(a0, AlphaMask).AsInt32();
+            Vector128<int> b1 = Sse2.And(a1, AlphaMask).AsInt32();
+            Vector128<int> b2 = Sse2.And(a2, AlphaMask).AsInt32();
+            Vector128<int> b3 = Sse2.And(a3, AlphaMask).AsInt32();
             Vector128<short> c0 = Sse2.PackSignedSaturate(b0, b1).AsInt16();
             Vector128<short> c1 = Sse2.PackSignedSaturate(b2, b3).AsInt16();
             Vector128<byte> d = Sse2.PackSignedSaturate(c0, c1).AsByte();
-            Vector128<byte> bits = Sse2.CompareEqual(d, all0x80);
+            Vector128<byte> bits = Sse2.CompareEqual(d, All0x80);
             int mask = Sse2.MoveMask(bits);
             if (mask != 0xFFFF)
             {
@@ -156,15 +158,15 @@ private static unsafe bool IsNoneOpaque64Bytes(byte* src, int i, Vector128<byte>
             return false;
         }
 
-        private static unsafe bool IsNoneOpaque32Bytes(byte* src, int i, Vector128<byte> alphaMask, Vector128<byte> all0x80)
+        private static unsafe bool IsNoneOpaque32Bytes(byte* src, int i)
         {
             Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
             Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
-            Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
-            Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
+            Vector128<int> b0 = Sse2.And(a0, AlphaMask).AsInt32();
+            Vector128<int> b1 = Sse2.And(a1, AlphaMask).AsInt32();
             Vector128<short> c = Sse2.PackSignedSaturate(b0, b1).AsInt16();
             Vector128<byte> d = Sse2.PackSignedSaturate(c, c).AsByte();
-            Vector128<byte> bits = Sse2.CompareEqual(d, all0x80);
+            Vector128<byte> bits = Sse2.CompareEqual(d, All0x80);
             int mask = Sse2.MoveMask(bits);
             if (mask != 0xFFFF)
             {

From e51f5008c3a53f203d0d9f21957146f95a6bf17b Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 31 Oct 2021 16:51:37 +0100
Subject: [PATCH 11/85] Add AggressiveInlining to LevelCost

---
 src/ImageSharp/Formats/Webp/Lossy/Vp8Residual.cs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Residual.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Residual.cs
index 2962ebbabc..4eeeedd376 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Residual.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Residual.cs
@@ -2,6 +2,7 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
+using System.Runtime.CompilerServices;
 
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
@@ -151,6 +152,7 @@ public int GetResidualCost(int ctx0)
             return cost;
         }
 
+        [MethodImpl(InliningOptions.ShortMethod)]
         private static int LevelCost(Span<ushort> table, int level)
             => WebpLookupTables.Vp8LevelFixedCosts[level] + table[level > WebpConstants.MaxVariableLevel ? WebpConstants.MaxVariableLevel : level];
 

From e4352b9e0bcb160732fa63b88e1bd7dcf05c0dd6 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 31 Oct 2021 19:29:59 +0100
Subject: [PATCH 12/85] Use byte arrays instead of Dictionaries for lookups

---
 .../Formats/Webp/Lossy/LossyUtils.cs          |  48 ++--
 .../Formats/Webp/WebpLookupTables.cs          | 243 +++++++++++++++---
 2 files changed, 234 insertions(+), 57 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 1584237b0c..1a6ace16fa 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -934,11 +934,11 @@ private static void DoFilter2(Span<byte> p, int offset, int step)
             int p0 = p[offset - step];
             int q0 = p[offset];
             int q1 = p[offset + step];
-            int a = (3 * (q0 - p0)) + WebpLookupTables.Sclip1[p1 - q1];
-            int a1 = WebpLookupTables.Sclip2[(a + 4) >> 3];
-            int a2 = WebpLookupTables.Sclip2[(a + 3) >> 3];
-            p[offset - step] = WebpLookupTables.Clip1[p0 + a2];
-            p[offset] = WebpLookupTables.Clip1[q0 - a1];
+            int a = (3 * (q0 - p0)) + WebpLookupTables.Sclip1[p1 - q1 + 1020];
+            int a1 = WebpLookupTables.Sclip2[((a + 4) >> 3) + 112];
+            int a2 = WebpLookupTables.Sclip2[((a + 3) >> 3) + 112];
+            p[offset - step] = WebpLookupTables.Clip1[p0 + a2 + 255];
+            p[offset] = WebpLookupTables.Clip1[q0 - a1 + 255];
         }
 
         private static void DoFilter4(Span<byte> p, int offset, int step)
@@ -950,13 +950,13 @@ private static void DoFilter4(Span<byte> p, int offset, int step)
             int q0 = p[offset];
             int q1 = p[offset + step];
             int a = 3 * (q0 - p0);
-            int a1 = WebpLookupTables.Sclip2[(a + 4) >> 3];
-            int a2 = WebpLookupTables.Sclip2[(a + 3) >> 3];
+            int a1 = WebpLookupTables.Sclip2[((a + 4) >> 3) + 112];
+            int a2 = WebpLookupTables.Sclip2[((a + 3) >> 3) + 112];
             int a3 = (a1 + 1) >> 1;
-            p[offsetMinus2Step] = WebpLookupTables.Clip1[p1 + a3];
-            p[offset - step] = WebpLookupTables.Clip1[p0 + a2];
-            p[offset] = WebpLookupTables.Clip1[q0 - a1];
-            p[offset + step] = WebpLookupTables.Clip1[q1 - a3];
+            p[offsetMinus2Step] = WebpLookupTables.Clip1[p1 + a3 + 255];
+            p[offset - step] = WebpLookupTables.Clip1[p0 + a2 + 255];
+            p[offset] = WebpLookupTables.Clip1[q0 - a1 + 255];
+            p[offset + step] = WebpLookupTables.Clip1[q1 - a3 + 255];
         }
 
         private static void DoFilter6(Span<byte> p, int offset, int step)
@@ -971,18 +971,18 @@ private static void DoFilter6(Span<byte> p, int offset, int step)
             int q0 = p[offset];
             int q1 = p[offset + step];
             int q2 = p[offset + step2];
-            int a = WebpLookupTables.Sclip1[(3 * (q0 - p0)) + WebpLookupTables.Sclip1[p1 - q1]];
+            int a = WebpLookupTables.Sclip1[(3 * (q0 - p0)) + WebpLookupTables.Sclip1[p1 - q1 + 1020] + 1020];
 
             // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
             int a1 = ((27 * a) + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
             int a2 = ((18 * a) + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
             int a3 = ((9 * a) + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
-            p[offset - step3] = WebpLookupTables.Clip1[p2 + a3];
-            p[offset - step2] = WebpLookupTables.Clip1[p1 + a2];
-            p[offsetMinusStep] = WebpLookupTables.Clip1[p0 + a1];
-            p[offset] = WebpLookupTables.Clip1[q0 - a1];
-            p[offset + step] = WebpLookupTables.Clip1[q1 - a2];
-            p[offset + step2] = WebpLookupTables.Clip1[q2 - a3];
+            p[offset - step3] = WebpLookupTables.Clip1[p2 + a3 + 255];
+            p[offset - step2] = WebpLookupTables.Clip1[p1 + a2 + 255];
+            p[offsetMinusStep] = WebpLookupTables.Clip1[p0 + a1 + 255];
+            p[offset] = WebpLookupTables.Clip1[q0 - a1 + 255];
+            p[offset + step] = WebpLookupTables.Clip1[q1 - a2 + 255];
+            p[offset + step2] = WebpLookupTables.Clip1[q2 - a3 + 255];
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -992,7 +992,7 @@ private static bool NeedsFilter(Span<byte> p, int offset, int step, int t)
             int p0 = p[offset - step];
             int q0 = p[offset];
             int q1 = p[offset + step];
-            return (4 * WebpLookupTables.Abs0[p0 - q0]) + WebpLookupTables.Abs0[p1 - q1] <= t;
+            return (4 * WebpLookupTables.Abs0[p0 - q0 + 255]) + WebpLookupTables.Abs0[p1 - q1 + 255] <= t;
         }
 
         private static bool NeedsFilter2(Span<byte> p, int offset, int step, int t, int it)
@@ -1007,14 +1007,14 @@ private static bool NeedsFilter2(Span<byte> p, int offset, int step, int t, int
             int q1 = p[offset + step];
             int q2 = p[offset + step2];
             int q3 = p[offset + step3];
-            if ((4 * WebpLookupTables.Abs0[p0 - q0]) + WebpLookupTables.Abs0[p1 - q1] > t)
+            if ((4 * WebpLookupTables.Abs0[p0 - q0 + 255]) + WebpLookupTables.Abs0[p1 - q1 + 255] > t)
             {
                 return false;
             }
 
-            return WebpLookupTables.Abs0[p3 - p2] <= it && WebpLookupTables.Abs0[p2 - p1] <= it &&
-                   WebpLookupTables.Abs0[p1 - p0] <= it && WebpLookupTables.Abs0[q3 - q2] <= it &&
-                   WebpLookupTables.Abs0[q2 - q1] <= it && WebpLookupTables.Abs0[q1 - q0] <= it;
+            return WebpLookupTables.Abs0[p3 - p2 + 255] <= it && WebpLookupTables.Abs0[p2 - p1 + 255] <= it &&
+                   WebpLookupTables.Abs0[p1 - p0 + 255] <= it && WebpLookupTables.Abs0[q3 - q2 + 255] <= it &&
+                   WebpLookupTables.Abs0[q2 - q1 + 255] <= it && WebpLookupTables.Abs0[q1 - q0 + 255] <= it;
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -1024,7 +1024,7 @@ private static bool Hev(Span<byte> p, int offset, int step, int thresh)
             int p0 = p[offset - step];
             int q0 = p[offset];
             int q1 = p[offset + step];
-            return WebpLookupTables.Abs0[p1 - p0] > thresh || WebpLookupTables.Abs0[q1 - q0] > thresh;
+            return WebpLookupTables.Abs0[p1 - p0 + 255] > thresh || WebpLookupTables.Abs0[q1 - q0 + 255] > thresh;
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/Formats/Webp/WebpLookupTables.cs b/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
index 57b5739c79..768f4a8da3 100644
--- a/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
+++ b/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
@@ -2,21 +2,12 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
-using System.Collections.Generic;
 
 namespace SixLabors.ImageSharp.Formats.Webp
 {
 #pragma warning disable SA1201 // Elements should appear in the correct order
     internal static class WebpLookupTables
     {
-        public static readonly Dictionary<int, byte> Abs0;
-
-        public static readonly Dictionary<int, byte> Clip1;
-
-        public static readonly Dictionary<int, sbyte> Sclip1;
-
-        public static readonly Dictionary<int, sbyte> Sclip2;
-
         public static readonly byte[,][] ModesProba = new byte[10, 10][];
 
         public static readonly ushort[] GammaToLinearTab = new ushort[256];
@@ -54,6 +45,216 @@ internal static class WebpLookupTables
             8 + (0 * WebpConstants.Bps), 12 + (0 * WebpConstants.Bps), 8 + (4 * WebpConstants.Bps), 12 + (4 * WebpConstants.Bps) // V
         };
 
+        public static readonly byte[] Abs0 =
+        {
+            0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0, 0xef,
+            0xee, 0xed, 0xec, 0xeb, 0xea, 0xe9, 0xe8, 0xe7, 0xe6, 0xe5, 0xe4, 0xe3, 0xe2, 0xe1, 0xe0, 0xdf, 0xde,
+            0xdd, 0xdc, 0xdb, 0xda, 0xd9, 0xd8, 0xd7, 0xd6, 0xd5, 0xd4, 0xd3, 0xd2, 0xd1, 0xd0, 0xcf, 0xce, 0xcd,
+            0xcc, 0xcb, 0xca, 0xc9, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, 0xc3, 0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc,
+            0xbb, 0xba, 0xb9, 0xb8, 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac, 0xab,
+            0xaa, 0xa9, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1, 0xa0, 0x9f, 0x9e, 0x9d, 0x9c, 0x9b, 0x9a,
+            0x99, 0x98, 0x97, 0x96, 0x95, 0x94, 0x93, 0x92, 0x91, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89,
+            0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80, 0x7f, 0x7e, 0x7d, 0x7c, 0x7b, 0x7a, 0x79, 0x78,
+            0x77, 0x76, 0x75, 0x74, 0x73, 0x72, 0x71, 0x70, 0x6f, 0x6e, 0x6d, 0x6c, 0x6b, 0x6a, 0x69, 0x68, 0x67,
+            0x66, 0x65, 0x64, 0x63, 0x62, 0x61, 0x60, 0x5f, 0x5e, 0x5d, 0x5c, 0x5b, 0x5a, 0x59, 0x58, 0x57, 0x56,
+            0x55, 0x54, 0x53, 0x52, 0x51, 0x50, 0x4f, 0x4e, 0x4d, 0x4c, 0x4b, 0x4a, 0x49, 0x48, 0x47, 0x46, 0x45,
+            0x44, 0x43, 0x42, 0x41, 0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34,
+            0x33, 0x32, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x23,
+            0x22, 0x21, 0x20, 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12,
+            0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
+            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+            0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21,
+            0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32,
+            0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43,
+            0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54,
+            0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65,
+            0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
+            0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+            0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+            0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9,
+            0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
+            0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
+            0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc,
+            0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed,
+            0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
+            0xff
+        };
+
+        public static readonly sbyte[] Sclip1 =
+        {
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -127, -126, -125, -124, -123, -122, -121, -120,
+            -119, -118, -117, -116, -115, -114, -113, -112, -111, -110, -109, -108, -107, -106, -105, -104, -103,
+            -102, -101, -100, -99, -98, -97, -96, -95, -94, -93, -92, -91, -90, -89, -88, -87, -86, -85, -84, -83,
+            -82, -81, -80, -79, -78, -77, -76, -75, -74, -73, -72, -71, -70, -69, -68, -67, -66, -65, -64, -63, -62,
+            -61, -60, -59, -58, -57, -56, -55, -54, -53, -52, -51, -50, -49, -48, -47, -46, -45, -44, -43, -42, -41,
+            -40, -39, -38, -37, -36, -35, -34, -33, -32, -31, -30, -29, -28, -27, -26, -25, -24, -23, -22, -21, -20,
+            -19, -18, -17, -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5,
+            6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+            33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+            59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
+            85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
+            109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127
+        };
+
+        public static readonly sbyte[] Sclip2 =
+        {
+            -112, -111, -110, -109, -108, -107, -106, -105, -104, -103, -102, -101, -100, -99, -98, -97, -96, -95,
+            -94, -93, -92, -91, -90, -89, -88, -87, -86, -85, -84, -83, -82, -81, -80, -79, -78, -77, -76, -75, -74,
+            -73, -72, -71, -70, -69, -68, -67, -66, -65, -64, -63, -62, -61, -60, -59, -58, -57, -56, -55, -54, -53,
+            -52, -51, -50, -49, -48, -47, -46, -45, -44, -43, -42, -41, -40, -39, -38, -37, -36, -35, -34, -33, -32,
+            -31, -30, -29, -28, -27, -26, -25, -24, -23, -22, -21, -20, -19, -18, -17, -16, -15, -14, -13, -12, -11,
+            -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+            44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+            70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+            96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112
+        };
+
+        public static readonly byte[] Clip1 =
+        {
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+            0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21,
+            0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32,
+            0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43,
+            0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54,
+            0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65,
+            0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
+            0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+            0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+            0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9,
+            0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
+            0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
+            0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc,
+            0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed,
+            0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff
+        };
+
         // fixed costs for coding levels, deduce from the coding tree.
         // This is only the part that doesn't depend on the probability state.
         public static readonly short[] Vp8LevelFixedCosts =
@@ -1233,30 +1434,6 @@ static WebpLookupTables()
                 LinearToGammaTab[v] = (int)((255.0d * Math.Pow(scale * v, 1.0d / WebpConstants.Gamma)) + .5);
             }
 
-            Abs0 = new Dictionary<int, byte>();
-            for (int i = -255; i <= 255; i++)
-            {
-                Abs0[i] = (byte)((i < 0) ? -i : i);
-            }
-
-            Clip1 = new Dictionary<int, byte>();
-            for (int i = -255; i <= 255 + 255; i++)
-            {
-                Clip1[i] = (byte)(i < 0 ? 0 : i > 255 ? 255 : i);
-            }
-
-            Sclip1 = new Dictionary<int, sbyte>();
-            for (int i = -1020; i <= 1020; i++)
-            {
-                Sclip1[i] = (sbyte)(i < -128 ? -128 : i > 127 ? 127 : i);
-            }
-
-            Sclip2 = new Dictionary<int, sbyte>();
-            for (int i = -112; i <= 112; i++)
-            {
-                Sclip2[i] = (sbyte)(i < -16 ? -16 : i > 15 ? 15 : i);
-            }
-
             InitializeModesProbabilities();
             InitializeFixedCostsI4();
         }

From 414e4a861db47a81482786abd9ebe6fff3748d58 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 31 Oct 2021 20:00:39 +0100
Subject: [PATCH 13/85] Fix Sclip2 values

---
 .../Formats/Webp/WebpLookupTables.cs          | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/WebpLookupTables.cs b/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
index 768f4a8da3..98cf3029fa 100644
--- a/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
+++ b/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
@@ -193,16 +193,16 @@ internal static class WebpLookupTables
 
         public static readonly sbyte[] Sclip2 =
         {
-            -112, -111, -110, -109, -108, -107, -106, -105, -104, -103, -102, -101, -100, -99, -98, -97, -96, -95,
-            -94, -93, -92, -91, -90, -89, -88, -87, -86, -85, -84, -83, -82, -81, -80, -79, -78, -77, -76, -75, -74,
-            -73, -72, -71, -70, -69, -68, -67, -66, -65, -64, -63, -62, -61, -60, -59, -58, -57, -56, -55, -54, -53,
-            -52, -51, -50, -49, -48, -47, -46, -45, -44, -43, -42, -41, -40, -39, -38, -37, -36, -35, -34, -33, -32,
-            -31, -30, -29, -28, -27, -26, -25, -24, -23, -22, -21, -20, -19, -18, -17, -16, -15, -14, -13, -12, -11,
-            -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
-            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
-            44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
-            70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
-            96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112
+            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
+            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
+            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
+            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
+            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -15, -14, -13, -12, -11, -10, -9, -8,
+            -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15
         };
 
         public static readonly byte[] Clip1 =

From ef90575a119335314ea69c4cbd556469d91f032f Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Mon, 1 Nov 2021 21:42:32 +1100
Subject: [PATCH 14/85] Revert "Use RgbaVector for color backing"

This reverts commit 257ff1929e341e5b1af94d9adf557e5296ece957.
---
 src/ImageSharp/Color/Color.Conversions.cs     | 87 +++----------------
 src/ImageSharp/Color/Color.cs                 | 74 ++++++++--------
 .../Color/ColorTests.CastFrom.cs              | 17 +---
 .../Color/ColorTests.ConstructFrom.cs         |  4 +-
 4 files changed, 57 insertions(+), 125 deletions(-)

diff --git a/src/ImageSharp/Color/Color.Conversions.cs b/src/ImageSharp/Color/Color.Conversions.cs
index abcb54b807..0455fd26a4 100644
--- a/src/ImageSharp/Color/Color.Conversions.cs
+++ b/src/ImageSharp/Color/Color.Conversions.cs
@@ -17,90 +17,56 @@ public readonly partial struct Color
         /// </summary>
         /// <param name="pixel">The <see cref="Rgba64"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgba64 pixel)
-        {
-            RgbaVector vector = default;
-            vector.FromRgba64(pixel);
-            this.data = vector;
-        }
+        public Color(Rgba64 pixel) => this.data = pixel;
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Rgba32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgba32 pixel)
-        {
-            RgbaVector vector = default;
-            vector.FromRgba32(pixel);
-            this.data = vector;
-        }
+        public Color(Rgba32 pixel) => this.data = new Rgba64(pixel);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Argb32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Argb32 pixel)
-        {
-            RgbaVector vector = default;
-            vector.FromArgb32(pixel);
-            this.data = vector;
-        }
+        public Color(Argb32 pixel) => this.data = new Rgba64(pixel);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Bgra32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Bgra32 pixel)
-        {
-            RgbaVector vector = default;
-            vector.FromBgra32(pixel);
-            this.data = vector;
-        }
+        public Color(Bgra32 pixel) => this.data = new Rgba64(pixel);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Rgb24"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgb24 pixel)
-        {
-            RgbaVector vector = default;
-            vector.FromRgb24(pixel);
-            this.data = vector;
-        }
+        public Color(Rgb24 pixel) => this.data = new Rgba64(pixel);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Bgr24"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Bgr24 pixel)
-        {
-            RgbaVector vector = default;
-            vector.FromBgr24(pixel);
-            this.data = vector;
-        }
+        public Color(Bgr24 pixel) => this.data = new Rgba64(pixel);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="vector">The <see cref="Vector4"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Vector4 vector)
-        {
-            vector = Numerics.Clamp(vector, Vector4.Zero, Vector4.One);
-            this.data = new RgbaVector(vector.X, vector.Y, vector.Z, vector.W);
-        }
+        public Color(Vector4 vector) => this.data = new Rgba64(vector);
 
         /// <summary>
         /// Converts a <see cref="Color"/> to <see cref="Vector4"/>.
         /// </summary>
         /// <param name="color">The <see cref="Color"/>.</param>
         /// <returns>The <see cref="Vector4"/>.</returns>
-        public static explicit operator Vector4(Color color) => color.data.ToScaledVector4();
+        public static explicit operator Vector4(Color color) => color.data.ToVector4();
 
         /// <summary>
         /// Converts an <see cref="Vector4"/> to <see cref="Color"/>.
@@ -108,47 +74,22 @@ public Color(Vector4 vector)
         /// <param name="source">The <see cref="Vector4"/>.</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static explicit operator Color(Vector4 source) => new(source);
+        public static explicit operator Color(Vector4 source) => new Color(source);
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Rgba32 ToRgba32()
-        {
-            Rgba32 result = default;
-            result.FromScaledVector4(this.data.ToScaledVector4());
-            return result;
-        }
+        internal Rgba32 ToRgba32() => this.data.ToRgba32();
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Bgra32 ToBgra32()
-        {
-            Bgra32 result = default;
-            result.FromScaledVector4(this.data.ToScaledVector4());
-            return result;
-        }
+        internal Bgra32 ToBgra32() => this.data.ToBgra32();
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Argb32 ToArgb32()
-        {
-            Argb32 result = default;
-            result.FromScaledVector4(this.data.ToScaledVector4());
-            return result;
-        }
+        internal Argb32 ToArgb32() => this.data.ToArgb32();
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Rgb24 ToRgb24()
-        {
-            Rgb24 result = default;
-            result.FromScaledVector4(this.data.ToScaledVector4());
-            return result;
-        }
+        internal Rgb24 ToRgb24() => this.data.ToRgb24();
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Bgr24 ToBgr24()
-        {
-            Bgr24 result = default;
-            result.FromScaledVector4(this.data.ToScaledVector4());
-            return result;
-        }
+        internal Bgr24 ToBgr24() => this.data.ToBgr24();
 
         [MethodImpl(InliningOptions.ShortMethod)]
         internal Vector4 ToVector4() => this.data.ToVector4();
diff --git a/src/ImageSharp/Color/Color.cs b/src/ImageSharp/Color/Color.cs
index 9a4df4e629..d5eedc160b 100644
--- a/src/ImageSharp/Color/Color.cs
+++ b/src/ImageSharp/Color/Color.cs
@@ -20,22 +20,26 @@ namespace SixLabors.ImageSharp
     /// </remarks>
     public readonly partial struct Color : IEquatable<Color>
     {
-        private readonly RgbaVector data;
+        private readonly Rgba64 data;
 
         [MethodImpl(InliningOptions.ShortMethod)]
         private Color(byte r, byte g, byte b, byte a)
         {
-            RgbaVector vector = default;
-            vector.FromRgba32(new(r, g, b, a));
-            this.data = vector;
+            this.data = new Rgba64(
+                ColorNumerics.UpscaleFrom8BitTo16Bit(r),
+                ColorNumerics.UpscaleFrom8BitTo16Bit(g),
+                ColorNumerics.UpscaleFrom8BitTo16Bit(b),
+                ColorNumerics.UpscaleFrom8BitTo16Bit(a));
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
         private Color(byte r, byte g, byte b)
         {
-            RgbaVector vector = default;
-            vector.FromRgba32(new(r, g, b));
-            this.data = vector;
+            this.data = new Rgba64(
+                ColorNumerics.UpscaleFrom8BitTo16Bit(r),
+                ColorNumerics.UpscaleFrom8BitTo16Bit(g),
+                ColorNumerics.UpscaleFrom8BitTo16Bit(b),
+                ushort.MaxValue);
         }
 
         /// <summary>
@@ -48,7 +52,10 @@ private Color(byte r, byte g, byte b)
         /// otherwise, false.
         /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static bool operator ==(Color left, Color right) => left.Equals(right);
+        public static bool operator ==(Color left, Color right)
+        {
+            return left.Equals(right);
+        }
 
         /// <summary>
         /// Checks whether two <see cref="Color"/> structures are equal.
@@ -60,7 +67,10 @@ private Color(byte r, byte g, byte b)
         /// otherwise, false.
         /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static bool operator !=(Color left, Color right) => !left.Equals(right);
+        public static bool operator !=(Color left, Color right)
+        {
+            return !left.Equals(right);
+        }
 
         /// <summary>
         /// Creates a <see cref="Color"/> from RGBA bytes.
@@ -71,7 +81,7 @@ private Color(byte r, byte g, byte b)
         /// <param name="a">The alpha component (0-255).</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromRgba(byte r, byte g, byte b, byte a) => new(r, g, b, a);
+        public static Color FromRgba(byte r, byte g, byte b, byte a) => new Color(r, g, b, a);
 
         /// <summary>
         /// Creates a <see cref="Color"/> from RGB bytes.
@@ -81,17 +91,7 @@ private Color(byte r, byte g, byte b)
         /// <param name="b">The blue component (0-255).</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromRgb(byte r, byte g, byte b) => new(r, g, b);
-
-        /// <summary>
-        /// Creates a <see cref="Color"/> from the given <typeparamref name="TPixel"/>.
-        /// </summary>
-        /// <param name="pixel">The pixel to convert from.</param>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <returns>The <see cref="Color"/>.</returns>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromPixel<TPixel>(TPixel pixel)
-            where TPixel : unmanaged, IPixel<TPixel> => new(pixel.ToScaledVector4());
+        public static Color FromRgb(byte r, byte g, byte b) => new Color(r, g, b);
 
         /// <summary>
         /// Creates a new instance of the <see cref="Color"/> struct
@@ -207,18 +207,13 @@ public Color WithAlpha(float alpha)
         /// </summary>
         /// <returns>A hexadecimal string representation of the value.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public string ToHex()
-        {
-            Rgba32 rgba = default;
-            this.data.ToRgba32(ref rgba);
-            return rgba.ToHex();
-        }
+        public string ToHex() => this.data.ToRgba32().ToHex();
 
         /// <inheritdoc />
         public override string ToString() => this.ToHex();
 
         /// <summary>
-        /// Converts the color instance to a specified <typeparamref name="TPixel"/> type.
+        /// Converts the color instance to a specified <see cref="IPixel{TSelf}"/> type.
         /// </summary>
         /// <typeparam name="TPixel">The pixel type to convert to.</typeparam>
         /// <returns>The pixel value.</returns>
@@ -227,12 +222,12 @@ public TPixel ToPixel<TPixel>()
             where TPixel : unmanaged, IPixel<TPixel>
         {
             TPixel pixel = default;
-            pixel.FromScaledVector4(this.data.ToScaledVector4());
+            pixel.FromRgba64(this.data);
             return pixel;
         }
 
         /// <summary>
-        /// Bulk converts a span of <see cref="Color"/> to a span of a specified <typeparamref name="TPixel"/> type.
+        /// Bulk converts a span of <see cref="Color"/> to a span of a specified <see cref="IPixel{TSelf}"/> type.
         /// </summary>
         /// <typeparam name="TPixel">The pixel type to convert to.</typeparam>
         /// <param name="configuration">The configuration.</param>
@@ -245,19 +240,28 @@ public static void ToPixel<TPixel>(
             Span<TPixel> destination)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            ReadOnlySpan<RgbaVector> rgbaSpan = MemoryMarshal.Cast<Color, RgbaVector>(source);
-            PixelOperations<TPixel>.Instance.From(configuration, rgbaSpan, destination);
+            ReadOnlySpan<Rgba64> rgba64Span = MemoryMarshal.Cast<Color, Rgba64>(source);
+            PixelOperations<TPixel>.Instance.FromRgba64(configuration, rgba64Span, destination);
         }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public bool Equals(Color other) => this.data.Equals(other.data);
+        public bool Equals(Color other)
+        {
+            return this.data.PackedValue == other.data.PackedValue;
+        }
 
         /// <inheritdoc />
-        public override bool Equals(object obj) => obj is Color other && this.Equals(other);
+        public override bool Equals(object obj)
+        {
+            return obj is Color other && this.Equals(other);
+        }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public override int GetHashCode() => this.data.GetHashCode();
+        public override int GetHashCode()
+        {
+            return this.data.PackedValue.GetHashCode();
+        }
     }
 }
diff --git a/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs b/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs
index 356ef7351e..38b94f486c 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs
@@ -66,7 +66,7 @@ public void Bgra32()
             [Fact]
             public void Rgb24()
             {
-                var source = new Rgb24(1, 22, 231);
+                var source = new Rgb24(1, 22,  231);
 
                 // Act:
                 Color color = source;
@@ -79,7 +79,7 @@ public void Rgb24()
             [Fact]
             public void Bgr24()
             {
-                var source = new Bgr24(1, 22, 231);
+                var source = new Bgr24(1, 22,  231);
 
                 // Act:
                 Color color = source;
@@ -88,19 +88,6 @@ public void Bgr24()
                 Bgr24 data = color.ToPixel<Bgr24>();
                 Assert.Equal(source, data);
             }
-
-            [Fact]
-            public void TPixel()
-            {
-                var source = new RgbaVector(1, .1F, .133F, .864F);
-
-                // Act:
-                var color = Color.FromPixel(source);
-
-                // Assert:
-                RgbaVector data = color.ToPixel<RgbaVector>();
-                Assert.Equal(source, data);
-            }
         }
     }
 }
diff --git a/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs b/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs
index dd51f3a6c2..89276014b0 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs
@@ -66,7 +66,7 @@ public void Bgra32()
             [Fact]
             public void Rgb24()
             {
-                var source = new Rgb24(1, 22, 231);
+                var source = new Rgb24(1, 22,  231);
 
                 // Act:
                 var color = new Color(source);
@@ -79,7 +79,7 @@ public void Rgb24()
             [Fact]
             public void Bgr24()
             {
-                var source = new Bgr24(1, 22, 231);
+                var source = new Bgr24(1, 22,  231);
 
                 // Act:
                 var color = new Color(source);

From 2ec17e7c6a31b31fafb75cfd85613681fa4125d6 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Mon, 1 Nov 2021 22:39:20 +1100
Subject: [PATCH 15/85] Use box pixel for high precision

---
 src/ImageSharp/Color/Color.Conversions.cs     | 117 +++++++++++++++---
 src/ImageSharp/Color/Color.cs                 |  77 ++++++++----
 .../Color/ColorTests.CastTo.cs                |  17 ++-
 3 files changed, 171 insertions(+), 40 deletions(-)

diff --git a/src/ImageSharp/Color/Color.Conversions.cs b/src/ImageSharp/Color/Color.Conversions.cs
index 0455fd26a4..424b7dcdfe 100644
--- a/src/ImageSharp/Color/Color.Conversions.cs
+++ b/src/ImageSharp/Color/Color.Conversions.cs
@@ -17,56 +17,85 @@ public readonly partial struct Color
         /// </summary>
         /// <param name="pixel">The <see cref="Rgba64"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgba64 pixel) => this.data = pixel;
+        public Color(Rgba64 pixel)
+        {
+            this.data = pixel;
+            this.boxedHighPrecisionPixel = null;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Rgba32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgba32 pixel) => this.data = new Rgba64(pixel);
+        public Color(Rgba32 pixel)
+        {
+            this.data = new Rgba64(pixel);
+            this.boxedHighPrecisionPixel = null;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Argb32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Argb32 pixel) => this.data = new Rgba64(pixel);
+        public Color(Argb32 pixel)
+        {
+            this.data = new Rgba64(pixel);
+            this.boxedHighPrecisionPixel = null;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Bgra32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Bgra32 pixel) => this.data = new Rgba64(pixel);
+        public Color(Bgra32 pixel)
+        {
+            this.data = new Rgba64(pixel);
+            this.boxedHighPrecisionPixel = null;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Rgb24"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgb24 pixel) => this.data = new Rgba64(pixel);
+        public Color(Rgb24 pixel)
+        {
+            this.data = new Rgba64(pixel);
+            this.boxedHighPrecisionPixel = null;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Bgr24"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Bgr24 pixel) => this.data = new Rgba64(pixel);
+        public Color(Bgr24 pixel)
+        {
+            this.data = new Rgba64(pixel);
+            this.boxedHighPrecisionPixel = null;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="vector">The <see cref="Vector4"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Vector4 vector) => this.data = new Rgba64(vector);
+        public Color(Vector4 vector)
+        {
+            vector = Numerics.Clamp(vector, Vector4.Zero, Vector4.One);
+            this.boxedHighPrecisionPixel = new RgbaVector(vector.X, vector.Y, vector.Z, vector.W);
+            this.data = default;
+        }
 
         /// <summary>
         /// Converts a <see cref="Color"/> to <see cref="Vector4"/>.
         /// </summary>
         /// <param name="color">The <see cref="Color"/>.</param>
         /// <returns>The <see cref="Vector4"/>.</returns>
-        public static explicit operator Vector4(Color color) => color.data.ToVector4();
+        public static explicit operator Vector4(Color color) => color.ToVector4();
 
         /// <summary>
         /// Converts an <see cref="Vector4"/> to <see cref="Color"/>.
@@ -74,24 +103,82 @@ public readonly partial struct Color
         /// <param name="source">The <see cref="Vector4"/>.</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static explicit operator Color(Vector4 source) => new Color(source);
+        public static explicit operator Color(Vector4 source) => new(source);
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Rgba32 ToRgba32() => this.data.ToRgba32();
+        internal Rgba32 ToRgba32()
+        {
+            if (this.boxedHighPrecisionPixel is null)
+            {
+                return this.data.ToRgba32();
+            }
+
+            Rgba32 value = default;
+            this.boxedHighPrecisionPixel.ToRgba32(ref value);
+            return value;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Bgra32 ToBgra32() => this.data.ToBgra32();
+        internal Bgra32 ToBgra32()
+        {
+            if (this.boxedHighPrecisionPixel is null)
+            {
+                return this.data.ToBgra32();
+            }
+
+            Bgra32 value = default;
+            value.FromScaledVector4(this.boxedHighPrecisionPixel.ToScaledVector4());
+            return value;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Argb32 ToArgb32() => this.data.ToArgb32();
+        internal Argb32 ToArgb32()
+        {
+            if (this.boxedHighPrecisionPixel is null)
+            {
+                return this.data.ToArgb32();
+            }
+
+            Argb32 value = default;
+            value.FromScaledVector4(this.boxedHighPrecisionPixel.ToScaledVector4());
+            return value;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Rgb24 ToRgb24() => this.data.ToRgb24();
+        internal Rgb24 ToRgb24()
+        {
+            if (this.boxedHighPrecisionPixel is null)
+            {
+                return this.data.ToRgb24();
+            }
+
+            Rgb24 value = default;
+            value.FromScaledVector4(this.boxedHighPrecisionPixel.ToScaledVector4());
+            return value;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Bgr24 ToBgr24() => this.data.ToBgr24();
+        internal Bgr24 ToBgr24()
+        {
+            if (this.boxedHighPrecisionPixel is null)
+            {
+                return this.data.ToBgr24();
+            }
+
+            Bgr24 value = default;
+            value.FromScaledVector4(this.boxedHighPrecisionPixel.ToScaledVector4());
+            return value;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Vector4 ToVector4() => this.data.ToVector4();
+        internal Vector4 ToVector4()
+        {
+            if (this.boxedHighPrecisionPixel is null)
+            {
+                return this.data.ToScaledVector4();
+            }
+
+            return this.boxedHighPrecisionPixel.ToScaledVector4();
+        }
     }
 }
diff --git a/src/ImageSharp/Color/Color.cs b/src/ImageSharp/Color/Color.cs
index d5eedc160b..fe66efcfb5 100644
--- a/src/ImageSharp/Color/Color.cs
+++ b/src/ImageSharp/Color/Color.cs
@@ -4,7 +4,6 @@
 using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.PixelFormats;
 
 namespace SixLabors.ImageSharp
@@ -21,6 +20,7 @@ namespace SixLabors.ImageSharp
     public readonly partial struct Color : IEquatable<Color>
     {
         private readonly Rgba64 data;
+        private readonly IPixel boxedHighPrecisionPixel;
 
         [MethodImpl(InliningOptions.ShortMethod)]
         private Color(byte r, byte g, byte b, byte a)
@@ -30,6 +30,8 @@ private Color(byte r, byte g, byte b, byte a)
                 ColorNumerics.UpscaleFrom8BitTo16Bit(g),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(b),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(a));
+
+            this.boxedHighPrecisionPixel = null;
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -40,6 +42,15 @@ private Color(byte r, byte g, byte b)
                 ColorNumerics.UpscaleFrom8BitTo16Bit(g),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(b),
                 ushort.MaxValue);
+
+            this.boxedHighPrecisionPixel = null;
+        }
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private Color(IPixel pixel)
+        {
+            this.boxedHighPrecisionPixel = pixel;
+            this.data = default;
         }
 
         /// <summary>
@@ -52,13 +63,10 @@ private Color(byte r, byte g, byte b)
         /// otherwise, false.
         /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static bool operator ==(Color left, Color right)
-        {
-            return left.Equals(right);
-        }
+        public static bool operator ==(Color left, Color right) => left.Equals(right);
 
         /// <summary>
-        /// Checks whether two <see cref="Color"/> structures are equal.
+        /// Checks whether two <see cref="Color"/> structures are not equal.
         /// </summary>
         /// <param name="left">The left hand <see cref="Color"/> operand.</param>
         /// <param name="right">The right hand <see cref="Color"/> operand.</param>
@@ -67,10 +75,7 @@ private Color(byte r, byte g, byte b)
         /// otherwise, false.
         /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static bool operator !=(Color left, Color right)
-        {
-            return !left.Equals(right);
-        }
+        public static bool operator !=(Color left, Color right) => !left.Equals(right);
 
         /// <summary>
         /// Creates a <see cref="Color"/> from RGBA bytes.
@@ -81,7 +86,7 @@ private Color(byte r, byte g, byte b)
         /// <param name="a">The alpha component (0-255).</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromRgba(byte r, byte g, byte b, byte a) => new Color(r, g, b, a);
+        public static Color FromRgba(byte r, byte g, byte b, byte a) => new(r, g, b, a);
 
         /// <summary>
         /// Creates a <see cref="Color"/> from RGB bytes.
@@ -91,7 +96,18 @@ private Color(byte r, byte g, byte b)
         /// <param name="b">The blue component (0-255).</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromRgb(byte r, byte g, byte b) => new Color(r, g, b);
+        public static Color FromRgb(byte r, byte g, byte b) => new(r, g, b);
+
+        /// <summary>
+        /// Creates a <see cref="Color"/> from the given <typeparamref name="TPixel"/>.
+        /// </summary>
+        /// <param name="pixel">The pixel to convert from.</param>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <returns>The <see cref="Color"/>.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static Color FromPixel<TPixel>(TPixel pixel)
+            where TPixel : unmanaged, IPixel<TPixel>
+            => new(pixel);
 
         /// <summary>
         /// Creates a new instance of the <see cref="Color"/> struct
@@ -213,7 +229,7 @@ public Color WithAlpha(float alpha)
         public override string ToString() => this.ToHex();
 
         /// <summary>
-        /// Converts the color instance to a specified <see cref="IPixel{TSelf}"/> type.
+        /// Converts the color instance to a specified <typeparamref name="TPixel"/> type.
         /// </summary>
         /// <typeparam name="TPixel">The pixel type to convert to.</typeparam>
         /// <returns>The pixel value.</returns>
@@ -221,13 +237,18 @@ public Color WithAlpha(float alpha)
         public TPixel ToPixel<TPixel>()
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            TPixel pixel = default;
+            if (this.boxedHighPrecisionPixel is TPixel pixel)
+            {
+                return pixel;
+            }
+
+            pixel = default;
             pixel.FromRgba64(this.data);
             return pixel;
         }
 
         /// <summary>
-        /// Bulk converts a span of <see cref="Color"/> to a span of a specified <see cref="IPixel{TSelf}"/> type.
+        /// Bulk converts a span of <see cref="Color"/> to a span of a specified <typeparamref name="TPixel"/> type.
         /// </summary>
         /// <typeparam name="TPixel">The pixel type to convert to.</typeparam>
         /// <param name="configuration">The configuration.</param>
@@ -240,28 +261,38 @@ public static void ToPixel<TPixel>(
             Span<TPixel> destination)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            ReadOnlySpan<Rgba64> rgba64Span = MemoryMarshal.Cast<Color, Rgba64>(source);
-            PixelOperations<TPixel>.Instance.FromRgba64(configuration, rgba64Span, destination);
+            Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
+            for (int i = 0; i < source.Length; i++)
+            {
+                destination[i] = source[i].ToPixel<TPixel>();
+            }
         }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
         public bool Equals(Color other)
         {
-            return this.data.PackedValue == other.data.PackedValue;
+            if (this.boxedHighPrecisionPixel is null && other.boxedHighPrecisionPixel is null)
+            {
+                return this.data.PackedValue == other.data.PackedValue;
+            }
+
+            return this.ToVector4().Equals(other.ToVector4());
         }
 
         /// <inheritdoc />
-        public override bool Equals(object obj)
-        {
-            return obj is Color other && this.Equals(other);
-        }
+        public override bool Equals(object obj) => obj is Color other && this.Equals(other);
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
         public override int GetHashCode()
         {
-            return this.data.PackedValue.GetHashCode();
+            if (this.boxedHighPrecisionPixel is null)
+            {
+                return this.data.PackedValue.GetHashCode();
+            }
+
+            return this.boxedHighPrecisionPixel.GetHashCode();
         }
     }
 }
diff --git a/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs b/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
index ee1820de77..d3f3cf126e 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
@@ -66,7 +66,7 @@ public void Bgra32()
             [Fact]
             public void Rgb24()
             {
-                var source = new Rgb24(1, 22,  231);
+                var source = new Rgb24(1, 22, 231);
 
                 // Act:
                 var color = new Color(source);
@@ -79,7 +79,7 @@ public void Rgb24()
             [Fact]
             public void Bgr24()
             {
-                var source = new Bgr24(1, 22,  231);
+                var source = new Bgr24(1, 22, 231);
 
                 // Act:
                 var color = new Color(source);
@@ -88,6 +88,19 @@ public void Bgr24()
                 Bgr24 data = color;
                 Assert.Equal(source, data);
             }
+
+            [Fact]
+            public void TPixel()
+            {
+                var source = new RgbaVector(1, .1F, .133F, .864F);
+
+                // Act:
+                var color = Color.FromPixel(source);
+
+                // Assert:
+                RgbaVector data = color.ToPixel<RgbaVector>();
+                Assert.Equal(source, data);
+            }
         }
     }
 }

From 67fd2d0427290e6a76eec0e49fb133986efbf3b6 Mon Sep 17 00:00:00 2001
From: Brian Popow <38701097+brianpopow@users.noreply.github.com>
Date: Mon, 1 Nov 2021 13:07:39 +0100
Subject: [PATCH 16/85] Use ReadOnlySpan
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Günther Foidl <gue@korporal.at>
---
 src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs | 3 ++-
 src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs      | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
index abb7274472..c6dc6b8b23 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
@@ -17,7 +17,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
     /// </summary>
     internal static unsafe class PredictorEncoder
     {
-        private static readonly sbyte[] DeltaLut = { 16, 16, 8, 4, 2, 2, 2 };
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        private static ReadOnlySpan<sbyte> DeltaLut => new sbyte[] { 16, 16, 8, 4, 2, 2, 2 };
 
         private static readonly sbyte[][] Offset =
         {
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
index c46e7193f2..1a9036ec95 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
@@ -85,7 +85,8 @@ internal class Vp8LEncoder : IDisposable
 
         private const int PaletteInvSize = 1 << PaletteInvSizeBits;
 
-        private static readonly byte[] Order = { 1, 2, 0, 3 };
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        private static ReadOnlySpan<byte> Order => new byte[] { 1, 2, 0, 3 };
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Vp8LEncoder"/> class.

From 86f4903c827635170e43cae57730bea4b951d6c7 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Mon, 1 Nov 2021 13:35:39 +0100
Subject: [PATCH 17/85] Fix build errors

---
 src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs | 6 +++---
 src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs      | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
index c6dc6b8b23..89c930561c 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
@@ -17,9 +17,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
     /// </summary>
     internal static unsafe class PredictorEncoder
     {
-        // This uses C#'s compiler optimization to refer to assembly's static data directly.
-        private static ReadOnlySpan<sbyte> DeltaLut => new sbyte[] { 16, 16, 8, 4, 2, 2, 2 };
-
         private static readonly sbyte[][] Offset =
         {
             new sbyte[] { 0, -1 }, new sbyte[] { 0, 1 }, new sbyte[] { -1, 0 }, new sbyte[] { 1, 0 }, new sbyte[] { -1, -1 }, new sbyte[] { -1, 1 }, new sbyte[] { 1, -1 }, new sbyte[] { 1, 1 }
@@ -53,6 +50,9 @@ internal static unsafe class PredictorEncoder
         private static readonly Vector128<byte> CollectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
 #endif
 
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        private static ReadOnlySpan<sbyte> DeltaLut => new sbyte[] { 16, 16, 8, 4, 2, 2, 2 };
+
         /// <summary>
         /// Finds the best predictor for each tile, and converts the image to residuals
         /// with respect to predictions. If nearLosslessQuality &lt; 100, applies
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
index 1a9036ec95..6a0a3184ed 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
@@ -85,9 +85,6 @@ internal class Vp8LEncoder : IDisposable
 
         private const int PaletteInvSize = 1 << PaletteInvSizeBits;
 
-        // This uses C#'s compiler optimization to refer to assembly's static data directly.
-        private static ReadOnlySpan<byte> Order => new byte[] { 1, 2, 0, 3 };
-
         /// <summary>
         /// Initializes a new instance of the <see cref="Vp8LEncoder"/> class.
         /// </summary>
@@ -140,6 +137,9 @@ public Vp8LEncoder(
             }
         }
 
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        private static ReadOnlySpan<byte> Order => new byte[] { 1, 2, 0, 3 };
+
         /// <summary>
         /// Gets the memory for the image data as packed bgra values.
         /// </summary>

From 94df8fc1ad8833c912e19f642df78d49cca091b8 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Mon, 1 Nov 2021 14:33:46 +0100
Subject: [PATCH 18/85] Small bitreader improvements:

- Make bitmask static readonly
- Add aggressive inlining
- Change Guard to DebugGuard in ReadValue
---
 .../Formats/Webp/BitReader/Vp8LBitReader.cs          | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/BitReader/Vp8LBitReader.cs b/src/ImageSharp/Formats/Webp/BitReader/Vp8LBitReader.cs
index 601336fa4b..07423e3127 100644
--- a/src/ImageSharp/Formats/Webp/BitReader/Vp8LBitReader.cs
+++ b/src/ImageSharp/Formats/Webp/BitReader/Vp8LBitReader.cs
@@ -28,7 +28,7 @@ internal class Vp8LBitReader : BitReaderBase
         /// </summary>
         private const int Wbits = 32;
 
-        private readonly uint[] bitMask =
+        private static readonly uint[] BitMask =
         {
             0,
             0x000001, 0x000003, 0x000007, 0x00000f,
@@ -125,13 +125,14 @@ public Vp8LBitReader(Stream inputStream, uint imageDataSize, MemoryAllocator mem
         /// </summary>
         /// <param name="nBits">The number of bits to read (should not exceed 16).</param>
         /// <returns>A ushort value.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public uint ReadValue(int nBits)
         {
-            Guard.MustBeGreaterThan(nBits, 0, nameof(nBits));
+            DebugGuard.MustBeGreaterThan(nBits, 0, nameof(nBits));
 
             if (!this.Eos && nBits <= Vp8LMaxNumBitRead)
             {
-                ulong val = this.PrefetchBits() & this.bitMask[nBits];
+                ulong val = this.PrefetchBits() & BitMask[nBits];
                 this.bitPos += nBits;
                 this.ShiftBytes();
                 return (uint)val;
@@ -169,6 +170,7 @@ public bool ReadBit()
         /// <summary>
         /// Advances the read buffer by 4 bytes to make room for reading next 32 bits.
         /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public void FillBitWindow()
         {
             if (this.bitPos >= Wbits)
@@ -181,7 +183,8 @@ public void FillBitWindow()
         /// Returns true if there was an attempt at reading bit past the end of the buffer.
         /// </summary>
         /// <returns>True, if end of buffer was reached.</returns>
-        public bool IsEndOfStream() => this.Eos || ((this.pos == this.len) && (this.bitPos > Lbits));
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public bool IsEndOfStream() => this.Eos || (this.pos == this.len && this.bitPos > Lbits);
 
         [MethodImpl(InliningOptions.ShortMethod)]
         private void DoFillBitWindow() => this.ShiftBytes();
@@ -189,6 +192,7 @@ public void FillBitWindow()
         /// <summary>
         /// If not at EOS, reload up to Vp8LLbits byte-by-byte.
         /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
         private void ShiftBytes()
         {
             System.Span<byte> dataSpan = this.Data.Memory.Span;

From 7d4fd642de5f08a87318fc19058dcbd9547e488a Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Mon, 1 Nov 2021 17:20:35 +0100
Subject: [PATCH 19/85] Use helper methods to access clip tables

---
 .../Formats/Webp/Lossy/LossyUtils.cs          |  48 +-
 .../Formats/Webp/WebpLookupTables.cs          | 425 +++++++++---------
 2 files changed, 243 insertions(+), 230 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 1a6ace16fa..04ff80b2d9 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -934,11 +934,11 @@ private static void DoFilter2(Span<byte> p, int offset, int step)
             int p0 = p[offset - step];
             int q0 = p[offset];
             int q1 = p[offset + step];
-            int a = (3 * (q0 - p0)) + WebpLookupTables.Sclip1[p1 - q1 + 1020];
-            int a1 = WebpLookupTables.Sclip2[((a + 4) >> 3) + 112];
-            int a2 = WebpLookupTables.Sclip2[((a + 3) >> 3) + 112];
-            p[offset - step] = WebpLookupTables.Clip1[p0 + a2 + 255];
-            p[offset] = WebpLookupTables.Clip1[q0 - a1 + 255];
+            int a = (3 * (q0 - p0)) + WebpLookupTables.Sclip1(p1 - q1);
+            int a1 = WebpLookupTables.Sclip2((a + 4) >> 3);
+            int a2 = WebpLookupTables.Sclip2((a + 3) >> 3);
+            p[offset - step] = WebpLookupTables.Clip1(p0 + a2);
+            p[offset] = WebpLookupTables.Clip1(q0 - a1);
         }
 
         private static void DoFilter4(Span<byte> p, int offset, int step)
@@ -950,13 +950,13 @@ private static void DoFilter4(Span<byte> p, int offset, int step)
             int q0 = p[offset];
             int q1 = p[offset + step];
             int a = 3 * (q0 - p0);
-            int a1 = WebpLookupTables.Sclip2[((a + 4) >> 3) + 112];
-            int a2 = WebpLookupTables.Sclip2[((a + 3) >> 3) + 112];
+            int a1 = WebpLookupTables.Sclip2((a + 4) >> 3);
+            int a2 = WebpLookupTables.Sclip2((a + 3) >> 3);
             int a3 = (a1 + 1) >> 1;
-            p[offsetMinus2Step] = WebpLookupTables.Clip1[p1 + a3 + 255];
-            p[offset - step] = WebpLookupTables.Clip1[p0 + a2 + 255];
-            p[offset] = WebpLookupTables.Clip1[q0 - a1 + 255];
-            p[offset + step] = WebpLookupTables.Clip1[q1 - a3 + 255];
+            p[offsetMinus2Step] = WebpLookupTables.Clip1(p1 + a3);
+            p[offset - step] = WebpLookupTables.Clip1(p0 + a2);
+            p[offset] = WebpLookupTables.Clip1(q0 - a1);
+            p[offset + step] = WebpLookupTables.Clip1(q1 - a3);
         }
 
         private static void DoFilter6(Span<byte> p, int offset, int step)
@@ -971,18 +971,18 @@ private static void DoFilter6(Span<byte> p, int offset, int step)
             int q0 = p[offset];
             int q1 = p[offset + step];
             int q2 = p[offset + step2];
-            int a = WebpLookupTables.Sclip1[(3 * (q0 - p0)) + WebpLookupTables.Sclip1[p1 - q1 + 1020] + 1020];
+            int a = WebpLookupTables.Sclip1((3 * (q0 - p0)) + WebpLookupTables.Sclip1(p1 - q1));
 
             // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
             int a1 = ((27 * a) + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
             int a2 = ((18 * a) + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
             int a3 = ((9 * a) + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
-            p[offset - step3] = WebpLookupTables.Clip1[p2 + a3 + 255];
-            p[offset - step2] = WebpLookupTables.Clip1[p1 + a2 + 255];
-            p[offsetMinusStep] = WebpLookupTables.Clip1[p0 + a1 + 255];
-            p[offset] = WebpLookupTables.Clip1[q0 - a1 + 255];
-            p[offset + step] = WebpLookupTables.Clip1[q1 - a2 + 255];
-            p[offset + step2] = WebpLookupTables.Clip1[q2 - a3 + 255];
+            p[offset - step3] = WebpLookupTables.Clip1(p2 + a3);
+            p[offset - step2] = WebpLookupTables.Clip1(p1 + a2);
+            p[offsetMinusStep] = WebpLookupTables.Clip1(p0 + a1);
+            p[offset] = WebpLookupTables.Clip1(q0 - a1);
+            p[offset + step] = WebpLookupTables.Clip1(q1 - a2);
+            p[offset + step2] = WebpLookupTables.Clip1(q2 - a3);
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -992,7 +992,7 @@ private static bool NeedsFilter(Span<byte> p, int offset, int step, int t)
             int p0 = p[offset - step];
             int q0 = p[offset];
             int q1 = p[offset + step];
-            return (4 * WebpLookupTables.Abs0[p0 - q0 + 255]) + WebpLookupTables.Abs0[p1 - q1 + 255] <= t;
+            return (4 * WebpLookupTables.Abs0(p0 - q0)) + WebpLookupTables.Abs0(p1 - q1) <= t;
         }
 
         private static bool NeedsFilter2(Span<byte> p, int offset, int step, int t, int it)
@@ -1007,14 +1007,14 @@ private static bool NeedsFilter2(Span<byte> p, int offset, int step, int t, int
             int q1 = p[offset + step];
             int q2 = p[offset + step2];
             int q3 = p[offset + step3];
-            if ((4 * WebpLookupTables.Abs0[p0 - q0 + 255]) + WebpLookupTables.Abs0[p1 - q1 + 255] > t)
+            if ((4 * WebpLookupTables.Abs0(p0 - q0)) + WebpLookupTables.Abs0(p1 - q1) > t)
             {
                 return false;
             }
 
-            return WebpLookupTables.Abs0[p3 - p2 + 255] <= it && WebpLookupTables.Abs0[p2 - p1 + 255] <= it &&
-                   WebpLookupTables.Abs0[p1 - p0 + 255] <= it && WebpLookupTables.Abs0[q3 - q2 + 255] <= it &&
-                   WebpLookupTables.Abs0[q2 - q1 + 255] <= it && WebpLookupTables.Abs0[q1 - q0 + 255] <= it;
+            return WebpLookupTables.Abs0(p3 - p2) <= it && WebpLookupTables.Abs0(p2 - p1) <= it &&
+                   WebpLookupTables.Abs0(p1 - p0) <= it && WebpLookupTables.Abs0(q3 - q2) <= it &&
+                   WebpLookupTables.Abs0(q2 - q1) <= it && WebpLookupTables.Abs0(q1 - q0) <= it;
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -1024,7 +1024,7 @@ private static bool Hev(Span<byte> p, int offset, int step, int thresh)
             int p0 = p[offset - step];
             int q0 = p[offset];
             int q1 = p[offset + step];
-            return WebpLookupTables.Abs0[p1 - p0 + 255] > thresh || WebpLookupTables.Abs0[q1 - q0 + 255] > thresh;
+            return WebpLookupTables.Abs0(p1 - p0) > thresh || WebpLookupTables.Abs0(q1 - q0) > thresh;
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/Formats/Webp/WebpLookupTables.cs b/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
index 98cf3029fa..3b5d677293 100644
--- a/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
+++ b/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
@@ -2,6 +2,7 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
+using System.Runtime.CompilerServices;
 
 namespace SixLabors.ImageSharp.Formats.Webp
 {
@@ -45,215 +46,17 @@ internal static class WebpLookupTables
             8 + (0 * WebpConstants.Bps), 12 + (0 * WebpConstants.Bps), 8 + (4 * WebpConstants.Bps), 12 + (4 * WebpConstants.Bps) // V
         };
 
-        public static readonly byte[] Abs0 =
-        {
-            0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0, 0xef,
-            0xee, 0xed, 0xec, 0xeb, 0xea, 0xe9, 0xe8, 0xe7, 0xe6, 0xe5, 0xe4, 0xe3, 0xe2, 0xe1, 0xe0, 0xdf, 0xde,
-            0xdd, 0xdc, 0xdb, 0xda, 0xd9, 0xd8, 0xd7, 0xd6, 0xd5, 0xd4, 0xd3, 0xd2, 0xd1, 0xd0, 0xcf, 0xce, 0xcd,
-            0xcc, 0xcb, 0xca, 0xc9, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, 0xc3, 0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc,
-            0xbb, 0xba, 0xb9, 0xb8, 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac, 0xab,
-            0xaa, 0xa9, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1, 0xa0, 0x9f, 0x9e, 0x9d, 0x9c, 0x9b, 0x9a,
-            0x99, 0x98, 0x97, 0x96, 0x95, 0x94, 0x93, 0x92, 0x91, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89,
-            0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80, 0x7f, 0x7e, 0x7d, 0x7c, 0x7b, 0x7a, 0x79, 0x78,
-            0x77, 0x76, 0x75, 0x74, 0x73, 0x72, 0x71, 0x70, 0x6f, 0x6e, 0x6d, 0x6c, 0x6b, 0x6a, 0x69, 0x68, 0x67,
-            0x66, 0x65, 0x64, 0x63, 0x62, 0x61, 0x60, 0x5f, 0x5e, 0x5d, 0x5c, 0x5b, 0x5a, 0x59, 0x58, 0x57, 0x56,
-            0x55, 0x54, 0x53, 0x52, 0x51, 0x50, 0x4f, 0x4e, 0x4d, 0x4c, 0x4b, 0x4a, 0x49, 0x48, 0x47, 0x46, 0x45,
-            0x44, 0x43, 0x42, 0x41, 0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34,
-            0x33, 0x32, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x23,
-            0x22, 0x21, 0x20, 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12,
-            0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
-            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
-            0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21,
-            0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32,
-            0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43,
-            0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54,
-            0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65,
-            0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
-            0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-            0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
-            0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9,
-            0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
-            0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
-            0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc,
-            0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed,
-            0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
-            0xff
-        };
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static byte Abs0(int x) => Abs0Table[x + 255];
 
-        public static readonly sbyte[] Sclip1 =
-        {
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-            -128, -128, -128, -128, -128, -128, -128, -128, -128, -127, -126, -125, -124, -123, -122, -121, -120,
-            -119, -118, -117, -116, -115, -114, -113, -112, -111, -110, -109, -108, -107, -106, -105, -104, -103,
-            -102, -101, -100, -99, -98, -97, -96, -95, -94, -93, -92, -91, -90, -89, -88, -87, -86, -85, -84, -83,
-            -82, -81, -80, -79, -78, -77, -76, -75, -74, -73, -72, -71, -70, -69, -68, -67, -66, -65, -64, -63, -62,
-            -61, -60, -59, -58, -57, -56, -55, -54, -53, -52, -51, -50, -49, -48, -47, -46, -45, -44, -43, -42, -41,
-            -40, -39, -38, -37, -36, -35, -34, -33, -32, -31, -30, -29, -28, -27, -26, -25, -24, -23, -22, -21, -20,
-            -19, -18, -17, -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5,
-            6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
-            33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
-            59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
-            85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
-            109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
-            127, 127, 127, 127, 127, 127, 127, 127, 127
-        };
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static sbyte Sclip1(int x) => Sclip1Table[x + 1020];
 
-        public static readonly sbyte[] Sclip2 =
-        {
-            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
-            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
-            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
-            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
-            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -15, -14, -13, -12, -11, -10, -9, -8,
-            -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15,
-            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
-            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
-            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
-            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15
-        };
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static sbyte Sclip2(int x) => Sclip2Table[x + 112];
 
-        public static readonly byte[] Clip1 =
-        {
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
-            0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21,
-            0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32,
-            0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43,
-            0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54,
-            0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65,
-            0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
-            0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-            0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
-            0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9,
-            0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
-            0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
-            0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc,
-            0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed,
-            0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-            0xff
-        };
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static byte Clip1(int x) => Clip1Table[x + 255];
 
         // fixed costs for coding levels, deduce from the coding tree.
         // This is only the part that doesn't depend on the probability state.
@@ -1438,6 +1241,216 @@ static WebpLookupTables()
             InitializeFixedCostsI4();
         }
 
+        private static readonly byte[] Abs0Table =
+        {
+            0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0, 0xef,
+            0xee, 0xed, 0xec, 0xeb, 0xea, 0xe9, 0xe8, 0xe7, 0xe6, 0xe5, 0xe4, 0xe3, 0xe2, 0xe1, 0xe0, 0xdf, 0xde,
+            0xdd, 0xdc, 0xdb, 0xda, 0xd9, 0xd8, 0xd7, 0xd6, 0xd5, 0xd4, 0xd3, 0xd2, 0xd1, 0xd0, 0xcf, 0xce, 0xcd,
+            0xcc, 0xcb, 0xca, 0xc9, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, 0xc3, 0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc,
+            0xbb, 0xba, 0xb9, 0xb8, 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac, 0xab,
+            0xaa, 0xa9, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1, 0xa0, 0x9f, 0x9e, 0x9d, 0x9c, 0x9b, 0x9a,
+            0x99, 0x98, 0x97, 0x96, 0x95, 0x94, 0x93, 0x92, 0x91, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89,
+            0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80, 0x7f, 0x7e, 0x7d, 0x7c, 0x7b, 0x7a, 0x79, 0x78,
+            0x77, 0x76, 0x75, 0x74, 0x73, 0x72, 0x71, 0x70, 0x6f, 0x6e, 0x6d, 0x6c, 0x6b, 0x6a, 0x69, 0x68, 0x67,
+            0x66, 0x65, 0x64, 0x63, 0x62, 0x61, 0x60, 0x5f, 0x5e, 0x5d, 0x5c, 0x5b, 0x5a, 0x59, 0x58, 0x57, 0x56,
+            0x55, 0x54, 0x53, 0x52, 0x51, 0x50, 0x4f, 0x4e, 0x4d, 0x4c, 0x4b, 0x4a, 0x49, 0x48, 0x47, 0x46, 0x45,
+            0x44, 0x43, 0x42, 0x41, 0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34,
+            0x33, 0x32, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x23,
+            0x22, 0x21, 0x20, 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12,
+            0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
+            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+            0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21,
+            0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32,
+            0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43,
+            0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54,
+            0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65,
+            0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
+            0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+            0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+            0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9,
+            0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
+            0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
+            0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc,
+            0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed,
+            0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
+            0xff
+        };
+
+        private static readonly byte[] Clip1Table =
+        {
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+            0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21,
+            0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32,
+            0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43,
+            0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54,
+            0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65,
+            0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
+            0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+            0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+            0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9,
+            0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
+            0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
+            0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc,
+            0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed,
+            0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff
+        };
+
+        private static readonly sbyte[] Sclip1Table =
+        {
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+            -128, -128, -128, -128, -128, -128, -128, -128, -128, -127, -126, -125, -124, -123, -122, -121, -120,
+            -119, -118, -117, -116, -115, -114, -113, -112, -111, -110, -109, -108, -107, -106, -105, -104, -103,
+            -102, -101, -100, -99, -98, -97, -96, -95, -94, -93, -92, -91, -90, -89, -88, -87, -86, -85, -84, -83,
+            -82, -81, -80, -79, -78, -77, -76, -75, -74, -73, -72, -71, -70, -69, -68, -67, -66, -65, -64, -63, -62,
+            -61, -60, -59, -58, -57, -56, -55, -54, -53, -52, -51, -50, -49, -48, -47, -46, -45, -44, -43, -42, -41,
+            -40, -39, -38, -37, -36, -35, -34, -33, -32, -31, -30, -29, -28, -27, -26, -25, -24, -23, -22, -21, -20,
+            -19, -18, -17, -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5,
+            6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+            33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+            59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
+            85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
+            109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+            127, 127, 127, 127, 127, 127, 127, 127, 127
+        };
+
+        private static readonly sbyte[] Sclip2Table =
+        {
+            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
+            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
+            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
+            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
+            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -15, -14, -13, -12, -11, -10, -9, -8,
+            -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15
+        };
+
         private static void InitializeModesProbabilities()
         {
             // Paragraph 11.5

From 853b1173697c0f56084eea21fd7d04f40764fa96 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Mon, 1 Nov 2021 19:46:24 +0100
Subject: [PATCH 20/85] Make histo and best histo array readonly

---
 src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
index 6a0a3184ed..da815a479a 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
@@ -24,9 +24,9 @@ internal class Vp8LEncoder : IDisposable
         /// </summary>
         private readonly int[] scratch = new int[256];
 
-        private int[][] histoArgb = { new int[256], new int[256], new int[256], new int[256] };
+        private readonly int[][] histoArgb = { new int[256], new int[256], new int[256], new int[256] };
 
-        private int[][] bestHisto = { new int[256], new int[256], new int[256], new int[256] };
+        private readonly int[][] bestHisto = { new int[256], new int[256], new int[256], new int[256] };
 
         /// <summary>
         /// The <see cref="MemoryAllocator"/> to use for buffer allocations.

From 35d2afa0bb4be7e50d26d5ae5435dbcaa6ece4c9 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Mon, 1 Nov 2021 20:18:21 +0100
Subject: [PATCH 21/85] Add sse2 version of select

---
 .../Formats/Webp/Lossless/LosslessUtils.cs    | 60 +++++++++++++++----
 .../Formats/Webp/Lossless/PredictorEncoder.cs | 27 +++++----
 2 files changed, 64 insertions(+), 23 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index b7f94415be..7e21517d20 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -27,6 +27,10 @@ internal static unsafe class LosslessUtils
 
         private const double Log2Reciprocal = 1.44269504088896338700465094007086;
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector128<byte> Zero = Vector128.Create(0).AsByte();
+#endif
+
         /// <summary>
         /// Returns the exact index where array1 and array2 are different. For an index
         /// inferior or equal to bestLenMatch, the return value just has to be strictly
@@ -551,6 +555,7 @@ public static void PredictorInverseTransform(
                     int mask = tileWidth - 1;
                     int tilesPerRow = SubSampleSize(width, transform.Bits);
                     int predictorModeIdxBase = (y >> transform.Bits) * tilesPerRow;
+                    Span<short> scratch = stackalloc short[8];
                     while (y < yEnd)
                     {
                         int predictorModeIdx = predictorModeIdxBase;
@@ -608,7 +613,7 @@ public static void PredictorInverseTransform(
                                     PredictorAdd10(input + x, output + x - width, xEnd - x, output + x);
                                     break;
                                 case 11:
-                                    PredictorAdd11(input + x, output + x - width, xEnd - x, output + x);
+                                    PredictorAdd11(input + x, output + x - width, xEnd - x, output + x, scratch);
                                     break;
                                 case 12:
                                     PredictorAdd12(input + x, output + x - width, xEnd - x, output + x);
@@ -974,11 +979,11 @@ private static void PredictorAdd10(uint* input, uint* upper, int numberOfPixels,
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output)
+        private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output, Span<short> scratch)
         {
             for (int x = 0; x < numberOfPixels; x++)
             {
-                uint pred = Predictor11(output[x - 1], upper + x);
+                uint pred = Predictor11(output[x - 1], upper + x, scratch);
                 output[x] = AddPixels(input[x], pred);
             }
         }
@@ -1031,7 +1036,7 @@ private static void PredictorAdd13(uint* input, uint* upper, int numberOfPixels,
         public static uint Predictor10(uint left, uint* top) => Average4(left, top[-1], top[0], top[1]);
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static uint Predictor11(uint left, uint* top) => Select(top[0], left, top[-1]);
+        public static uint Predictor11(uint left, uint* top, Span<short> scratch) => Select(top[0], left, top[-1], scratch);
 
         [MethodImpl(InliningOptions.ShortMethod)]
         public static uint Predictor12(uint left, uint* top) => ClampedAddSubtractFull(left, top[0], top[-1]);
@@ -1148,11 +1153,11 @@ public static void PredictorSub10(uint* input, uint* upper, int numPixels, uint*
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output)
+        public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output, Span<short> scratch)
         {
             for (int x = 0; x < numPixels; x++)
             {
-                uint pred = Predictor11(input[x - 1], upper + x);
+                uint pred = Predictor11(input[x - 1], upper + x, scratch);
                 output[x] = SubPixels(input[x], pred);
             }
         }
@@ -1240,14 +1245,43 @@ private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
         private static Vector128<int> MkCst16(int hi, int lo) => Vector128.Create((hi << 16) | (lo & 0xffff));
 #endif
 
-        private static uint Select(uint a, uint b, uint c)
+        private static uint Select(uint a, uint b, uint c, Span<short> scratch)
         {
-            int paMinusPb =
-                Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) +
-                Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) +
-                Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) +
-                Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff));
-            return paMinusPb <= 0 ? a : b;
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
+            {
+                Span<short> output = scratch;
+                fixed (short* p = output)
+                {
+                    Vector128<byte> a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte();
+                    Vector128<byte> b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte();
+                    Vector128<byte> c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte();
+                    Vector128<byte> ac0 = Sse2.SubtractSaturate(a0, c0);
+                    Vector128<byte> ca0 = Sse2.SubtractSaturate(c0, a0);
+                    Vector128<byte> bc0 = Sse2.SubtractSaturate(b0, c0);
+                    Vector128<byte> cb0 = Sse2.SubtractSaturate(c0, b0);
+                    Vector128<byte> ac = Sse2.Or(ac0, ca0);
+                    Vector128<byte> bc = Sse2.Or(bc0, cb0);
+                    Vector128<byte> pa = Sse2.UnpackLow(ac, Zero); // |a - c|
+                    Vector128<byte> pb = Sse2.UnpackLow(bc, Zero); // |b - c|
+                    Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16());
+                    Sse2.Store((ushort*)p, diff);
+                }
+
+                int paMinusPb = output[0] + output[1] + output[2] + output[3];
+
+                return (paMinusPb <= 0) ? a : b;
+            }
+            else
+#endif
+            {
+                int paMinusPb =
+                    Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) +
+                    Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) +
+                    Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) +
+                    Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff));
+                return paMinusPb <= 0 ? a : b;
+            }
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
index 671e9a043e..2c70faa0d8 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
@@ -50,6 +50,7 @@ public static void ResidualImage(
             int tilesPerRow = LosslessUtils.SubSampleSize(width, bits);
             int tilesPerCol = LosslessUtils.SubSampleSize(height, bits);
             int maxQuantization = 1 << LosslessUtils.NearLosslessBits(nearLosslessQuality);
+            Span<short> scratch = stackalloc short[8];
 
             // TODO: Can we optimize this?
             int[][] histo = new int[4][];
@@ -84,7 +85,8 @@ public static void ResidualImage(
                             transparentColorMode,
                             usedSubtractGreen,
                             nearLossless,
-                            image);
+                            image,
+                            scratch);
 
                         image[(tileY * tilesPerRow) + tileX] = (uint)(WebpConstants.ArgbBlack | (pred << 8));
                     }
@@ -192,7 +194,8 @@ private static int GetBestPredictorForTile(
             WebpTransparentColorMode transparentColorMode,
             bool usedSubtractGreen,
             bool nearLossless,
-            Span<uint> modes)
+            Span<uint> modes,
+            Span<short> scratch)
         {
             const int numPredModes = 14;
             int startX = tileX << bits;
@@ -272,7 +275,7 @@ private static int GetBestPredictorForTile(
                         }
                     }
 
-                    GetResidual(width, height, upperRow, currentRow, maxDiffs, mode, startX, startX + maxX, y, maxQuantization, transparentColorMode, usedSubtractGreen, nearLossless, residuals);
+                    GetResidual(width, height, upperRow, currentRow, maxDiffs, mode, startX, startX + maxX, y, maxQuantization, transparentColorMode, usedSubtractGreen, nearLossless, residuals, scratch);
                     for (int relativeX = 0; relativeX < maxX; ++relativeX)
                     {
                         UpdateHisto(histoArgb, residuals[relativeX]);
@@ -333,11 +336,12 @@ private static void GetResidual(
             WebpTransparentColorMode transparentColorMode,
             bool usedSubtractGreen,
             bool nearLossless,
-            Span<uint> output)
+            Span<uint> output,
+            Span<short> scratch)
         {
             if (transparentColorMode == WebpTransparentColorMode.Preserve)
             {
-                PredictBatch(mode, xStart, y, xEnd - xStart, currentRowSpan, upperRowSpan, output);
+                PredictBatch(mode, xStart, y, xEnd - xStart, currentRowSpan, upperRowSpan, output, scratch);
             }
             else
             {
@@ -395,7 +399,7 @@ private static void GetResidual(
                                     predict = LosslessUtils.Predictor10(currentRow[x - 1], upperRow + x);
                                     break;
                                 case 11:
-                                    predict = LosslessUtils.Predictor11(currentRow[x - 1], upperRow + x);
+                                    predict = LosslessUtils.Predictor11(currentRow[x - 1], upperRow + x, scratch);
                                     break;
                                 case 12:
                                     predict = LosslessUtils.Predictor12(currentRow[x - 1], upperRow + x);
@@ -583,6 +587,7 @@ private static void CopyImageWithPrediction(
             Span<byte> currentMaxDiffs = MemoryMarshal.Cast<uint, byte>(currentRow.Slice(width + 1));
 
             Span<byte> lowerMaxDiffs = currentMaxDiffs.Slice(width);
+            Span<short> scratch = stackalloc short[8];
             for (int y = 0; y < height; y++)
             {
                 Span<uint> tmp32 = upperRow;
@@ -593,7 +598,7 @@ private static void CopyImageWithPrediction(
 
                 if (lowEffort)
                 {
-                    PredictBatch(PredLowEffort, 0, y, width, currentRow, upperRow, argb.Slice(y * width));
+                    PredictBatch(PredLowEffort, 0, y, width, currentRow, upperRow, argb.Slice(y * width), scratch);
                 }
                 else
                 {
@@ -634,7 +639,8 @@ private static void CopyImageWithPrediction(
                             transparentColorMode,
                             usedSubtractGreen,
                             nearLossless,
-                            argb.Slice((y * width) + x));
+                            argb.Slice((y * width) + x),
+                            scratch);
 
                         x = xEnd;
                     }
@@ -649,7 +655,8 @@ private static void PredictBatch(
             int numPixels,
             Span<uint> currentSpan,
             Span<uint> upperSpan,
-            Span<uint> outputSpan)
+            Span<uint> outputSpan,
+            Span<short> scratch)
         {
 #pragma warning disable SA1503 // Braces should not be omitted
             fixed (uint* current = currentSpan)
@@ -718,7 +725,7 @@ private static void PredictBatch(
                             LosslessUtils.PredictorSub10(current + xStart, upper + xStart, numPixels, output);
                             break;
                         case 11:
-                            LosslessUtils.PredictorSub11(current + xStart, upper + xStart, numPixels, output);
+                            LosslessUtils.PredictorSub11(current + xStart, upper + xStart, numPixels, output, scratch);
                             break;
                         case 12:
                             LosslessUtils.PredictorSub12(current + xStart, upper + xStart, numPixels, output);

From de6bd9de7953d693b6e1a04007b2796507f65e0f Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Mon, 1 Nov 2021 21:29:10 +0100
Subject: [PATCH 22/85] Use Vector128<byte>.Zero

---
 src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 7e21517d20..22c2333607 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -27,10 +27,6 @@ internal static unsafe class LosslessUtils
 
         private const double Log2Reciprocal = 1.44269504088896338700465094007086;
 
-#if SUPPORTS_RUNTIME_INTRINSICS
-        private static readonly Vector128<byte> Zero = Vector128.Create(0).AsByte();
-#endif
-
         /// <summary>
         /// Returns the exact index where array1 and array2 are different. For an index
         /// inferior or equal to bestLenMatch, the return value just has to be strictly
@@ -1262,8 +1258,8 @@ private static uint Select(uint a, uint b, uint c, Span<short> scratch)
                     Vector128<byte> cb0 = Sse2.SubtractSaturate(c0, b0);
                     Vector128<byte> ac = Sse2.Or(ac0, ca0);
                     Vector128<byte> bc = Sse2.Or(bc0, cb0);
-                    Vector128<byte> pa = Sse2.UnpackLow(ac, Zero); // |a - c|
-                    Vector128<byte> pb = Sse2.UnpackLow(bc, Zero); // |b - c|
+                    Vector128<byte> pa = Sse2.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
+                    Vector128<byte> pb = Sse2.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
                     Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16());
                     Sse2.Store((ushort*)p, diff);
                 }

From 143de220b75abd8bf44f7943650a36cbaa3f7421 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 10:55:49 +0100
Subject: [PATCH 23/85] Add Predictor11 test

---
 .../Formats/WebP/LosslessUtilsTests.cs        | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs
index be7bc27d3a..bf381ebdaa 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs
@@ -132,6 +132,30 @@ private static void RunTransformColorInverseTest()
             Assert.Equal(expectedOutput, pixelData);
         }
 
+        private static void RunPredictor11Test()
+        {
+            // arrange
+            uint[] topData = { 4278258949, 4278258949 };
+            uint left = 4294839812;
+            short[] scratch = new short[8];
+            uint expectedResult = 4294839812;
+
+            // act
+            unsafe
+            {
+                fixed (uint* top = &topData[1])
+                {
+                    uint actual = LosslessUtils.Predictor11(left, top, scratch);
+
+                    // assert
+                    Assert.Equal(expectedResult, actual);
+                }
+            }
+        }
+
+        [Fact]
+        public void Predictor11_Works() => RunPredictor11Test();
+
         [Fact]
         public void SubtractGreen_Works() => RunSubtractGreenTest();
 
@@ -145,6 +169,12 @@ private static void RunTransformColorInverseTest()
         public void TransformColorInverse_Works() => RunTransformColorInverseTest();
 
 #if SUPPORTS_RUNTIME_INTRINSICS
+        [Fact]
+        public void Predictor11_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void Predictor11_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.DisableSSE2);
+
         [Fact]
         public void SubtractGreen_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.AllowAll);
 

From fd07436736d721bedfbafc308d902aa1e7765778 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 12:40:04 +0100
Subject: [PATCH 24/85] Replace Guard with DebugGuard in FastSLog2Slow

---
 src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 22c2333607..ebebe79547 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -780,7 +780,7 @@ public static void ColorCodeToMultipliers(uint colorCode, ref Vp8LMultipliers m)
 
         private static float FastSLog2Slow(uint v)
         {
-            Guard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v));
+            DebugGuard.MustBeGreaterThanOrEqualTo<uint>(v, LogLookupIdxMax, nameof(v));
             if (v < ApproxLogWithCorrectionMax)
             {
                 int logCnt = 0;

From 2bf16bcb58556d6f3cbee5298472db42af60bd02 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 12:41:43 +0100
Subject: [PATCH 25/85] Reverse access to output array to remove bounds checks

---
 src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index ebebe79547..b278b12bc9 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -1262,11 +1262,9 @@ private static uint Select(uint a, uint b, uint c, Span<short> scratch)
                     Vector128<byte> pb = Sse2.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
                     Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16());
                     Sse2.Store((ushort*)p, diff);
+                    int paMinusPb = output[3] + output[2] + output[1] + output[0];
+                    return (paMinusPb <= 0) ? a : b;
                 }
-
-                int paMinusPb = output[0] + output[1] + output[2] + output[3];
-
-                return (paMinusPb <= 0) ? a : b;
             }
             else
 #endif

From a7ed1884e0f9439c03d913f4d4a5f2b36d38071e Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 14:15:13 +0100
Subject: [PATCH 26/85] Add sse2 version of ClampedAddSubtractHalf

---
 .../Formats/Webp/Lossless/LosslessUtils.cs    | 32 +++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index b278b12bc9..0dda5a79a6 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -1219,12 +1219,32 @@ private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2)
 
         private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
         {
-            uint ave = Average2(c0, c1);
-            int a = AddSubtractComponentHalf((int)(ave >> 24), (int)(c2 >> 24));
-            int r = AddSubtractComponentHalf((int)((ave >> 16) & 0xff), (int)((c2 >> 16) & 0xff));
-            int g = AddSubtractComponentHalf((int)((ave >> 8) & 0xff), (int)((c2 >> 8) & 0xff));
-            int b = AddSubtractComponentHalf((int)(ave & 0xff), (int)(c2 & 0xff));
-            return ((uint)a << 24) | ((uint)r << 16) | ((uint)g << 8) | (uint)b;
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
+            {
+                Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> b0 = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
+                Vector128<short> avg = Sse2.Add(c1Vec.AsInt16(), c0Vec.AsInt16());
+                Vector128<short> a0 = Sse2.ShiftRightLogical(avg, 1);
+                Vector128<short> a1 = Sse2.Subtract(a0, b0.AsInt16());
+                Vector128<short> bgta = Sse2.CompareGreaterThan(b0.AsInt16(), a0.AsInt16());
+                Vector128<short> a2 = Sse2.Subtract(a1, bgta);
+                Vector128<short> a3 = Sse2.ShiftRightArithmetic(a2.AsInt16(), 1);
+                Vector128<short> a4 = Sse2.Add(a0.AsInt16(), a3).AsInt16();
+                Vector128<byte> a5 = Sse2.PackUnsignedSaturate(a4, a4);
+                uint output = Sse2.ConvertToUInt32(a5.AsUInt32());
+                return output;
+            }
+#endif
+            {
+                uint ave = Average2(c0, c1);
+                int a = AddSubtractComponentHalf((int)(ave >> 24), (int)(c2 >> 24));
+                int r = AddSubtractComponentHalf((int)((ave >> 16) & 0xff), (int)((c2 >> 16) & 0xff));
+                int g = AddSubtractComponentHalf((int)((ave >> 8) & 0xff), (int)((c2 >> 8) & 0xff));
+                int b = AddSubtractComponentHalf((int)(ave & 0xff), (int)(c2 & 0xff));
+                return ((uint)a << 24) | ((uint)r << 16) | ((uint)g << 8) | (uint)b;
+            }
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]

From 28053739a9beeed006fd256a0ea8016631660841 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 14:20:33 +0100
Subject: [PATCH 27/85] Add sse2 version of ClampedAddSubtractFull

---
 .../Formats/Webp/Lossless/LosslessUtils.cs    | 42 ++++++++++++-------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 0dda5a79a6..7740dc0515 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -1201,20 +1201,34 @@ public static uint AddPixels(uint a, uint b)
 
         private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2)
         {
-            int a = AddSubtractComponentFull(
-                (int)(c0 >> 24),
-                (int)(c1 >> 24),
-                (int)(c2 >> 24));
-            int r = AddSubtractComponentFull(
-                (int)((c0 >> 16) & 0xff),
-                (int)((c1 >> 16) & 0xff),
-                (int)((c2 >> 16) & 0xff));
-            int g = AddSubtractComponentFull(
-                (int)((c0 >> 8) & 0xff),
-                (int)((c1 >> 8) & 0xff),
-                (int)((c2 >> 8) & 0xff));
-            int b = AddSubtractComponentFull((int)(c0 & 0xff), (int)(c1 & 0xff), (int)(c2 & 0xff));
-            return ((uint)a << 24) | ((uint)r << 16) | ((uint)g << 8) | (uint)b;
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
+            {
+                Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> v1 = Sse2.Add(c0Vec, c1Vec);
+                Vector128<byte> v2 = Sse2.Subtract(v1, c2Vec);
+                Vector128<byte> b = Sse2.PackUnsignedSaturate(v2.AsInt16(), v2.AsInt16());
+                uint output = Sse2.ConvertToUInt32(b.AsUInt32());
+            }
+#endif
+            {
+                int a = AddSubtractComponentFull(
+                    (int)(c0 >> 24),
+                    (int)(c1 >> 24),
+                    (int)(c2 >> 24));
+                int r = AddSubtractComponentFull(
+                    (int)((c0 >> 16) & 0xff),
+                    (int)((c1 >> 16) & 0xff),
+                    (int)((c2 >> 16) & 0xff));
+                int g = AddSubtractComponentFull(
+                    (int)((c0 >> 8) & 0xff),
+                    (int)((c1 >> 8) & 0xff),
+                    (int)((c2 >> 8) & 0xff));
+                int b = AddSubtractComponentFull((int)(c0 & 0xff), (int)(c1 & 0xff), (int)(c2 & 0xff));
+                return ((uint)a << 24) | ((uint)r << 16) | ((uint)g << 8) | (uint)b;
+            }
         }
 
         private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)

From f6dbc7dd8ee95115315805dab2b9b38684e505b2 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 14:40:59 +0100
Subject: [PATCH 28/85] Fix issue in ClampedAddSubtractFull

---
 src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 7740dc0515..65b39bd2d7 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -1207,10 +1207,11 @@ private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2)
                 Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
                 Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
                 Vector128<byte> c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
-                Vector128<byte> v1 = Sse2.Add(c0Vec, c1Vec);
-                Vector128<byte> v2 = Sse2.Subtract(v1, c2Vec);
+                Vector128<short> v1 = Sse2.Add(c0Vec.AsInt16(), c1Vec.AsInt16());
+                Vector128<short> v2 = Sse2.Subtract(v1, c2Vec.AsInt16());
                 Vector128<byte> b = Sse2.PackUnsignedSaturate(v2.AsInt16(), v2.AsInt16());
                 uint output = Sse2.ConvertToUInt32(b.AsUInt32());
+                return output;
             }
 #endif
             {

From 8fe280e9918e14ca2abb7ffd21ae35c969429447 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 16:04:29 +0100
Subject: [PATCH 29/85] Add predictor 12 and 13 tests

---
 .../Formats/WebP/LosslessUtilsTests.cs        | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs
index bf381ebdaa..c70f332ef6 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs
@@ -153,9 +153,55 @@ private static void RunPredictor11Test()
             }
         }
 
+        private static void RunPredictor12Test()
+        {
+            // arrange
+            uint[] topData = { 4294844413, 4294779388 };
+            uint left = 4294844413;
+            uint expectedResult = 4294779388;
+
+            // act
+            unsafe
+            {
+                fixed (uint* top = &topData[1])
+                {
+                    uint actual = LosslessUtils.Predictor12(left, top);
+
+                    // assert
+                    Assert.Equal(expectedResult, actual);
+                }
+            }
+        }
+
+        private static void RunPredictor13Test()
+        {
+            // arrange
+            uint[] topData = { 4278193922, 4278193666 };
+            uint left = 4278193410;
+            uint expectedResult = 4278193154;
+
+            // act
+            unsafe
+            {
+                fixed (uint* top = &topData[1])
+                {
+                    uint actual = LosslessUtils.Predictor13(left, top);
+
+                    // assert
+                    Assert.Equal(expectedResult, actual);
+                }
+            }
+        }
+
         [Fact]
         public void Predictor11_Works() => RunPredictor11Test();
 
+        [Fact]
+        public void Predictor12_Works() => RunPredictor12Test();
+
+        [Fact]
+        public void Predictor13_Works() => RunPredictor13Test();
+
         [Fact]
         public void SubtractGreen_Works() => RunSubtractGreenTest();
 
@@ -175,6 +221,18 @@ private static void RunPredictor11Test()
         [Fact]
         public void Predictor11_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.DisableSSE2);
 
+        [Fact]
+        public void Predictor12_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor12Test, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void Predictor12_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor12Test, HwIntrinsics.DisableSSE2);
+
+        [Fact]
+        public void Predictor13_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor13Test, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void Predictor13_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor13Test, HwIntrinsics.DisableSSE2);
+
         [Fact]
         public void SubtractGreen_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.AllowAll);
 

From ffdf99bad2d8f4fb9d52a3938f3c64d750f09957 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 16:29:52 +0100
Subject: [PATCH 30/85] Add aggressive inlining

---
 src/ImageSharp/Formats/Webp/Lossless/ColorCache.cs    | 8 ++++++++
 src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 1 +
 2 files changed, 9 insertions(+)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/ColorCache.cs b/src/ImageSharp/Formats/Webp/Lossless/ColorCache.cs
index 8596d85558..02bbc38fcf 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/ColorCache.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/ColorCache.cs
@@ -1,6 +1,8 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
+using System.Runtime.CompilerServices;
+
 namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 {
     /// <summary>
@@ -41,6 +43,7 @@ public void Init(int hashBits)
         /// Inserts a new color into the cache.
         /// </summary>
         /// <param name="bgra">The color to insert.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public void Insert(uint bgra)
         {
             int key = HashPix(bgra, this.HashShift);
@@ -52,6 +55,7 @@ public void Insert(uint bgra)
         /// </summary>
         /// <param name="key">The key to lookup.</param>
         /// <returns>The color for the key.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public uint Lookup(int key) => this.Colors[key];
 
         /// <summary>
@@ -59,6 +63,7 @@ public void Insert(uint bgra)
         /// </summary>
         /// <param name="bgra">The color to check.</param>
         /// <returns>The index of the color in the cache or -1 if its not present.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public int Contains(uint bgra)
         {
             int key = HashPix(bgra, this.HashShift);
@@ -70,6 +75,7 @@ public int Contains(uint bgra)
         /// </summary>
         /// <param name="bgra">The color.</param>
         /// <returns>The index for the color.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public int GetIndex(uint bgra) => HashPix(bgra, this.HashShift);
 
         /// <summary>
@@ -77,8 +83,10 @@ public int Contains(uint bgra)
         /// </summary>
         /// <param name="key">The key.</param>
         /// <param name="bgra">The color to add.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public void Set(uint key, uint bgra) => this.Colors[key] = bgra;
 
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static int HashPix(uint argb, int shift) => (int)((argb * HashMul) >> shift);
     }
 }
diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 65b39bd2d7..9baa6c3c33 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -752,6 +752,7 @@ public static byte TransformColorBlue(sbyte greenToBlue, sbyte redToBlue, uint a
         /// <summary>
         /// Fast calculation of log2(v) for integer input.
         /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static float FastLog2(uint v) => v < LogLookupIdxMax ? WebpLookupTables.Log2Table[v] : FastLog2Slow(v);
 
         /// <summary>

From fc8d8b81d98201955655595fe682a0c5533eb6ea Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 21:56:19 +0100
Subject: [PATCH 31/85] Remove unnecessary cast AsInt16()

---
 src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 9baa6c3c33..8bd3163ccb 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -1210,7 +1210,7 @@ private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2)
                 Vector128<byte> c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
                 Vector128<short> v1 = Sse2.Add(c0Vec.AsInt16(), c1Vec.AsInt16());
                 Vector128<short> v2 = Sse2.Subtract(v1, c2Vec.AsInt16());
-                Vector128<byte> b = Sse2.PackUnsignedSaturate(v2.AsInt16(), v2.AsInt16());
+                Vector128<byte> b = Sse2.PackUnsignedSaturate(v2, v2);
                 uint output = Sse2.ConvertToUInt32(b.AsUInt32());
                 return output;
             }

From 1e4352b8a1a2468d8a34297c1650c3e7b8e19fb7 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Wed, 3 Nov 2021 10:25:02 +0100
Subject: [PATCH 32/85] Remove unnecessary SetEndOfStream; all bytes have
 already been read from the stream in BitReaderBase

---
 .../Formats/Webp/BitReader/Vp8LBitReader.cs          | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/BitReader/Vp8LBitReader.cs b/src/ImageSharp/Formats/Webp/BitReader/Vp8LBitReader.cs
index 07423e3127..4df2feba81 100644
--- a/src/ImageSharp/Formats/Webp/BitReader/Vp8LBitReader.cs
+++ b/src/ImageSharp/Formats/Webp/BitReader/Vp8LBitReader.cs
@@ -138,7 +138,6 @@ public uint ReadValue(int nBits)
                 return (uint)val;
             }
 
-            this.SetEndOfStream();
             return 0;
         }
 
@@ -203,17 +202,6 @@ private void ShiftBytes()
                 ++this.pos;
                 this.bitPos -= 8;
             }
-
-            if (this.IsEndOfStream())
-            {
-                this.SetEndOfStream();
-            }
-        }
-
-        private void SetEndOfStream()
-        {
-            this.Eos = true;
-            this.bitPos = 0; // To avoid undefined behaviour with shifts.
         }
     }
 }

From 47794dfbcb192ec8c610a5e21d03da8b279ef5e1 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Wed, 3 Nov 2021 10:36:29 +0100
Subject: [PATCH 33/85] Change Guard to DebugGuard in ReadValue

---
 src/ImageSharp/Formats/Webp/BitReader/Vp8BitReader.cs | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/BitReader/Vp8BitReader.cs b/src/ImageSharp/Formats/Webp/BitReader/Vp8BitReader.cs
index abf44127a9..d6ceca5bf5 100644
--- a/src/ImageSharp/Formats/Webp/BitReader/Vp8BitReader.cs
+++ b/src/ImageSharp/Formats/Webp/BitReader/Vp8BitReader.cs
@@ -142,10 +142,11 @@ public int GetSigned(int v)
         [MethodImpl(InliningOptions.ShortMethod)]
         public bool ReadBool() => this.ReadValue(1) is 1;
 
+        [MethodImpl(InliningOptions.ShortMethod)]
         public uint ReadValue(int nBits)
         {
-            Guard.MustBeGreaterThan(nBits, 0, nameof(nBits));
-            Guard.MustBeLessThanOrEqualTo(nBits, 32, nameof(nBits));
+            DebugGuard.MustBeGreaterThan(nBits, 0, nameof(nBits));
+            DebugGuard.MustBeLessThanOrEqualTo(nBits, 32, nameof(nBits));
 
             uint v = 0;
             while (nBits-- > 0)
@@ -156,10 +157,11 @@ public uint ReadValue(int nBits)
             return v;
         }
 
+        [MethodImpl(InliningOptions.ShortMethod)]
         public int ReadSignedValue(int nBits)
         {
-            Guard.MustBeGreaterThan(nBits, 0, nameof(nBits));
-            Guard.MustBeLessThanOrEqualTo(nBits, 32, nameof(nBits));
+            DebugGuard.MustBeGreaterThan(nBits, 0, nameof(nBits));
+            DebugGuard.MustBeLessThanOrEqualTo(nBits, 32, nameof(nBits));
 
             int value = (int)this.ReadValue(nBits);
             return this.ReadValue(1) != 0 ? -value : value;

From f9212f7adca384b1147af10a38e3ec0d8dcc12d2 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Wed, 3 Nov 2021 22:38:52 +1100
Subject: [PATCH 34/85] Update
 tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs

Co-authored-by: Anton Firszov <antonfir@gmail.com>
---
 tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs b/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
index d3f3cf126e..af35d1f895 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
@@ -92,7 +92,7 @@ public void Bgr24()
             [Fact]
             public void TPixel()
             {
-                var source = new RgbaVector(1, .1F, .133F, .864F);
+                var source = new RgbaVector(float.Epsilon, 2 * float.Epsilon, float.MaxValue, float.MinValue);
 
                 // Act:
                 var color = Color.FromPixel(source);

From 4598b1461801d1893c61e66ae75d34d1249c4bf3 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Wed, 3 Nov 2021 13:00:05 +0100
Subject: [PATCH 35/85] Use ReadOnlySpan<byte> for byte and sbyte arrays

---
 .../Formats/Webp/WebpLookupTables.cs          | 21 ++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/WebpLookupTables.cs b/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
index 3b5d677293..bf47b01bca 100644
--- a/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
+++ b/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
@@ -253,7 +253,8 @@ internal static class WebpLookupTables
             0
         };
 
-        public static readonly byte[] NewRange =
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        public static ReadOnlySpan<byte> NewRange => new byte[]
         {
             // range = ((range + 1) << kVP8Log2Range[range]) - 1
             127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239,
@@ -571,7 +572,8 @@ internal static class WebpLookupTables
         };
 
         // Paragraph 14.1
-        public static readonly byte[] DcTable =
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        public static ReadOnlySpan<byte> DcTable => new byte[]
         {
             4,     5,   6,   7,   8,   9,  10,  10,
             11,   12,  13,  14,  15,  16,  17,  17,
@@ -1046,7 +1048,8 @@ public static readonly (int Code, int ExtraBits)[] PrefixEncodeCode =
             (17, 7), (17, 7), (17, 7), (17, 7), (17, 7), (17, 7), (17, 7), (17, 7),
         };
 
-        public static readonly byte[] PrefixEncodeExtraBitsValue =
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        public static ReadOnlySpan<byte> PrefixEncodeExtraBitsValue => new byte[]
         {
            0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  1,  2,  3,  0,  1,  2,  3,
            0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
@@ -1241,7 +1244,8 @@ static WebpLookupTables()
             InitializeFixedCostsI4();
         }
 
-        private static readonly byte[] Abs0Table =
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        private static ReadOnlySpan<byte> Abs0Table => new byte[]
         {
             0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0, 0xef,
             0xee, 0xed, 0xec, 0xeb, 0xea, 0xe9, 0xe8, 0xe7, 0xe6, 0xe5, 0xe4, 0xe3, 0xe2, 0xe1, 0xe0, 0xdf, 0xde,
@@ -1276,7 +1280,8 @@ static WebpLookupTables()
             0xff
         };
 
-        private static readonly byte[] Clip1Table =
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        private static ReadOnlySpan<byte> Clip1Table => new byte[]
         {
             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -1326,7 +1331,8 @@ static WebpLookupTables()
             0xff
         };
 
-        private static readonly sbyte[] Sclip1Table =
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        private static ReadOnlySpan<sbyte> Sclip1Table => new sbyte[]
         {
             -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
             -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
@@ -1437,7 +1443,8 @@ static WebpLookupTables()
             127, 127, 127, 127, 127, 127, 127, 127, 127
         };
 
-        private static readonly sbyte[] Sclip2Table =
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        private static ReadOnlySpan<sbyte> Sclip2Table => new sbyte[]
         {
             -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
             -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,

From 425600459e96cc5d34857fd9e0de45952fa8e6ae Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Wed, 3 Nov 2021 23:49:32 +1100
Subject: [PATCH 36/85] Update Color.Equals

---
 src/ImageSharp/Color/Color.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Color/Color.cs b/src/ImageSharp/Color/Color.cs
index fe66efcfb5..61d6c8e6d5 100644
--- a/src/ImageSharp/Color/Color.cs
+++ b/src/ImageSharp/Color/Color.cs
@@ -277,7 +277,7 @@ public bool Equals(Color other)
                 return this.data.PackedValue == other.data.PackedValue;
             }
 
-            return this.ToVector4().Equals(other.ToVector4());
+            return this.boxedHighPrecisionPixel?.Equals(other.boxedHighPrecisionPixel) == true;
         }
 
         /// <inheritdoc />

From 08785103e350266f626b3519b22e3966b4450caa Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 4 Nov 2021 12:39:42 +0100
Subject: [PATCH 37/85] Add EntropyPasses default value explicit to 1

---
 src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs | 1 +
 src/ImageSharp/Formats/Webp/WebpEncoder.cs         | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs b/src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs
index 7dbf49d45e..000de4f88c 100644
--- a/src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs
+++ b/src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs
@@ -35,6 +35,7 @@ internal interface IWebpEncoderOptions
 
         /// <summary>
         /// Gets the number of entropy-analysis passes (in [1..10]).
+        /// Defaults to 1.
         /// </summary>
         int EntropyPasses { get; }
 
diff --git a/src/ImageSharp/Formats/Webp/WebpEncoder.cs b/src/ImageSharp/Formats/Webp/WebpEncoder.cs
index f85f65b635..bdcbb194b1 100644
--- a/src/ImageSharp/Formats/Webp/WebpEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/WebpEncoder.cs
@@ -27,7 +27,7 @@ public sealed class WebpEncoder : IImageEncoder, IWebpEncoderOptions
         public bool UseAlphaCompression { get; set; }
 
         /// <inheritdoc/>
-        public int EntropyPasses { get; set; }
+        public int EntropyPasses { get; set; } = 1;
 
         /// <inheritdoc/>
         public int SpatialNoiseShaping { get; set; } = 50;

From 947dc8d5ecff64414247ede191452cf8c7a77c26 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 4 Nov 2021 12:40:39 +0100
Subject: [PATCH 38/85] Make sure magick.net and imagesharp use the same
 configuration

---
 .../Codecs/EncodeWebp.cs                      | 45 ++++++++++++++++---
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs b/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
index 7d3dfe693c..59814f465c 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
@@ -4,6 +4,7 @@
 using System.IO;
 using BenchmarkDotNet.Attributes;
 using ImageMagick;
+using ImageMagick.Formats;
 using SixLabors.ImageSharp.Formats.Webp;
 using SixLabors.ImageSharp.PixelFormats;
 using SixLabors.ImageSharp.Tests;
@@ -44,8 +45,22 @@ public void Cleanup()
         public void MagickWebpLossy()
         {
             using var memoryStream = new MemoryStream();
-            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "lossless", false);
-            this.webpMagick.Write(memoryStream, MagickFormat.WebP);
+
+            var defines = new WebPWriteDefines
+            {
+                Lossless = false,
+                Method = 4,
+                AlphaCompression = WebPAlphaCompression.None,
+                FilterStrength = 60,
+                SnsStrength = 50,
+                Pass = 1,
+
+                // 100 means off.
+                NearLossless = 100
+            };
+
+            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "quality", 75);
+            this.webpMagick.Write(memoryStream, defines);
         }
 
         [Benchmark(Description = "ImageSharp Webp Lossy")]
@@ -54,7 +69,12 @@ public void ImageSharpWebpLossy()
             using var memoryStream = new MemoryStream();
             this.webp.Save(memoryStream, new WebpEncoder()
             {
-                FileFormat = WebpFileFormatType.Lossy
+                FileFormat = WebpFileFormatType.Lossy,
+                Method = WebpEncodingMethod.Level4,
+                UseAlphaCompression = false,
+                FilterStrength = 60,
+                SpatialNoiseShaping = 50,
+                EntropyPasses = 1
             });
         }
 
@@ -62,8 +82,18 @@ public void ImageSharpWebpLossy()
         public void MagickWebpLossless()
         {
             using var memoryStream = new MemoryStream();
-            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "lossless", true);
-            this.webpMagick.Write(memoryStream, MagickFormat.WebP);
+            var defines = new WebPWriteDefines
+            {
+                Lossless = true,
+                Method = 4,
+
+                // 100 means off.
+                NearLossless = 100
+            };
+
+            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "exact", false);
+            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "quality", 75);
+            this.webpMagick.Write(memoryStream, defines);
         }
 
         [Benchmark(Description = "ImageSharp Webp Lossless")]
@@ -72,7 +102,10 @@ public void ImageSharpWebpLossless()
             using var memoryStream = new MemoryStream();
             this.webp.Save(memoryStream, new WebpEncoder()
             {
-                FileFormat = WebpFileFormatType.Lossless
+                FileFormat = WebpFileFormatType.Lossless,
+                Method = WebpEncodingMethod.Level4,
+                NearLossless = false,
+                TransparentColorMode = WebpTransparentColorMode.Clear
             });
         }
 

From 55b67ada2f659463f438303e77d0f1b1de4c47bc Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 4 Nov 2021 21:40:02 +0100
Subject: [PATCH 39/85] Use webpMagick.Quality for the quality parameter

---
 tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs b/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
index 59814f465c..2229849921 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
@@ -59,7 +59,7 @@ public void MagickWebpLossy()
                 NearLossless = 100
             };
 
-            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "quality", 75);
+            this.webpMagick.Quality = 75;
             this.webpMagick.Write(memoryStream, defines);
         }
 
@@ -91,8 +91,7 @@ public void MagickWebpLossless()
                 NearLossless = 100
             };
 
-            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "exact", false);
-            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "quality", 75);
+            this.webpMagick.Quality = 75;
             this.webpMagick.Write(memoryStream, defines);
         }
 
@@ -105,6 +104,8 @@ public void ImageSharpWebpLossless()
                 FileFormat = WebpFileFormatType.Lossless,
                 Method = WebpEncodingMethod.Level4,
                 NearLossless = false,
+
+                // This is equal to exact = false in libwebp, which is the default.
                 TransparentColorMode = WebpTransparentColorMode.Clear
             });
         }

From d6d952e477b0653b2750210ad4cd2d3fc14bbaec Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 4 Nov 2021 23:12:01 +0100
Subject: [PATCH 40/85] Remove another unnecessary cast AsInt16()

---
 src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 8bd3163ccb..ee9ea51237 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -1246,8 +1246,8 @@ private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
                 Vector128<short> a1 = Sse2.Subtract(a0, b0.AsInt16());
                 Vector128<short> bgta = Sse2.CompareGreaterThan(b0.AsInt16(), a0.AsInt16());
                 Vector128<short> a2 = Sse2.Subtract(a1, bgta);
-                Vector128<short> a3 = Sse2.ShiftRightArithmetic(a2.AsInt16(), 1);
-                Vector128<short> a4 = Sse2.Add(a0.AsInt16(), a3).AsInt16();
+                Vector128<short> a3 = Sse2.ShiftRightArithmetic(a2, 1);
+                Vector128<short> a4 = Sse2.Add(a0, a3).AsInt16();
                 Vector128<byte> a5 = Sse2.PackUnsignedSaturate(a4, a4);
                 uint output = Sse2.ConvertToUInt32(a5.AsUInt32());
                 return output;

From e97c364b373ffcc8bf11295ee9597bff3af7b927 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 5 Nov 2021 12:40:26 +0100
Subject: [PATCH 41/85] Use AsSpan() parameters to slice

---
 src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs
index 6320983bab..3c81f1a22c 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs
@@ -203,10 +203,10 @@ public static void GenerateOptimalTree(HuffmanTree[] tree, uint[] histogram, int
 
                 // Build the Huffman tree.
 #if NET5_0_OR_GREATER
-                Span<HuffmanTree> treeSlice = tree.AsSpan().Slice(0, treeSize);
+                Span<HuffmanTree> treeSlice = tree.AsSpan(0, treeSize);
                 treeSlice.Sort(HuffmanTree.Compare);
 #else
-                HuffmanTree[] treeCopy = tree.AsSpan().Slice(0, treeSize).ToArray();
+                HuffmanTree[] treeCopy = tree.AsSpan(0, treeSize).ToArray();
                 Array.Sort(treeCopy, HuffmanTree.Compare);
                 treeCopy.AsSpan().CopyTo(tree);
 #endif

From 2b6dbbce6fb6561a7fbddb0bd08afe69b9349382 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 5 Nov 2021 12:46:53 +0100
Subject: [PATCH 42/85] Update benchmark results

---
 .../Codecs/DecodeWebp.cs                      | 49 ++++++++---------
 .../Codecs/EncodeWebp.cs                      | 55 +++++++++----------
 2 files changed, 48 insertions(+), 56 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/DecodeWebp.cs b/tests/ImageSharp.Benchmarks/Codecs/DecodeWebp.cs
index 407a4ef3b2..878929823d 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/DecodeWebp.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/DecodeWebp.cs
@@ -76,34 +76,29 @@ public int WebpLossless()
             return image.Height;
         }
 
-        /* Results 17.06.2021
-         *  BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18362
+        /* Results 04.11.2021
+         *  BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19043.1320 (21H1/May2021Update)
             Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
-            .NET Core SDK=3.1.202
-              [Host]     : .NET Core 3.1.4 (CoreCLR 4.700.20.20201, CoreFX 4.700.20.22101), X64 RyuJIT
-              Job-AQFZAV : .NET Framework 4.8 (4.8.4180.0), X64 RyuJIT
-              Job-YCDAPQ : .NET Core 2.1.18 (CoreCLR 4.6.28801.04, CoreFX 4.6.28802.05), X64 RyuJIT
-              Job-WMTYOZ : .NET Core 3.1.4 (CoreCLR 4.700.20.20201, CoreFX 4.700.20.22101), X64 RyuJIT
-
-            IterationCount=3  LaunchCount=1  WarmupCount=3
-            |                     Method |        Job |       Runtime |        TestImageLossy |        TestImageLossless |       Mean |     Error |   StdDev |     Gen 0 |     Gen 1 | Gen 2 |   Allocated |
-            |--------------------------- |----------- |-------------- |---------------------- |------------------------- |-----------:|----------:|---------:|----------:|----------:|------:|------------:|
-            |        'Magick Lossy Webp' | Job-IERNAB |    .NET 4.7.2 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   105.8 ms |   6.28 ms |  0.34 ms |         - |         - |     - |    17.65 KB |
-            |    'ImageSharp Lossy Webp' | Job-IERNAB |    .NET 4.7.2 | Webp/earth_lossy.webp | Webp/earth_lossless.webp | 1,145.0 ms | 110.82 ms |  6.07 ms |         - |         - |     - |  2779.53 KB |
-            |     'Magick Lossless Webp' | Job-IERNAB |    .NET 4.7.2 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   145.9 ms |   8.55 ms |  0.47 ms |         - |         - |     - |    18.05 KB |
-            | 'ImageSharp Lossless Webp' | Job-IERNAB |    .NET 4.7.2 | Webp/earth_lossy.webp | Webp/earth_lossless.webp | 1,694.1 ms |  55.09 ms |  3.02 ms | 4000.0000 | 1000.0000 |     - | 30556.87 KB |
-            |        'Magick Lossy Webp' | Job-IMRAGJ | .NET Core 2.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   105.7 ms |   1.89 ms |  0.10 ms |         - |         - |     - |    15.75 KB |
-            |    'ImageSharp Lossy Webp' | Job-IMRAGJ | .NET Core 2.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   741.6 ms |  21.45 ms |  1.18 ms |         - |         - |     - |  2767.85 KB |
-            |     'Magick Lossless Webp' | Job-IMRAGJ | .NET Core 2.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   146.1 ms |   9.52 ms |  0.52 ms |         - |         - |     - |    16.54 KB |
-            | 'ImageSharp Lossless Webp' | Job-IMRAGJ | .NET Core 2.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   522.5 ms |  21.15 ms |  1.16 ms | 4000.0000 | 1000.0000 |     - | 22860.02 KB |
-            |        'Magick Lossy Webp' | Job-NAASQX | .NET Core 3.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   105.9 ms |   5.34 ms |  0.29 ms |         - |         - |     - |    15.45 KB |
-            |    'ImageSharp Lossy Webp' | Job-NAASQX | .NET Core 3.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   748.8 ms | 290.47 ms | 15.92 ms |         - |         - |     - |  2767.84 KB |
-            |     'Magick Lossless Webp' | Job-NAASQX | .NET Core 3.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   146.1 ms |   1.14 ms |  0.06 ms |         - |         - |     - |     15.9 KB |
-            | 'ImageSharp Lossless Webp' | Job-NAASQX | .NET Core 3.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   480.7 ms |  25.25 ms |  1.38 ms | 4000.0000 | 1000.0000 |     - |  22859.7 KB |
-            |        'Magick Lossy Webp' | Job-GLNACU | .NET Core 5.0 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   105.7 ms |   4.71 ms |  0.26 ms |         - |         - |     - |    15.48 KB |
-            |    'ImageSharp Lossy Webp' | Job-GLNACU | .NET Core 5.0 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   645.7 ms |  61.00 ms |  3.34 ms |         - |         - |     - |  2768.13 KB |
-            |     'Magick Lossless Webp' | Job-GLNACU | .NET Core 5.0 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   146.5 ms |  18.63 ms |  1.02 ms |         - |         - |     - |     15.8 KB |
-            | 'ImageSharp Lossless Webp' | Job-GLNACU | .NET Core 5.0 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   306.7 ms |  32.31 ms |  1.77 ms | 4000.0000 | 1000.0000 |     - | 22860.02 KB |
+            .NET SDK=6.0.100-rc.2.21505.57
+              [Host]     : .NET 5.0.11 (5.0.1121.47308), X64 RyuJIT
+              Job-WQLXJO : .NET 5.0.11 (5.0.1121.47308), X64 RyuJIT
+              Job-OJJAMD : .NET Core 3.1.20 (CoreCLR 4.700.21.47003, CoreFX 4.700.21.47101), X64 RyuJIT
+              Job-OMFOAS : .NET Framework 4.8 (4.8.4420.0), X64 RyuJIT
+
+            |                     Method |        Job |              Runtime |             Arguments |        TestImageLossy |        TestImageLossless |       Mean |     Error |  StdDev |    Gen 0 | Gen 1 | Gen 2 | Allocated |
+            |--------------------------- |----------- |--------------------- |---------------------- |---------------------- |------------------------- |-----------:|----------:|--------:|---------:|------:|------:|----------:|
+            |        'Magick Lossy Webp' | Job-HLWZLL |             .NET 5.0 | /p:DebugType=portable | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   107.9 ms |  28.91 ms | 1.58 ms |        - |     - |     - |     25 KB |
+            |    'ImageSharp Lossy Webp' | Job-HLWZLL |             .NET 5.0 | /p:DebugType=portable | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   282.3 ms |  25.40 ms | 1.39 ms | 500.0000 |     - |     - |  2,428 KB |
+            |     'Magick Lossless Webp' | Job-HLWZLL |             .NET 5.0 | /p:DebugType=portable | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   106.3 ms |  11.99 ms | 0.66 ms |        - |     - |     - |     16 KB |
+            | 'ImageSharp Lossless Webp' | Job-HLWZLL |             .NET 5.0 | /p:DebugType=portable | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   280.2 ms |   6.21 ms | 0.34 ms |        - |     - |     - |  2,092 KB |
+            |        'Magick Lossy Webp' | Job-ALQPDS |        .NET Core 3.1 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   106.2 ms |   9.32 ms | 0.51 ms |        - |     - |     - |     15 KB |
+            |    'ImageSharp Lossy Webp' | Job-ALQPDS |        .NET Core 3.1 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   295.8 ms |  21.25 ms | 1.16 ms | 500.0000 |     - |     - |  2,427 KB |
+            |     'Magick Lossless Webp' | Job-ALQPDS |        .NET Core 3.1 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   106.5 ms |   4.07 ms | 0.22 ms |        - |     - |     - |     15 KB |
+            | 'ImageSharp Lossless Webp' | Job-ALQPDS |        .NET Core 3.1 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   464.0 ms |  55.70 ms | 3.05 ms |        - |     - |     - |  2,090 KB |
+            |        'Magick Lossy Webp' | Job-RYVVNN | .NET Framework 4.7.2 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   108.0 ms |  29.60 ms | 1.62 ms |        - |     - |     - |     32 KB |
+            |    'ImageSharp Lossy Webp' | Job-RYVVNN | .NET Framework 4.7.2 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   564.9 ms |  29.69 ms | 1.63 ms |        - |     - |     - |  2,436 KB |
+            |     'Magick Lossless Webp' | Job-RYVVNN | .NET Framework 4.7.2 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   106.2 ms |   4.74 ms | 0.26 ms |        - |     - |     - |     18 KB |
+            | 'ImageSharp Lossless Webp' | Job-RYVVNN | .NET Framework 4.7.2 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp | 1,767.5 ms | 106.33 ms | 5.83 ms |        - |     - |     - |  9,729 KB |
          */
     }
 }
diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs b/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
index 2229849921..43d8c464ce 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
@@ -110,37 +110,34 @@ public void ImageSharpWebpLossless()
             });
         }
 
-        /* Results 17.06.2021
+        /* Results 04.11.2021
          * Summary *
-        BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.630 (2004/?/20H1)
+        BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19043.1320 (21H1/May2021Update)
         Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
-        .NET Core SDK=5.0.100
-          [Host]     : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
-          Job-OUUGWL : .NET Framework 4.8 (4.8.4250.0), X64 RyuJIT
-          Job-GAIITM : .NET Core 2.1.23 (CoreCLR 4.6.29321.03, CoreFX 4.6.29321.01), X64 RyuJIT
-          Job-HWOBSO : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
-
-        |                     Method |        Job |       Runtime |    TestImage |      Mean |      Error |    StdDev | Ratio | RatioSD |      Gen 0 |     Gen 1 |     Gen 2 |    Allocated |
-        |--------------------------- |----------- |-------------- |------------- |----------:|-----------:|----------:|------:|--------:|-----------:|----------:|----------:|-------------:|
-        |        'Magick Webp Lossy' | Job-RYVNHD |    .NET 4.7.2 | Png/Bike.png |  23.30 ms |   0.869 ms |  0.048 ms |  0.14 |    0.00 |          - |         - |         - |     68.19 KB |
-        |    'ImageSharp Webp Lossy' | Job-RYVNHD |    .NET 4.7.2 | Png/Bike.png |  68.22 ms |  16.454 ms |  0.902 ms |  0.42 |    0.01 |  6125.0000 |  125.0000 |         - |  26359.49 KB |
-        |     'Magick Webp Lossless' | Job-RYVNHD |    .NET 4.7.2 | Png/Bike.png | 161.96 ms |   9.879 ms |  0.541 ms |  1.00 |    0.00 |          - |         - |         - |    520.28 KB |
-        | 'ImageSharp Webp Lossless' | Job-RYVNHD |    .NET 4.7.2 | Png/Bike.png | 370.88 ms |  58.875 ms |  3.227 ms |  2.29 |    0.02 | 34000.0000 | 5000.0000 | 2000.0000 | 163177.15 KB |
-        |                            |            |               |              |           |            |           |       |         |            |           |           |              |
-        |        'Magick Webp Lossy' | Job-GOZXWU | .NET Core 2.1 | Png/Bike.png |  23.35 ms |   0.428 ms |  0.023 ms |  0.14 |    0.00 |          - |         - |         - |     67.76 KB |
-        |    'ImageSharp Webp Lossy' | Job-GOZXWU | .NET Core 2.1 | Png/Bike.png |  43.95 ms |   2.850 ms |  0.156 ms |  0.27 |    0.00 |  6250.0000 |  250.0000 |   83.3333 |  26284.72 KB |
-        |     'Magick Webp Lossless' | Job-GOZXWU | .NET Core 2.1 | Png/Bike.png | 161.44 ms |   3.749 ms |  0.206 ms |  1.00 |    0.00 |          - |         - |         - |    519.26 KB |
-        | 'ImageSharp Webp Lossless' | Job-GOZXWU | .NET Core 2.1 | Png/Bike.png | 335.78 ms |  78.666 ms |  4.312 ms |  2.08 |    0.03 | 34000.0000 | 5000.0000 | 2000.0000 | 162727.56 KB |
-        |                            |            |               |              |           |            |           |       |         |            |           |           |              |
-        |        'Magick Webp Lossy' | Job-VRDVKW | .NET Core 3.1 | Png/Bike.png |  23.48 ms |   4.325 ms |  0.237 ms |  0.15 |    0.00 |          - |         - |         - |     67.66 KB |
-        |    'ImageSharp Webp Lossy' | Job-VRDVKW | .NET Core 3.1 | Png/Bike.png |  43.29 ms |  16.503 ms |  0.905 ms |  0.27 |    0.01 |  6272.7273 |  272.7273 |   90.9091 |  26284.86 KB |
-        |     'Magick Webp Lossless' | Job-VRDVKW | .NET Core 3.1 | Png/Bike.png | 161.81 ms |  10.693 ms |  0.586 ms |  1.00 |    0.00 |          - |         - |         - |    523.25 KB |
-        | 'ImageSharp Webp Lossless' | Job-VRDVKW | .NET Core 3.1 | Png/Bike.png | 323.97 ms | 235.468 ms | 12.907 ms |  2.00 |    0.08 | 34000.0000 | 5000.0000 | 2000.0000 | 162724.84 KB |
-        |                            |            |               |              |           |            |           |       |         |            |           |           |              |
-        |        'Magick Webp Lossy' | Job-ZJRLRB | .NET Core 5.0 | Png/Bike.png |  23.36 ms |   0.448 ms |  0.025 ms |  0.14 |    0.00 |          - |         - |         - |     67.66 KB |
-        |    'ImageSharp Webp Lossy' | Job-ZJRLRB | .NET Core 5.0 | Png/Bike.png |  40.11 ms |   2.465 ms |  0.135 ms |  0.25 |    0.00 |  6307.6923 |  230.7692 |   76.9231 |  26284.71 KB |
-        |     'Magick Webp Lossless' | Job-ZJRLRB | .NET Core 5.0 | Png/Bike.png | 161.55 ms |   6.662 ms |  0.365 ms |  1.00 |    0.00 |          - |         - |         - |    518.84 KB |
-        | 'ImageSharp Webp Lossless' | Job-ZJRLRB | .NET Core 5.0 | Png/Bike.png | 298.73 ms |  17.953 ms |  0.984 ms |  1.85 |    0.01 | 34000.0000 | 5000.0000 | 2000.0000 | 162725.13 KB |
+        .NET SDK=6.0.100-rc.2.21505.57
+          [Host]     : .NET 5.0.11 (5.0.1121.47308), X64 RyuJIT
+          Job-WQLXJO : .NET 5.0.11 (5.0.1121.47308), X64 RyuJIT
+          Job-OJJAMD : .NET Core 3.1.20 (CoreCLR 4.700.21.47003, CoreFX 4.700.21.47101), X64 RyuJIT
+          Job-OMFOAS : .NET Framework 4.8 (4.8.4420.0), X64 RyuJIT
+
+        IterationCount=3  LaunchCount=1  WarmupCount=3
+
+        |                     Method |        Job |              Runtime |             Arguments |    TestImage |      Mean |     Error |   StdDev | Ratio | RatioSD |       Gen 0 |     Gen 1 |     Gen 2 |  Allocated |
+        |--------------------------- |----------- |--------------------- |---------------------- |------------- |----------:|----------:|---------:|------:|--------:|------------:|----------:|----------:|-----------:|
+        |        'Magick Webp Lossy' | Job-WQLXJO |             .NET 5.0 | /p:DebugType=portable | Png/Bike.png |  23.33 ms |  1.491 ms | 0.082 ms |  0.15 |    0.00 |           - |         - |         - |      67 KB |
+        |    'ImageSharp Webp Lossy' | Job-WQLXJO |             .NET 5.0 | /p:DebugType=portable | Png/Bike.png | 245.80 ms | 24.288 ms | 1.331 ms |  1.53 |    0.01 | 135000.0000 |         - |         - | 552,713 KB |
+        |     'Magick Webp Lossless' | Job-WQLXJO |             .NET 5.0 | /p:DebugType=portable | Png/Bike.png | 160.36 ms | 11.131 ms | 0.610 ms |  1.00 |    0.00 |           - |         - |         - |     518 KB |
+        | 'ImageSharp Webp Lossless' | Job-WQLXJO |             .NET 5.0 | /p:DebugType=portable | Png/Bike.png | 313.93 ms | 45.605 ms | 2.500 ms |  1.96 |    0.01 |  34000.0000 | 5000.0000 | 2000.0000 | 161,670 KB |
+        |                            |            |                      |                       |              |           |           |          |       |         |             |           |           |            |
+        |        'Magick Webp Lossy' | Job-OJJAMD |        .NET Core 3.1 |               Default | Png/Bike.png |  23.36 ms |  2.289 ms | 0.125 ms |  0.15 |    0.00 |           - |         - |         - |      67 KB |
+        |    'ImageSharp Webp Lossy' | Job-OJJAMD |        .NET Core 3.1 |               Default | Png/Bike.png | 254.64 ms | 19.620 ms | 1.075 ms |  1.59 |    0.00 | 135000.0000 |         - |         - | 552,713 KB |
+        |     'Magick Webp Lossless' | Job-OJJAMD |        .NET Core 3.1 |               Default | Png/Bike.png | 160.30 ms |  9.549 ms | 0.523 ms |  1.00 |    0.00 |           - |         - |         - |     518 KB |
+        | 'ImageSharp Webp Lossless' | Job-OJJAMD |        .NET Core 3.1 |               Default | Png/Bike.png | 320.35 ms | 22.924 ms | 1.257 ms |  2.00 |    0.01 |  34000.0000 | 5000.0000 | 2000.0000 | 161,669 KB |
+        |                            |            |                      |                       |              |           |           |          |       |         |             |           |           |            |
+        |        'Magick Webp Lossy' | Job-OMFOAS | .NET Framework 4.7.2 |               Default | Png/Bike.png |  23.37 ms |  0.908 ms | 0.050 ms |  0.15 |    0.00 |           - |         - |         - |      68 KB |
+        |    'ImageSharp Webp Lossy' | Job-OMFOAS | .NET Framework 4.7.2 |               Default | Png/Bike.png | 378.67 ms | 25.540 ms | 1.400 ms |  2.36 |    0.01 | 135000.0000 |         - |         - | 554,351 KB |
+        |     'Magick Webp Lossless' | Job-OMFOAS | .NET Framework 4.7.2 |               Default | Png/Bike.png | 160.13 ms |  5.115 ms | 0.280 ms |  1.00 |    0.00 |           - |         - |         - |     520 KB |
+        | 'ImageSharp Webp Lossless' | Job-OMFOAS | .NET Framework 4.7.2 |               Default | Png/Bike.png | 379.01 ms | 71.192 ms | 3.902 ms |  2.37 |    0.02 |  34000.0000 | 5000.0000 | 2000.0000 | 162,119 KB |
         */
     }
 }

From b9e8f76990206843b485006bac8b9ff2cceb05ed Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Sun, 7 Nov 2021 18:07:43 +1100
Subject: [PATCH 43/85] Update FromPixel

---
 src/ImageSharp/Color/Color.Conversions.cs | 11 +++++++++++
 src/ImageSharp/Color/Color.cs             | 22 +++++++++++++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/ImageSharp/Color/Color.Conversions.cs b/src/ImageSharp/Color/Color.Conversions.cs
index 424b7dcdfe..96aa05c961 100644
--- a/src/ImageSharp/Color/Color.Conversions.cs
+++ b/src/ImageSharp/Color/Color.Conversions.cs
@@ -23,6 +23,17 @@ public Color(Rgba64 pixel)
             this.boxedHighPrecisionPixel = null;
         }
 
+        /// <summary>
+        /// Initializes a new instance of the <see cref="Color"/> struct.
+        /// </summary>
+        /// <param name="pixel">The <see cref="Rgb48"/> containing the color information.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public Color(Rgb48 pixel)
+        {
+            this.data = new Rgba64(pixel.R, pixel.G, pixel.B, ushort.MaxValue);
+            this.boxedHighPrecisionPixel = null;
+        }
+
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
diff --git a/src/ImageSharp/Color/Color.cs b/src/ImageSharp/Color/Color.cs
index 61d6c8e6d5..c461d034eb 100644
--- a/src/ImageSharp/Color/Color.cs
+++ b/src/ImageSharp/Color/Color.cs
@@ -107,7 +107,27 @@ private Color(IPixel pixel)
         [MethodImpl(InliningOptions.ShortMethod)]
         public static Color FromPixel<TPixel>(TPixel pixel)
             where TPixel : unmanaged, IPixel<TPixel>
-            => new(pixel);
+        {
+            // Avoid boxing in case we can convert to Rgba64 safely and efficiently
+            if (typeof(TPixel) == typeof(Rgba64))
+            {
+                return new((Rgba64)(object)pixel);
+            }
+            else if (typeof(TPixel) == typeof(Rgb48))
+            {
+                return new((Rgb48)(object)pixel);
+            }
+            else if (Unsafe.SizeOf<TPixel>() <= Unsafe.SizeOf<Rgba32>())
+            {
+                Rgba32 p = default;
+                pixel.ToRgba32(ref p);
+                return new(p);
+            }
+            else
+            {
+                return new(pixel);
+            }
+        }
 
         /// <summary>
         /// Creates a new instance of the <see cref="Color"/> struct

From 5b1720eb8deccd3ea37248111a68df73ce632c3a Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 13:27:08 +0100
Subject: [PATCH 44/85] Add sse41 version of Hadamard transform

---
 .../Formats/Webp/Lossy/LossyUtils.cs          | 151 +++++++++++++++++-
 1 file changed, 146 insertions(+), 5 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 04ff80b2d9..0993e2a666 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -4,11 +4,15 @@
 using System;
 using System.Buffers.Binary;
 using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
-    internal static class LossyUtils
+    internal static unsafe class LossyUtils
     {
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
@@ -61,11 +65,12 @@ public static void Copy(Span<byte> src, Span<byte> dst, int w, int h)
         public static int Vp8Disto16X16(Span<byte> a, Span<byte> b, Span<ushort> w)
         {
             int d = 0;
+            int dataSize = (4 * WebpConstants.Bps) - 16;
             for (int y = 0; y < 16 * WebpConstants.Bps; y += 4 * WebpConstants.Bps)
             {
                 for (int x = 0; x < 16; x += 4)
                 {
-                    d += Vp8Disto4X4(a.Slice(x + y), b.Slice(x + y), w);
+                    d += Vp8Disto4X4(a.Slice(x + y, dataSize), b.Slice(x + y, dataSize), w);
                 }
             }
 
@@ -75,9 +80,19 @@ public static int Vp8Disto16X16(Span<byte> a, Span<byte> b, Span<ushort> w)
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int Vp8Disto4X4(Span<byte> a, Span<byte> b, Span<ushort> w)
         {
-            int sum1 = TTransform(a, w);
-            int sum2 = TTransform(b, w);
-            return Math.Abs(sum2 - sum1) >> 5;
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse41.IsSupported)
+            {
+                int diffSum = TTransformSse41(a, b, w);
+                return Math.Abs(diffSum) >> 5;
+            }
+            else
+#endif
+            {
+                int sum1 = TTransform(a, w);
+                int sum2 = TTransform(b, w);
+                return Math.Abs(sum2 - sum1) >> 5;
+            }
         }
 
         public static void DC16(Span<byte> dst, Span<byte> yuv, int offset)
@@ -591,6 +606,132 @@ public static int TTransform(Span<byte> input, Span<ushort> w)
             return sum;
         }
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        /// <summary>
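+        /// Hadamard transform of two 4x4 blocks (inputA and inputB) computed in parallel.
+        /// Returns the difference between the weighted sums of the absolute values of the transformed coefficients of inputA and inputB.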
+        /// w[] contains a row-major 4 by 4 symmetric matrix.
+        /// </summary>
+        public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w)
+        {
+            Span<int> sum = stackalloc int[4];
+#pragma warning disable SA1503 // Braces should not be omitted
+            fixed (byte* inputAPtr = inputA)
+            fixed (byte* inputBPtr = inputB)
+            fixed (ushort* wPtr = w)
+            fixed (int* outputPtr = sum)
+            {
+                // Load and combine inputs.
+                Vector128<byte> ina0 = Sse2.LoadVector128(inputAPtr);
+                Vector128<byte> ina1 = Sse2.LoadVector128(inputAPtr + (WebpConstants.Bps * 1));
+                Vector128<byte> ina2 = Sse2.LoadVector128(inputAPtr + (WebpConstants.Bps * 2));
+                Vector128<long> ina3 = Sse2.LoadVector128((long*)(inputAPtr + (WebpConstants.Bps * 3)));
+                Vector128<byte> inb0 = Sse2.LoadVector128(inputBPtr);
+                Vector128<byte> inb1 = Sse2.LoadVector128(inputBPtr + (WebpConstants.Bps * 1));
+                Vector128<byte> inb2 = Sse2.LoadVector128(inputBPtr + (WebpConstants.Bps * 2));
+                Vector128<long> inb3 = Sse2.LoadVector128((long*)(inputBPtr + (WebpConstants.Bps * 3)));
+
+                // Combine inA and inB (we'll do two transforms in parallel).
+                Vector128<int> inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
+                Vector128<int> inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
+                Vector128<int> inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
+                Vector128<int> inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
+                Vector128<short> tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte());
+                Vector128<short> tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte());
+                Vector128<short> tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte());
+                Vector128<short> tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte());
+
+                // a00 a01 a02 a03   b00 b01 b02 b03
+                // a10 a11 a12 a13   b10 b11 b12 b13
+                // a20 a21 a22 a23   b20 b21 b22 b23
+                // a30 a31 a32 a33   b30 b31 b32 b33
+                // Vertical pass first to avoid a transpose (vertical and horizontal passes
+                // are commutative because w/kWeightY is symmetric) and subsequent transpose.
+                // Calculate a and b (two 4x4 at once).
+                Vector128<short> a0 = Sse2.Add(tmp0, tmp2);
+                Vector128<short> a1 = Sse2.Add(tmp1, tmp3);
+                Vector128<short> a2 = Sse2.Subtract(tmp1, tmp3);
+                Vector128<short> a3 = Sse2.Subtract(tmp0, tmp2);
+                Vector128<short> b0 = Sse2.Add(a0, a1);
+                Vector128<short> b1 = Sse2.Add(a3, a2);
+                Vector128<short> b2 = Sse2.Subtract(a3, a2);
+                Vector128<short> b3 = Sse2.Subtract(a0, a1);
+
+                // a00 a01 a02 a03   b00 b01 b02 b03
+                // a10 a11 a12 a13   b10 b11 b12 b13
+                // a20 a21 a22 a23   b20 b21 b22 b23
+                // a30 a31 a32 a33   b30 b31 b32 b33
+                // Transpose the two 4x4.
+                Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
+                Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
+                Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
+                Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);
+
+                // a00 a10 a01 a11   a02 a12 a03 a13
+                // a20 a30 a21 a31   a22 a32 a23 a33
+                // b00 b10 b01 b11   b02 b12 b03 b13
+                // b20 b30 b21 b31   b22 b32 b23 b33
+                Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
+                Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
+                Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
+                Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
+
+                // a00 a10 a20 a30 a01 a11 a21 a31
+                // b00 b10 b20 b30 b01 b11 b21 b31
+                // a02 a12 a22 a32 a03 a13 a23 a33
+                // b02 b12 b22 b32 b03 b13 b23 b33
+                Vector128<long> output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
+                Vector128<long> output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
+                Vector128<long> output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
+                Vector128<long> output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
+
+                // a00 a10 a20 a30   b00 b10 b20 b30
+                // a01 a11 a21 a31   b01 b11 b21 b31
+                // a02 a12 a22 a32   b02 b12 b22 b32
+                // a03 a13 a23 a33   b03 b13 b23 b33
+                // Horizontal pass and difference of weighted sums.
+                Vector128<ushort> w0 = Sse2.LoadVector128(wPtr);
+                Vector128<ushort> w8 = Sse2.LoadVector128(wPtr + 8);
+
+                // Calculate a and b (two 4x4 at once).
+                a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16());
+                a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16());
+                a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16());
+                a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16());
+                b0 = Sse2.Add(a0, a1);
+                b1 = Sse2.Add(a3, a2);
+                b2 = Sse2.Subtract(a3, a2);
+                b3 = Sse2.Subtract(a0, a1);
+
+                // Separate the transforms of inA and inB.
+                Vector128<long> ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64());
+                Vector128<long> ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64());
+                Vector128<long> bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64());
+                Vector128<long> bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64());
+
+                Vector128<ushort> ab0Abs = Ssse3.Abs(ab0.AsInt16());
+                Vector128<ushort> ab2Abs = Ssse3.Abs(ab2.AsInt16());
+                Vector128<ushort> b0Abs = Ssse3.Abs(bb0.AsInt16());
+                Vector128<ushort> bb2Abs = Ssse3.Abs(bb2.AsInt16());
+
+                // weighted sums.
+                Vector128<int> ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16());
+                Vector128<int> ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16());
+                Vector128<int> b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16());
+                Vector128<int> bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16());
+                Vector128<int> ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8);
+                Vector128<int> b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8);
+
+                // difference of weighted sums.
+                Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
+                Sse2.Store(outputPtr, result.AsInt32());
+            }
+
+            return sum[3] + sum[2] + sum[1] + sum[0];
+#pragma warning restore SA1503 // Braces should not be omitted
+        }
+#endif
+
         public static void TransformTwo(Span<short> src, Span<byte> dst)
         {
             TransformOne(src, dst);

From d2017933d7042d3757062cfe3134206652ce7b27 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 13:31:11 +0100
Subject: [PATCH 45/85] Add HadamardTransform sse tests

---
 .../Formats/WebP/LossyUtilsTests.cs           | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
new file mode 100644
index 0000000000..6a9a078d7c
--- /dev/null
+++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
@@ -0,0 +1,58 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using SixLabors.ImageSharp.Formats.Webp.Lossy;
+using SixLabors.ImageSharp.Tests.TestUtilities;
+using Xunit;
+
+namespace SixLabors.ImageSharp.Tests.Formats.WebP
+{
+    [Trait("Format", "Webp")]
+    public class LossyUtilsTests
+    {
+        private static void RunHadamardTransformTest()
+        {
+            byte[] a =
+            {
+                27, 27, 28, 29, 29, 28, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129,
+                129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 29, 29, 28,
+                28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 26,
+                26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128,
+                128, 128, 128, 128, 128, 128, 128, 28, 27, 27, 26, 26, 27, 27, 28, 27, 28, 28, 29, 29, 28, 28, 27
+            };
+
+            byte[] b =
+            {
+                28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204,
+                204, 204, 204, 204, 204, 204, 204, 204, 204, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+                28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 28, 28, 28,
+                28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+                204, 204, 204, 204, 204, 204, 204, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28
+            };
+
+            ushort[] w = { 38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2 };
+            int expected = 2;
+
+            int actual = LossyUtils.Vp8Disto4X4(a, b, w);
+            Assert.Equal(expected, actual);
+        }
+
+        [Fact]
+        public void HadamardTransform_Works() => RunHadamardTransformTest();
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Fact]
+        public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void HadamardTransform_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableSSE2);
+
+        [Fact]
+        public void HadamardTransform_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableSSE41);
+
+        [Fact]
+        public void HadamardTransform_WithoutSSE2AndSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableSSE41 | HwIntrinsics.DisableSSE2);
+#endif
+
+    }
+}

From 3a03fad75eaa8464d1bd84cccd307014f9417497 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 14:51:51 +0100
Subject: [PATCH 46/85] Add sse41 version of quantize block

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 176 ++++++++++++++----
 1 file changed, 144 insertions(+), 32 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 2ed4381660..02087ceda4 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -3,13 +3,17 @@
 
 using System;
 using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
     /// <summary>
     /// Quantization methods.
     /// </summary>
-    internal static class QuantEnc
+    internal static unsafe class QuantEnc
     {
         private static readonly byte[] Zigzag = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };
 
@@ -17,6 +21,18 @@ internal static class QuantEnc
 
         private const int MaxLevel = 2047;
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector128<short> MaxCoeff2047 = Vector128.Create((short)MaxLevel);
+
+        private static readonly Vector128<byte> CstLo = Vector128.Create(0, 1, 2, 3, 8, 9, 254, 255, 10, 11, 4, 5, 6, 7, 12, 13);
+
+        private static readonly Vector128<byte> Cst7 = Vector128.Create(254, 255, 254, 255, 254, 255, 254, 255, 14, 15, 254, 255, 254, 255, 254, 255);
+
+        private static readonly Vector128<byte> CstHi = Vector128.Create(2, 3, 8, 9, 10, 11, 4, 5, 254, 255, 6, 7, 12, 13, 14, 15);
+
+        private static readonly Vector128<byte> Cst8 = Vector128.Create(254, 255, 254, 255, 254, 255, 0, 1, 254, 255, 254, 255, 254, 255, 254, 255);
+#endif
+
         // Diffusion weights. We under-correct a bit (15/16th of the error is actually
         // diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
         private const int C1 = 7;    // fraction of error sent to the 4x4 block below
@@ -486,51 +502,147 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int Quantize2Blocks(Span<short> input, Span<short> output, Vp8Matrix mtx)
         {
-            int nz = QuantizeBlock(input, output, mtx) << 0;
-            nz |= QuantizeBlock(input.Slice(1 * 16), output.Slice(1 * 16), mtx) << 1;
+            int nz = QuantizeBlock(input.Slice(0, 16), output.Slice(0, 16), mtx) << 0;
+            nz |= QuantizeBlock(input.Slice(1 * 16, 16), output.Slice(1 * 16, 16), mtx) << 1;
             return nz;
         }
 
         public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix mtx)
         {
-            int last = -1;
-            int n;
-            for (n = 0; n < 16; ++n)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse41.IsSupported)
             {
-                int j = Zigzag[n];
-                bool sign = input[j] < 0;
-                uint coeff = (uint)((sign ? -input[j] : input[j]) + mtx.Sharpen[j]);
-                if (coeff > mtx.ZThresh[j])
+#pragma warning disable SA1503 // Braces should not be omitted
+                fixed (ushort* mtxIqPtr = mtx.IQ)
+                fixed (ushort* mtxQPtr = mtx.Q)
+                fixed (uint* biasQPtr = mtx.Bias)
+                fixed (short* inputPtr = input)
+                fixed (short* outputPtr = output)
                 {
-                    uint q = mtx.Q[j];
-                    uint iQ = mtx.IQ[j];
-                    uint b = mtx.Bias[j];
-                    int level = QuantDiv(coeff, iQ, b);
-                    if (level > MaxLevel)
+                    // Load all inputs.
+                    Vector128<short> input0 = Sse2.LoadVector128(inputPtr);
+                    Vector128<short> input8 = Sse2.LoadVector128(inputPtr + 8);
+                    Vector128<ushort> iq0 = Sse2.LoadVector128(mtxIqPtr);
+                    Vector128<ushort> iq8 = Sse2.LoadVector128(mtxIqPtr + 8);
+                    Vector128<ushort> q0 = Sse2.LoadVector128(mtxQPtr);
+                    Vector128<ushort> q8 = Sse2.LoadVector128(mtxQPtr + 8);
+
+                    // coeff = abs(in)
+                    Vector128<ushort> coeff0 = Ssse3.Abs(input0);
+                    Vector128<ushort> coeff8 = Ssse3.Abs(input8);
+
+                    // out = (coeff * iQ + B) >> QFIX
+                    // doing calculations with 32b precision (QFIX=17)
+                    // out = (coeff * iQ)
+                    Vector128<ushort> coeffiQ0H = Sse2.MultiplyHigh(coeff0, iq0);
+                    Vector128<ushort> coeffiQ0L = Sse2.MultiplyLow(coeff0, iq0);
+                    Vector128<ushort> coeffiQ8H = Sse2.MultiplyHigh(coeff8, iq8);
+                    Vector128<ushort> coeffiQ8L = Sse2.MultiplyLow(coeff8, iq8);
+                    Vector128<ushort> out00 = Sse2.UnpackLow(coeffiQ0L, coeffiQ0H);
+                    Vector128<ushort> out04 = Sse2.UnpackHigh(coeffiQ0L, coeffiQ0H);
+                    Vector128<ushort> out08 = Sse2.UnpackLow(coeffiQ8L, coeffiQ8H);
+                    Vector128<ushort> out12 = Sse2.UnpackHigh(coeffiQ8L, coeffiQ8H);
+
+                    // out = (coeff * iQ + B)
+                    Vector128<uint> bias00 = Sse2.LoadVector128(biasQPtr);
+                    Vector128<uint> bias04 = Sse2.LoadVector128(biasQPtr + 4);
+                    Vector128<uint> bias08 = Sse2.LoadVector128(biasQPtr + 8);
+                    Vector128<uint> bias12 = Sse2.LoadVector128(biasQPtr + 12);
+                    out00 = Sse2.Add(out00.AsInt32(), bias00.AsInt32()).AsUInt16();
+                    out04 = Sse2.Add(out04.AsInt32(), bias04.AsInt32()).AsUInt16();
+                    out08 = Sse2.Add(out08.AsInt32(), bias08.AsInt32()).AsUInt16();
+                    out12 = Sse2.Add(out12.AsInt32(), bias12.AsInt32()).AsUInt16();
+
+                    // out = QUANTDIV(coeff, iQ, B, QFIX)
+                    out00 = Sse2.ShiftRightArithmetic(out00.AsInt32(), WebpConstants.QFix).AsUInt16();
+                    out04 = Sse2.ShiftRightArithmetic(out04.AsInt32(), WebpConstants.QFix).AsUInt16();
+                    out08 = Sse2.ShiftRightArithmetic(out08.AsInt32(), WebpConstants.QFix).AsUInt16();
+                    out12 = Sse2.ShiftRightArithmetic(out12.AsInt32(), WebpConstants.QFix).AsUInt16();
+
+                    // pack result as 16b
+                    Vector128<short> out0 = Sse2.PackSignedSaturate(out00.AsInt32(), out04.AsInt32());
+                    Vector128<short> out8 = Sse2.PackSignedSaturate(out08.AsInt32(), out12.AsInt32());
+
+                    // if (coeff > 2047) coeff = 2047
+                    out0 = Sse2.Min(out0, MaxCoeff2047);
+                    out8 = Sse2.Min(out8, MaxCoeff2047);
+
+                    // put sign back
+                    out0 = Ssse3.Sign(out0, input0);
+                    out8 = Ssse3.Sign(out8, input8);
+
+                    // in = out * Q
+                    input0 = Sse2.MultiplyLow(out0, q0.AsInt16());
+                    input8 = Sse2.MultiplyLow(out8, q8.AsInt16());
+
+                    // in = out * Q
+                    Sse2.Store(inputPtr, input0);
+                    Sse2.Store(inputPtr + 8, input8);
+
+                    // zigzag the output before storing it. The re-ordering is:
+                    //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
+                    // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
+                    // There's only two misplaced entries ([8] and [7]) that are crossing the
+                    // reg's boundaries.
+                    // We use pshufb instead of pshuflo/pshufhi.
+                    Vector128<byte> tmpLo = Ssse3.Shuffle(out0.AsByte(), CstLo);
+                    Vector128<byte> tmp7 = Ssse3.Shuffle(out0.AsByte(), Cst7);  // extract #7
+                    Vector128<byte> tmpHi = Ssse3.Shuffle(out8.AsByte(), CstHi);
+                    Vector128<byte> tmp8 = Ssse3.Shuffle(out8.AsByte(), Cst8);  // extract #8
+                    Vector128<byte> outZ0 = Sse2.Or(tmpLo, tmp8);
+                    Vector128<byte> outZ8 = Sse2.Or(tmpHi, tmp7);
+                    Sse2.Store(outputPtr, outZ0.AsInt16());
+                    Sse2.Store(outputPtr + 8, outZ8.AsInt16());
+                    Vector128<sbyte> packedOutput = Sse2.PackSignedSaturate(outZ0.AsInt16(), outZ8.AsInt16());
+
+                    // Detect if all 'out' values are zeroes or not.
+                    Vector128<sbyte> cmpeq = Sse2.CompareEqual(packedOutput, Vector128<sbyte>.Zero);
+                    return Sse2.MoveMask(cmpeq) != 0xffff ? 1 : 0;
+                }
+#pragma warning restore SA1503 // Braces should not be omitted
+            }
+            else
+#endif
+            {
+                int last = -1;
+                int n;
+                for (n = 0; n < 16; ++n)
+                {
+                    int j = Zigzag[n];
+                    bool sign = input[j] < 0;
+                    uint coeff = (uint)((sign ? -input[j] : input[j]) + mtx.Sharpen[j]);
+                    if (coeff > mtx.ZThresh[j])
                     {
-                        level = MaxLevel;
-                    }
+                        uint q = mtx.Q[j];
+                        uint iQ = mtx.IQ[j];
+                        uint b = mtx.Bias[j];
+                        int level = QuantDiv(coeff, iQ, b);
+                        if (level > MaxLevel)
+                        {
+                            level = MaxLevel;
+                        }
 
-                    if (sign)
-                    {
-                        level = -level;
-                    }
+                        if (sign)
+                        {
+                            level = -level;
+                        }
 
-                    input[j] = (short)(level * (int)q);
-                    output[n] = (short)level;
-                    if (level != 0)
+                        input[j] = (short)(level * (int)q);
+                        output[n] = (short)level;
+                        if (level != 0)
+                        {
+                            last = n;
+                        }
+                    }
+                    else
                     {
-                        last = n;
+                        output[n] = 0;
+                        input[j] = 0;
                     }
                 }
-                else
-                {
-                    output[n] = 0;
-                    input[j] = 0;
-                }
-            }
 
-            return last >= 0 ? 1 : 0;
+                return last >= 0 ? 1 : 0;
+            }
         }
 
         // Quantize as usual, but also compute and return the quantization error.

From 020134ad8c15e58621635d4ca4b5fb4c6acdbe89 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 14:52:11 +0100
Subject: [PATCH 47/85] Add QuantizeBlock sse tests

---
 .../Formats/Webp/Lossy/Vp8Matrix.cs           |  9 +++
 .../Formats/WebP/QuantEncTests.cs             | 56 +++++++++++++++++++
 2 files changed, 65 insertions(+)
 create mode 100644 tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs
index 4276b887f0..e525e388b8 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs
@@ -34,6 +34,15 @@ public Vp8Matrix()
             this.Sharpen = new short[16];
         }
 
+        public Vp8Matrix(ushort[] q, ushort[] iq, uint[] bias, uint[] zThresh, short[] sharpen)
+        {
+            this.Q = q;
+            this.IQ = iq;
+            this.Bias = bias;
+            this.ZThresh = zThresh;
+            this.Sharpen = sharpen;
+        }
+
         /// <summary>
         /// Gets the quantizer steps.
         /// </summary>
diff --git a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
new file mode 100644
index 0000000000..280a7902ae
--- /dev/null
+++ b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
@@ -0,0 +1,56 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System.Linq;
+using SixLabors.ImageSharp.Formats.Webp.Lossy;
+using SixLabors.ImageSharp.Tests.TestUtilities;
+using Xunit;
+
+namespace SixLabors.ImageSharp.Tests.Formats.WebP
+{
+    [Trait("Format", "Webp")]
+    public class QuantEncTests
+    {
+        private static void RunQuantizeBlockTest()
+        {
+            // arrange: one 4x4 block of coefficients plus a quantization matrix with an all-zero sharpen table.
+            short[] input = { 378, 777, -851, 888, 259, 148, 0, -111, -185, -185, -74, -37, 148, 74, 111, 74 };
+            short[] output = new short[16];
+            ushort[] q = { 42, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37 };
+            ushort[] iq = { 3120, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542 };
+            uint[] bias =
+            {
+                49152, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296,
+                55296, 55296
+            };
+            uint[] zthresh = { 26, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 };
+            short[] expectedOutput = { 9, 21, 7, -5, 4, -23, 24, 0, -5, 4, 2, -2, -3, -1, 3, 2 };
+            int expectedResult = 1;
+            var vp8Matrix = new Vp8Matrix(q, iq, bias, zthresh, new short[16]);
+
+            // act
+            int actualResult = QuantEnc.QuantizeBlock(input, output, vp8Matrix);
+
+            // assert: Assert.Equal on the arrays reports the first differing element on failure.
+            Assert.Equal(expectedOutput, output);
+            Assert.Equal(expectedResult, actualResult);
+        }
+
+        [Fact]
+        public void QuantizeBlock_Works() => RunQuantizeBlockTest();
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Fact]
+        public void QuantizeBlock_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void QuantizeBlock_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSE2);
+
+        [Fact]
+        public void QuantizeBlock_WithoutSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSSE3);
+
+        [Fact]
+        public void QuantizeBlock_WithoutSSE2AndSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableSSSE3);
+#endif
+    }
+}

From a628909b8da58e9dbd10bfa3b70e9c8ce66ddc1d Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 15:02:08 +0100
Subject: [PATCH 48/85] Add coeff = abs(in) + sharpen

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 02087ceda4..b812909b20 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -516,6 +516,7 @@ public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix
                 fixed (ushort* mtxIqPtr = mtx.IQ)
                 fixed (ushort* mtxQPtr = mtx.Q)
                 fixed (uint* biasQPtr = mtx.Bias)
+                fixed (short* sharpenPtr = mtx.Sharpen)
                 fixed (short* inputPtr = input)
                 fixed (short* outputPtr = output)
                 {
@@ -531,6 +532,12 @@ public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix
                     Vector128<ushort> coeff0 = Ssse3.Abs(input0);
                     Vector128<ushort> coeff8 = Ssse3.Abs(input8);
 
+                    // coeff = abs(in) + sharpen (Sse2.Add returns the sum, it does not modify its operands)
+                    Vector128<short> sharpen0 = Sse2.LoadVector128(sharpenPtr);
+                    Vector128<short> sharpen8 = Sse2.LoadVector128(sharpenPtr + 8);
+                    coeff0 = Sse2.Add(coeff0.AsInt16(), sharpen0).AsUInt16();
+                    coeff8 = Sse2.Add(coeff8.AsInt16(), sharpen8).AsUInt16();
+
                     // out = (coeff * iQ + B) >> QFIX
                     // doing calculations with 32b precision (QFIX=17)
                     // out = (coeff * iQ)

From af90336173a1ee20a6c894c113e5f799b139bf9f Mon Sep 17 00:00:00 2001
From: Anton Firszov <antonfir@gmail.com>
Date: Sun, 7 Nov 2021 15:25:47 +0100
Subject: [PATCH 49/85] stackalloc header buffer in InternalDetectFormat

---
 src/ImageSharp/Image.Decode.cs | 51 +++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/src/ImageSharp/Image.Decode.cs b/src/ImageSharp/Image.Decode.cs
index 94da2c9958..ee340bf86e 100644
--- a/src/ImageSharp/Image.Decode.cs
+++ b/src/ImageSharp/Image.Decode.cs
@@ -58,31 +58,42 @@ private static IImageFormat InternalDetectFormat(Stream stream, Configuration co
                 return null;
             }
 
-            using (IMemoryOwner<byte> buffer = config.MemoryAllocator.Allocate<byte>(headerSize, AllocationOptions.Clean))
+            // Header sizes are so small that headersBuffer will always be stackalloc-ed in practice
+            // and heap allocation will never happen, so there is no need for the usual try-finally ArrayPool dance.
+            // The array case is only a safety mechanism following stackalloc best practices.
+            Span<byte> headersBuffer = headerSize > 512 ? new byte[headerSize] : stackalloc byte[headerSize];
+            long startPosition = stream.Position;
+
+            // Read doesn't always guarantee the full requested length, so keep reading
+            // until we get either our count or hit the end of the stream.
+            int n = 0;
+            int i;
+            do
             {
-                Span<byte> bufferSpan = buffer.GetSpan();
-                long startPosition = stream.Position;
+                i = stream.Read(headersBuffer, n, headerSize - n);
+                n += i;
+            }
+            while (n < headerSize && i > 0);
 
-                // Read doesn't always guarantee the full returned length so read a byte
-                // at a time until we get either our count or hit the end of the stream.
-                int n = 0;
-                int i;
-                do
+            stream.Position = startPosition;
+
+            // Does the given stream contain enough data to fit in the header for the format
+            // and does that data match the format specification?
+            // Individual formats should still check since they are public.
+            IImageFormat format = null;
+            foreach (IImageFormatDetector formatDetector in config.ImageFormatsManager.FormatDetectors)
+            {
+                if (formatDetector.HeaderSize <= headerSize)
                 {
-                    i = stream.Read(bufferSpan, n, headerSize - n);
-                    n += i;
+                    IImageFormat attemptFormat = formatDetector.DetectFormat(headersBuffer);
+                    if (attemptFormat != null)
+                    {
+                        format = attemptFormat;
+                    }
                 }
-                while (n < headerSize && i > 0);
-
-                stream.Position = startPosition;
-
-                // Does the given stream contain enough data to fit in the header for the format
-                // and does that data match the format specification?
-                // Individual formats should still check since they are public.
-                return config.ImageFormatsManager.FormatDetectors
-                    .Where(x => x.HeaderSize <= headerSize)
-                    .Select(x => x.DetectFormat(buffer.GetSpan())).LastOrDefault(x => x != null);
             }
+
+            return format;
         }
 
         /// <summary>

From 765f5a23138ce905056a2e7f69f4a3c0feaf4842 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 16:13:28 +0100
Subject: [PATCH 50/85] Add SSE2 version of Mean16x4

---
 .../Formats/Webp/Lossy/Vp8EncIterator.cs      | 73 ++++++++++++++++---
 1 file changed, 61 insertions(+), 12 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
index 79fd8d8543..489977cb82 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
@@ -2,6 +2,10 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
@@ -9,7 +13,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
     /// Iterator structure to iterate through macroblocks, pointing to the
     /// right neighbouring data (samples, predictions, contexts, ...)
     /// </summary>
-    internal class Vp8EncIterator
+    internal unsafe class Vp8EncIterator
     {
         public const int YOffEnc = 0;
 
@@ -29,6 +33,10 @@ internal class Vp8EncIterator
 
         private readonly int mbh;
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create(0x00ff).AsByte();
+#endif
+
         /// <summary>
         /// Stride of the prediction plane(=4*mbw + 1).
         /// </summary>
@@ -357,12 +365,13 @@ public int FastMbAnalyze(int quality)
             int q = quality;
             int kThreshold = 8 + ((17 - 8) * q / 100);
             int k;
-            uint[] dc = new uint[16];
+            Span<uint> dc = stackalloc uint[16];
+            Span<ushort> tmp = stackalloc ushort[16];
             uint m;
             uint m2;
             for (k = 0; k < 16; k += 4)
             {
-                this.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.AsSpan(k));
+                this.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4), tmp);
             }
 
             for (m = 0, m2 = 0, k = 0; k < 16; ++k)
@@ -823,21 +832,61 @@ public void BytesToNz()
             this.Nz[this.nzIdx] = nz;
         }
 
-        private void Mean16x4(Span<byte> input, Span<uint> dc)
+        private void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp)
         {
-            for (int k = 0; k < 4; k++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
             {
-                uint avg = 0;
-                for (int y = 0; y < 4; y++)
+#pragma warning disable SA1503 // Braces should not be omitted
+                tmp.Clear();
+                fixed (byte* inputPtr = input)
+                fixed (ushort* tmpPtr = tmp)
                 {
-                    for (int x = 0; x < 4; x++)
+                    Vector128<byte> a0 = Sse2.LoadVector128(inputPtr);
+                    Vector128<byte> a1 = Sse2.LoadVector128(inputPtr + WebpConstants.Bps);
+                    Vector128<byte> a2 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 2));
+                    Vector128<byte> a3 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 3));
+                    Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte
+                    Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8);
+                    Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8);
+                    Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8);
+                    Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte
+                    Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask);
+                    Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask);
+                    Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask);
+                    Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32());
+                    Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32());
+                    Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32());
+                    Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32());
+                    Vector128<int> e0 = Sse2.Add(d0, d1);
+                    Vector128<int> e1 = Sse2.Add(d2, d3);
+                    Vector128<int> f0 = Sse2.Add(e0, e1);
+                    Sse2.Store(tmpPtr, f0.AsUInt16());
+                }
+#pragma warning restore SA1503 // Braces should not be omitted
+
+                dc[0] = (uint)(tmp[1] + tmp[0]);
+                dc[1] = (uint)(tmp[3] + tmp[2]);
+                dc[2] = (uint)(tmp[5] + tmp[4]);
+                dc[3] = (uint)(tmp[7] + tmp[6]);
+            }
+            else
+#endif
+            {
+                for (int k = 0; k < 4; k++)
+                {
+                    uint avg = 0;
+                    for (int y = 0; y < 4; y++)
                     {
-                        avg += input[x + (y * WebpConstants.Bps)];
+                        for (int x = 0; x < 4; x++)
+                        {
+                            avg += input[x + (y * WebpConstants.Bps)];
+                        }
                     }
-                }
 
-                dc[k] = avg;
-                input = input.Slice(4);   // go to next 4x4 block.
+                    dc[k] = avg;
+                    input = input.Slice(4); // go to next 4x4 block.
+                }
             }
         }
 

From 8b8871b3ba75581ee2ff5f3fcb294bd640743136 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 16:39:42 +0100
Subject: [PATCH 51/85] Make Mean16x4 static and move to LossyUtils

---
 .../Formats/Webp/Lossy/LossyUtils.cs          | 68 +++++++++++++++++-
 .../Formats/Webp/Lossy/Vp8EncIterator.cs      | 72 +------------------
 2 files changed, 70 insertions(+), 70 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index d5db3dffa5..c3f6e522ac 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -4,12 +4,20 @@
 using System;
 using System.Buffers.Binary;
 using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
-    internal static class LossyUtils
+    internal static unsafe class LossyUtils
     {
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte(); // 0x00ff in every 16-bit lane: keeps the low byte of each byte pair
+#endif
+
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
 
@@ -801,6 +809,64 @@ public static void HFilter8i(Span<byte> u, Span<byte> v, int offset, int stride,
             FilterLoop24(v, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh);
         }
 
+        public static void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
+            {
+#pragma warning disable SA1503 // Braces should not be omitted
+                tmp.Clear();
+                fixed (byte* inputPtr = input)
+                fixed (ushort* tmpPtr = tmp)
+                {
+                    Vector128<byte> a0 = Sse2.LoadVector128(inputPtr);
+                    Vector128<byte> a1 = Sse2.LoadVector128(inputPtr + WebpConstants.Bps);
+                    Vector128<byte> a2 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 2));
+                    Vector128<byte> a3 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 3));
+                    Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte
+                    Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8);
+                    Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8);
+                    Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8);
+                    Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte
+                    Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask);
+                    Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask);
+                    Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask);
+                    Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32());
+                    Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32());
+                    Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32());
+                    Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32());
+                    Vector128<int> e0 = Sse2.Add(d0, d1);
+                    Vector128<int> e1 = Sse2.Add(d2, d3);
+                    Vector128<int> f0 = Sse2.Add(e0, e1);
+                    Sse2.Store(tmpPtr, f0.AsUInt16());
+                }
+#pragma warning restore SA1503 // Braces should not be omitted
+
+                dc[0] = (uint)(tmp[1] + tmp[0]);
+                dc[1] = (uint)(tmp[3] + tmp[2]);
+                dc[2] = (uint)(tmp[5] + tmp[4]);
+                dc[3] = (uint)(tmp[7] + tmp[6]);
+            }
+            else
+#endif
+            {
+                for (int k = 0; k < 4; k++)
+                {
+                    uint avg = 0;
+                    for (int y = 0; y < 4; y++)
+                    {
+                        for (int x = 0; x < 4; x++)
+                        {
+                            avg += input[x + (y * WebpConstants.Bps)];
+                        }
+                    }
+
+                    dc[k] = avg;
+                    input = input.Slice(4); // go to next 4x4 block.
+                }
+            }
+        }
+
         [MethodImpl(InliningOptions.ShortMethod)]
         public static uint LoadUv(byte u, byte v) =>
             (uint)(u | (v << 16)); // We process u and v together stashed into 32bit(16bit each).
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
index 489977cb82..57e18832ed 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
@@ -2,10 +2,6 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
-#if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-#endif
 
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
@@ -13,7 +9,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
     /// Iterator structure to iterate through macroblocks, pointing to the
     /// right neighbouring data (samples, predictions, contexts, ...)
     /// </summary>
-    internal unsafe class Vp8EncIterator
+    internal class Vp8EncIterator
     {
         public const int YOffEnc = 0;
 
@@ -33,10 +29,6 @@ internal unsafe class Vp8EncIterator
 
         private readonly int mbh;
 
-#if SUPPORTS_RUNTIME_INTRINSICS
-        private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create(0x00ff).AsByte();
-#endif
-
         /// <summary>
         /// Stride of the prediction plane(=4*mbw + 1).
         /// </summary>
@@ -371,10 +363,10 @@ public int FastMbAnalyze(int quality)
             uint m2;
             for (k = 0; k < 16; k += 4)
             {
-                this.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4), tmp);
+                LossyUtils.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4), tmp);
             }
 
-            for (m = 0, m2 = 0, k = 0; k < 16; ++k)
+            for (m = 0, m2 = 0, k = 0; k < 16; k++)
             {
                 m += dc[k];
                 m2 += dc[k] * dc[k];
@@ -832,64 +824,6 @@ public void BytesToNz()
             this.Nz[this.nzIdx] = nz;
         }
 
-        private void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Sse2.IsSupported)
-            {
-#pragma warning disable SA1503 // Braces should not be omitted
-                tmp.Clear();
-                fixed (byte* inputPtr = input)
-                fixed (ushort* tmpPtr = tmp)
-                {
-                    Vector128<byte> a0 = Sse2.LoadVector128(inputPtr);
-                    Vector128<byte> a1 = Sse2.LoadVector128(inputPtr + WebpConstants.Bps);
-                    Vector128<byte> a2 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 2));
-                    Vector128<byte> a3 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 3));
-                    Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte
-                    Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8);
-                    Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8);
-                    Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8);
-                    Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte
-                    Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask);
-                    Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask);
-                    Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask);
-                    Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32());
-                    Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32());
-                    Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32());
-                    Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32());
-                    Vector128<int> e0 = Sse2.Add(d0, d1);
-                    Vector128<int> e1 = Sse2.Add(d2, d3);
-                    Vector128<int> f0 = Sse2.Add(e0, e1);
-                    Sse2.Store(tmpPtr, f0.AsUInt16());
-                }
-#pragma warning restore SA1503 // Braces should not be omitted
-
-                dc[0] = (uint)(tmp[1] + tmp[0]);
-                dc[1] = (uint)(tmp[3] + tmp[2]);
-                dc[2] = (uint)(tmp[5] + tmp[4]);
-                dc[3] = (uint)(tmp[7] + tmp[6]);
-            }
-            else
-#endif
-            {
-                for (int k = 0; k < 4; k++)
-                {
-                    uint avg = 0;
-                    for (int y = 0; y < 4; y++)
-                    {
-                        for (int x = 0; x < 4; x++)
-                        {
-                            avg += input[x + (y * WebpConstants.Bps)];
-                        }
-                    }
-
-                    dc[k] = avg;
-                    input = input.Slice(4); // go to next 4x4 block.
-                }
-            }
-        }
-
         private void ImportBlock(Span<byte> src, int srcStride, Span<byte> dst, int w, int h, int size)
         {
             int dstIdx = 0;

From 984971e1d9aca406cfd41b742da96b2d8447fa1b Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 16:48:10 +0100
Subject: [PATCH 52/85] Move yuv related methods to YuvConversion class

---
 .../Formats/Webp/Lossy/LossyUtils.cs          | 31 -------------------
 .../Formats/Webp/Lossy/WebpLossyDecoder.cs    | 24 +++++++-------
 .../Formats/Webp/Lossy/YuvConversion.cs       | 31 +++++++++++++++++++
 3 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index c3f6e522ac..b2513feb55 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -867,27 +867,6 @@ public static void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp)
             }
         }
 
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static uint LoadUv(byte u, byte v) =>
-            (uint)(u | (v << 16)); // We process u and v together stashed into 32bit(16bit each).
-
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static void YuvToBgr(int y, int u, int v, Span<byte> bgr)
-        {
-            bgr[0] = (byte)YuvToB(y, u);
-            bgr[1] = (byte)YuvToG(y, u, v);
-            bgr[2] = (byte)YuvToR(y, v);
-        }
-
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static int YuvToB(int y, int u) => Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685);
-
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static int YuvToG(int y, int u, int v) => Clip8(MultHi(y, 19077) - MultHi(u, 6419) - MultHi(v, 13320) + 8708);
-
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static int YuvToR(int y, int v) => Clip8(MultHi(y, 19077) + MultHi(v, 26149) - 14234);
-
         [MethodImpl(InliningOptions.ShortMethod)]
         public static byte Avg2(byte a, byte b) => (byte)((a + b + 1) >> 1);
 
@@ -1092,9 +1071,6 @@ private static bool Hev(Span<byte> p, int offset, int step, int thresh)
             return WebpLookupTables.Abs0(p1 - p0) > thresh || WebpLookupTables.Abs0(q1 - q0) > thresh;
         }
 
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private static int MultHi(int v, int coeff) => (v * coeff) >> 8;
-
         [MethodImpl(InliningOptions.ShortMethod)]
         private static void Store(Span<byte> dst, int x, int y, int v)
         {
@@ -1117,13 +1093,6 @@ private static void Store2(Span<byte> dst, int y, int dc, int d, int c)
         [MethodImpl(InliningOptions.ShortMethod)]
         private static int Mul2(int a) => (a * 35468) >> 16;
 
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private static byte Clip8(int v)
-        {
-            int yuvMask = (256 << 6) - 1;
-            return (byte)((v & ~yuvMask) == 0 ? v >> 6 : v < 0 ? 0 : 255);
-        }
-
         [MethodImpl(InliningOptions.ShortMethod)]
         private static void Put8x8uv(byte value, Span<byte> dst)
         {
diff --git a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs
index 4f283f9f53..2f78842c63 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs
@@ -747,21 +747,21 @@ private void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span
         {
             int xStep = 3;
             int lastPixelPair = (len - 1) >> 1;
-            uint tluv = LossyUtils.LoadUv(topU[0], topV[0]); // top-left sample
-            uint luv = LossyUtils.LoadUv(curU[0], curV[0]); // left-sample
+            uint tluv = YuvConversion.LoadUv(topU[0], topV[0]); // top-left sample
+            uint luv = YuvConversion.LoadUv(curU[0], curV[0]); // left-sample
             uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2;
-            LossyUtils.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst);
+            YuvConversion.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst);
 
             if (bottomY != null)
             {
                 uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2;
-                LossyUtils.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst);
+                YuvConversion.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst);
             }
 
             for (int x = 1; x <= lastPixelPair; x++)
             {
-                uint tuv = LossyUtils.LoadUv(topU[x], topV[x]); // top sample
-                uint uv = LossyUtils.LoadUv(curU[x], curV[x]); // sample
+                uint tuv = YuvConversion.LoadUv(topU[x], topV[x]); // top sample
+                uint uv = YuvConversion.LoadUv(curU[x], curV[x]); // sample
 
                 // Precompute invariant values associated with first and second diagonals.
                 uint avg = tluv + tuv + luv + uv + 0x00080008u;
@@ -770,15 +770,15 @@ private void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span
                 uv0 = (diag12 + tluv) >> 1;
                 uint uv1 = (diag03 + tuv) >> 1;
                 int xMul2 = x * 2;
-                LossyUtils.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep));
-                LossyUtils.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep));
+                YuvConversion.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep));
+                YuvConversion.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep));
 
                 if (bottomY != null)
                 {
                     uv0 = (diag03 + luv) >> 1;
                     uv1 = (diag12 + uv) >> 1;
-                    LossyUtils.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep));
-                    LossyUtils.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep));
+                    YuvConversion.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep));
+                    YuvConversion.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep));
                 }
 
                 tluv = tuv;
@@ -788,11 +788,11 @@ private void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span
             if ((len & 1) == 0)
             {
                 uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2;
-                LossyUtils.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep));
+                YuvConversion.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep));
                 if (bottomY != null)
                 {
                     uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2;
-                    LossyUtils.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep));
+                    YuvConversion.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep));
                 }
             }
         }
diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
index ed03c2e71d..24143785ab 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
@@ -299,5 +299,36 @@ private static int ClipUv(int uv, int rounding)
             uv = (uv + rounding + (128 << (YuvFix + 2))) >> (YuvFix + 2);
             return (uv & ~0xff) == 0 ? uv : uv < 0 ? 0 : 255;
         }
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static uint LoadUv(byte u, byte v) =>
+            (uint)(u | (v << 16)); // Packs u into the low 16 bits and v into the high 16 bits so both are processed together in one 32 bit value.
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static void YuvToBgr(int y, int u, int v, Span<byte> bgr)
+        {
+            bgr[0] = (byte)YuvToB(y, u); // Output order is blue, green, red; bgr must hold at least 3 bytes.
+            bgr[1] = (byte)YuvToG(y, u, v); // green
+            bgr[2] = (byte)YuvToR(y, v); // red
+        }
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static int YuvToB(int y, int u) => Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685); // Blue is derived from y and u only; Clip8 clamps to 0..255.
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static int YuvToG(int y, int u, int v) => Clip8(MultHi(y, 19077) - MultHi(u, 6419) - MultHi(v, 13320) + 8708); // Green is derived from y, u and v; Clip8 clamps to 0..255.
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static int YuvToR(int y, int v) => Clip8(MultHi(y, 19077) + MultHi(v, 26149) - 14234); // Red is derived from y and v only; Clip8 clamps to 0..255.
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static int MultHi(int v, int coeff) => (v * coeff) >> 8; // Multiplies, then drops the low 8 bits of the product (arithmetic shift).
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static byte Clip8(int v)
+        {
+            int yuvMask = (256 << 6) - 1; // 0x3FFF: any v without bits above this mask maps to 0..255 after >> 6.
+            return (byte)((v & ~yuvMask) == 0 ? v >> 6 : v < 0 ? 0 : 255); // Out of range: negative clamps to 0, too large clamps to 255.
+        }
     }
 }

From 0c96e37ba639d1d44b64840c41f01455a53eb9af Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 17:39:50 +0100
Subject: [PATCH 53/85] Add Mean16x4 sse tests

---
 .../Formats/Webp/Lossy/LossyUtils.cs          |  2 +-
 .../Formats/WebP/LossyUtilsTests.cs           | 49 +++++++++++++++++++
 2 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index b2513feb55..74448cf528 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -15,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
     internal static unsafe class LossyUtils
     {
 #if SUPPORTS_RUNTIME_INTRINSICS
-        private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create(0x00ff).AsByte();
+        private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
 #endif
 
         [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
new file mode 100644
index 0000000000..5062f845ba
--- /dev/null
+++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
@@ -0,0 +1,49 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System.Linq;
+using SixLabors.ImageSharp.Formats.Webp.Lossy;
+using SixLabors.ImageSharp.Tests.TestUtilities;
+using Xunit;
+
+namespace SixLabors.ImageSharp.Tests.Formats.WebP
+{
+    [Trait("Format", "Webp")]
+    public class LossyUtilsTests
+    {
+        private static void RunMean16x4Test()
+        {
+            // Arrange: 128 input bytes; the expected sums below match four rows with a 32 byte stride (assumes WebpConstants.Bps == 32).
+            byte[] input =
+            {
+                154, 145, 102, 115, 127, 129, 126, 125, 126, 120, 133, 152, 157, 153, 119, 94, 104, 116, 111, 113,
+                113, 109, 105, 124, 173, 175, 177, 170, 175, 172, 166, 164, 151, 141, 99, 114, 125, 126, 135, 150,
+                133, 115, 127, 149, 141, 168, 100, 54, 110, 117, 115, 116, 119, 115, 117, 130, 174, 174, 174, 157,
+                146, 171, 166, 158, 117, 140, 96, 111, 119, 119, 136, 171, 188, 134, 121, 126, 136, 119, 59, 77,
+                109, 115, 113, 120, 120, 117, 128, 115, 174, 173, 173, 161, 152, 148, 153, 162, 105, 140, 96, 114,
+                115, 122, 141, 173, 190, 190, 142, 106, 151, 78, 66, 141, 110, 117, 123, 136, 118, 124, 127, 114,
+                173, 175, 166, 155, 155, 159, 159, 158
+            };
+            uint[] dc = new uint[4];
+            ushort[] tmp = new ushort[8];
+            uint[] expectedDc = { 1940, 2139, 2252, 1813 };
+
+            // Act: dc receives one value per 4x4 block (the sum of its 16 samples); tmp is scratch space for the SSE2 path.
+            LossyUtils.Mean16x4(input, dc, tmp);
+
+            // Assert: all four block sums must match, whichever code path was taken.
+            Assert.True(dc.SequenceEqual(expectedDc));
+        }
+
+        [Fact]
+        public void Mean16x4_Works() => RunMean16x4Test();
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Fact]
+        public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void Mean16x4_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableSSE2);
+#endif
+    }
+}

From 90bab3939770a028a45e3d824dc6949fa124c492 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Mon, 8 Nov 2021 16:56:38 +1100
Subject: [PATCH 54/85] Special case La32 and L16

---
 src/ImageSharp/Color/Color.Conversions.cs | 22 ++++++++++++++++++++++
 src/ImageSharp/Color/Color.cs             |  8 ++++++++
 2 files changed, 30 insertions(+)

diff --git a/src/ImageSharp/Color/Color.Conversions.cs b/src/ImageSharp/Color/Color.Conversions.cs
index 96aa05c961..bf7869e53d 100644
--- a/src/ImageSharp/Color/Color.Conversions.cs
+++ b/src/ImageSharp/Color/Color.Conversions.cs
@@ -34,6 +34,28 @@ public Color(Rgb48 pixel)
             this.boxedHighPrecisionPixel = null;
         }
 
+        /// <summary>
+        /// Initializes a new instance of the <see cref="Color"/> struct from a luminance and alpha pixel.
+        /// </summary>
+        /// <param name="pixel">The <see cref="La32"/> whose luminance is copied to the red, green and blue channels and whose alpha is kept.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public Color(La32 pixel)
+        {
+            this.data = new Rgba64(pixel.L, pixel.L, pixel.L, pixel.A);
+            this.boxedHighPrecisionPixel = null;
+        }
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="Color"/> struct from a luminance-only pixel.
+        /// </summary>
+        /// <param name="pixel">The <see cref="L16"/> whose packed value is copied to the red, green and blue channels; alpha is set to <see cref="ushort.MaxValue"/>.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public Color(L16 pixel)
+        {
+            this.data = new Rgba64(pixel.PackedValue, pixel.PackedValue, pixel.PackedValue, ushort.MaxValue);
+            this.boxedHighPrecisionPixel = null;
+        }
+
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
diff --git a/src/ImageSharp/Color/Color.cs b/src/ImageSharp/Color/Color.cs
index c461d034eb..7c21d62ddf 100644
--- a/src/ImageSharp/Color/Color.cs
+++ b/src/ImageSharp/Color/Color.cs
@@ -117,6 +117,14 @@ public static Color FromPixel<TPixel>(TPixel pixel)
             {
                 return new((Rgb48)(object)pixel);
             }
+            else if (typeof(TPixel) == typeof(La32))
+            {
+                return new((La32)(object)pixel);
+            }
+            else if (typeof(TPixel) == typeof(L16))
+            {
+                return new((L16)(object)pixel);
+            }
             else if (Unsafe.SizeOf<TPixel>() <= Unsafe.SizeOf<Rgba32>())
             {
                 Rgba32 p = default;

From 8d19c2881da8da3a7a88a569b6f7784bbc1c210c Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Mon, 8 Nov 2021 10:41:52 +0100
Subject: [PATCH 55/85] Add sse2 version of Vp8Sse4X4

---
 .../Formats/Webp/Lossy/LossyUtils.cs          | 59 ++++++++++++++++++-
 1 file changed, 57 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index d5db3dffa5..82e2214701 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -4,11 +4,16 @@
 using System;
 using System.Buffers.Binary;
 using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
-    internal static class LossyUtils
+    internal static unsafe class LossyUtils
     {
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
@@ -17,7 +22,57 @@ internal static class LossyUtils
         public static int Vp8Sse16X8(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 8);
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static int Vp8Sse4X4(Span<byte> a, Span<byte> b) => GetSse(a, b, 4, 4);
+        public static int Vp8Sse4X4(Span<byte> a, Span<byte> b)
+        {
+            // Returns the sum of squared differences of two 4x4 blocks whose rows are WebpConstants.Bps bytes apart.
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
+            {
+#pragma warning disable SA1503 // Braces should not be omitted
+                Span<int> tmp = stackalloc int[4];
+                fixed (byte* aPtr = a)
+                fixed (byte* bPtr = b)
+                fixed (int* tmpPtr = tmp)
+                {
+                    // Load one row per vector; only the first 4 bytes of each row are used below.
+                    Vector128<byte> a0 = Sse2.LoadVector128(aPtr);
+                    Vector128<byte> a1 = Sse2.LoadVector128(aPtr + WebpConstants.Bps);
+                    Vector128<byte> a2 = Sse2.LoadVector128(aPtr + (WebpConstants.Bps * 2));
+                    Vector128<byte> a3 = Sse2.LoadVector128(aPtr + (WebpConstants.Bps * 3));
+                    Vector128<byte> b0 = Sse2.LoadVector128(bPtr);
+                    Vector128<byte> b1 = Sse2.LoadVector128(bPtr + WebpConstants.Bps);
+                    Vector128<byte> b2 = Sse2.LoadVector128(bPtr + (WebpConstants.Bps * 2));
+                    Vector128<byte> b3 = Sse2.LoadVector128(bPtr + (WebpConstants.Bps * 3));
+
+                    // Combine pair of lines: the low 8 bytes now hold 4 samples of each of the two rows.
+                    Vector128<int> a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());
+                    Vector128<int> a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32());
+                    Vector128<int> b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32());
+                    Vector128<int> b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32());
+
+                    // Convert to 16b by interleaving with zero (zero extension of every sample).
+                    Vector128<byte> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero);
+                    Vector128<byte> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero);
+                    Vector128<byte> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero);
+                    Vector128<byte> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero);
+
+                    // Subtract as signed 16b lanes (a byte-wise unsigned subtract would turn every negative difference into 0), square and accumulate.
+                    Vector128<short> d0 = Sse2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
+                    Vector128<short> d1 = Sse2.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16());
+                    Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0, d0);
+                    Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);
+                    Vector128<int> sum = Sse2.Add(e0, e1);
+                    Sse2.Store(tmpPtr, sum);
+                    return tmp[3] + tmp[2] + tmp[1] + tmp[0];
+                }
+#pragma warning restore SA1503 // Braces should not be omitted
+            }
+            else
+#endif
+            {
+                return GetSse(a, b, 4, 4);
+            }
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int GetSse(Span<byte> a, Span<byte> b, int w, int h)

From 5c6e08b80c39f3cd4e24774ee66b5b011c41aa00 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Mon, 8 Nov 2021 16:02:06 +0100
Subject: [PATCH 56/85] Avoid pinning of vp8 matrix data

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 169 +++++++++---------
 1 file changed, 85 insertions(+), 84 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index f935bd3ee0..b300b7b5c2 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -3,6 +3,7 @@
 
 using System;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
@@ -537,99 +538,99 @@ public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix
             if (Sse41.IsSupported)
             {
 #pragma warning disable SA1503 // Braces should not be omitted
-                fixed (ushort* mtxIqPtr = mtx.IQ)
-                fixed (ushort* mtxQPtr = mtx.Q)
-                fixed (uint* biasQPtr = mtx.Bias)
-                fixed (short* sharpenPtr = mtx.Sharpen)
+                // Load all inputs.
+                Vector128<short> input0 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input));
+                Vector128<short> input8 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input.Slice(8, 8)));
+                Vector128<ushort> iq0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.IQ.AsSpan(0, 8)));
+                Vector128<ushort> iq8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.IQ.AsSpan(8, 8)));
+                Vector128<ushort> q0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.Q.AsSpan(0, 8)));
+                Vector128<ushort> q8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.Q.AsSpan(8, 8)));
+
+                // coeff = abs(in)
+                Vector128<ushort> coeff0 = Ssse3.Abs(input0);
+                Vector128<ushort> coeff8 = Ssse3.Abs(input8);
+
+                // coeff = abs(in) + sharpen (Sse2.Add returns a new vector, so the sum has to be assigned back).
+                Vector128<short> sharpen0 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(mtx.Sharpen.AsSpan(0, 8)));
+                Vector128<short> sharpen8 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(mtx.Sharpen.AsSpan(8, 8)));
+                coeff0 = Sse2.Add(coeff0.AsInt16(), sharpen0).AsUInt16();
+                coeff8 = Sse2.Add(coeff8.AsInt16(), sharpen8).AsUInt16();
+
+                // out = (coeff * iQ + B) >> QFIX
+                // doing calculations with 32b precision (QFIX=17)
+                // out = (coeff * iQ)
+                Vector128<ushort> coeffiQ0H = Sse2.MultiplyHigh(coeff0, iq0);
+                Vector128<ushort> coeffiQ0L = Sse2.MultiplyLow(coeff0, iq0);
+                Vector128<ushort> coeffiQ8H = Sse2.MultiplyHigh(coeff8, iq8);
+                Vector128<ushort> coeffiQ8L = Sse2.MultiplyLow(coeff8, iq8);
+                Vector128<ushort> out00 = Sse2.UnpackLow(coeffiQ0L, coeffiQ0H);
+                Vector128<ushort> out04 = Sse2.UnpackHigh(coeffiQ0L, coeffiQ0H);
+                Vector128<ushort> out08 = Sse2.UnpackLow(coeffiQ8L, coeffiQ8H);
+                Vector128<ushort> out12 = Sse2.UnpackHigh(coeffiQ8L, coeffiQ8H);
+
+                // out = (coeff * iQ + B)
+                Vector128<uint> bias00 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(0, 4)));
+                Vector128<uint> bias04 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(4, 4)));
+                Vector128<uint> bias08 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(8, 4)));
+                Vector128<uint> bias12 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(12, 4)));
+                out00 = Sse2.Add(out00.AsInt32(), bias00.AsInt32()).AsUInt16();
+                out04 = Sse2.Add(out04.AsInt32(), bias04.AsInt32()).AsUInt16();
+                out08 = Sse2.Add(out08.AsInt32(), bias08.AsInt32()).AsUInt16();
+                out12 = Sse2.Add(out12.AsInt32(), bias12.AsInt32()).AsUInt16();
+
+                // out = QUANTDIV(coeff, iQ, B, QFIX)
+                out00 = Sse2.ShiftRightArithmetic(out00.AsInt32(), WebpConstants.QFix).AsUInt16();
+                out04 = Sse2.ShiftRightArithmetic(out04.AsInt32(), WebpConstants.QFix).AsUInt16();
+                out08 = Sse2.ShiftRightArithmetic(out08.AsInt32(), WebpConstants.QFix).AsUInt16();
+                out12 = Sse2.ShiftRightArithmetic(out12.AsInt32(), WebpConstants.QFix).AsUInt16();
+
+                // pack result as 16b
+                Vector128<short> out0 = Sse2.PackSignedSaturate(out00.AsInt32(), out04.AsInt32());
+                Vector128<short> out8 = Sse2.PackSignedSaturate(out08.AsInt32(), out12.AsInt32());
+
+                // if (coeff > 2047) coeff = 2047
+                out0 = Sse2.Min(out0, MaxCoeff2047);
+                out8 = Sse2.Min(out8, MaxCoeff2047);
+
+                // put sign back
+                out0 = Ssse3.Sign(out0, input0);
+                out8 = Ssse3.Sign(out8, input8);
+
+                // in = out * Q
+                input0 = Sse2.MultiplyLow(out0, q0.AsInt16());
+                input8 = Sse2.MultiplyLow(out8, q8.AsInt16());
+
                 fixed (short* inputPtr = input)
-                fixed (short* outputPtr = output)
                 {
-                    // Load all inputs.
-                    Vector128<short> input0 = Sse2.LoadVector128(inputPtr);
-                    Vector128<short> input8 = Sse2.LoadVector128(inputPtr + 8);
-                    Vector128<ushort> iq0 = Sse2.LoadVector128(mtxIqPtr);
-                    Vector128<ushort> iq8 = Sse2.LoadVector128(mtxIqPtr + 8);
-                    Vector128<ushort> q0 = Sse2.LoadVector128(mtxQPtr);
-                    Vector128<ushort> q8 = Sse2.LoadVector128(mtxQPtr + 8);
-
-                    // coeff = abs(in)
-                    Vector128<ushort> coeff0 = Ssse3.Abs(input0);
-                    Vector128<ushort> coeff8 = Ssse3.Abs(input8);
-
-                    // coeff = abs(in) + sharpen
-                    Vector128<short> sharpen0 = Sse2.LoadVector128(sharpenPtr);
-                    Vector128<short> sharpen8 = Sse2.LoadVector128(sharpenPtr + 8);
-                    Sse2.Add(coeff0.AsInt16(), sharpen0);
-                    Sse2.Add(coeff8.AsInt16(), sharpen8);
-
-                    // out = (coeff * iQ + B) >> QFIX
-                    // doing calculations with 32b precision (QFIX=17)
-                    // out = (coeff * iQ)
-                    Vector128<ushort> coeffiQ0H = Sse2.MultiplyHigh(coeff0, iq0);
-                    Vector128<ushort> coeffiQ0L = Sse2.MultiplyLow(coeff0, iq0);
-                    Vector128<ushort> coeffiQ8H = Sse2.MultiplyHigh(coeff8, iq8);
-                    Vector128<ushort> coeffiQ8L = Sse2.MultiplyLow(coeff8, iq8);
-                    Vector128<ushort> out00 = Sse2.UnpackLow(coeffiQ0L, coeffiQ0H);
-                    Vector128<ushort> out04 = Sse2.UnpackHigh(coeffiQ0L, coeffiQ0H);
-                    Vector128<ushort> out08 = Sse2.UnpackLow(coeffiQ8L, coeffiQ8H);
-                    Vector128<ushort> out12 = Sse2.UnpackHigh(coeffiQ8L, coeffiQ8H);
-
-                    // out = (coeff * iQ + B)
-                    Vector128<uint> bias00 = Sse2.LoadVector128(biasQPtr);
-                    Vector128<uint> bias04 = Sse2.LoadVector128(biasQPtr + 4);
-                    Vector128<uint> bias08 = Sse2.LoadVector128(biasQPtr + 8);
-                    Vector128<uint> bias12 = Sse2.LoadVector128(biasQPtr + 12);
-                    out00 = Sse2.Add(out00.AsInt32(), bias00.AsInt32()).AsUInt16();
-                    out04 = Sse2.Add(out04.AsInt32(), bias04.AsInt32()).AsUInt16();
-                    out08 = Sse2.Add(out08.AsInt32(), bias08.AsInt32()).AsUInt16();
-                    out12 = Sse2.Add(out12.AsInt32(), bias12.AsInt32()).AsUInt16();
-
-                    // out = QUANTDIV(coeff, iQ, B, QFIX)
-                    out00 = Sse2.ShiftRightArithmetic(out00.AsInt32(), WebpConstants.QFix).AsUInt16();
-                    out04 = Sse2.ShiftRightArithmetic(out04.AsInt32(), WebpConstants.QFix).AsUInt16();
-                    out08 = Sse2.ShiftRightArithmetic(out08.AsInt32(), WebpConstants.QFix).AsUInt16();
-                    out12 = Sse2.ShiftRightArithmetic(out12.AsInt32(), WebpConstants.QFix).AsUInt16();
-
-                    // pack result as 16b
-                    Vector128<short> out0 = Sse2.PackSignedSaturate(out00.AsInt32(), out04.AsInt32());
-                    Vector128<short> out8 = Sse2.PackSignedSaturate(out08.AsInt32(), out12.AsInt32());
-
-                    // if (coeff > 2047) coeff = 2047
-                    out0 = Sse2.Min(out0, MaxCoeff2047);
-                    out8 = Sse2.Min(out8, MaxCoeff2047);
-
-                    // put sign back
-                    out0 = Ssse3.Sign(out0, input0);
-                    out8 = Ssse3.Sign(out8, input8);
-
-                    // in = out * Q
-                    input0 = Sse2.MultiplyLow(out0, q0.AsInt16());
-                    input8 = Sse2.MultiplyLow(out8, q8.AsInt16());
-
                     // in = out * Q
                     Sse2.Store(inputPtr, input0);
                     Sse2.Store(inputPtr + 8, input8);
+                }
 
-                    // zigzag the output before storing it. The re-ordering is:
-                    //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
-                    // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
-                    // There's only two misplaced entries ([8] and [7]) that are crossing the
-                    // reg's boundaries.
-                    // We use pshufb instead of pshuflo/pshufhi.
-                    Vector128<byte> tmpLo = Ssse3.Shuffle(out0.AsByte(), CstLo);
-                    Vector128<byte> tmp7 = Ssse3.Shuffle(out0.AsByte(), Cst7);  // extract #7
-                    Vector128<byte> tmpHi = Ssse3.Shuffle(out8.AsByte(), CstHi);
-                    Vector128<byte> tmp8 = Ssse3.Shuffle(out8.AsByte(), Cst8);  // extract #8
-                    Vector128<byte> outZ0 = Sse2.Or(tmpLo, tmp8);
-                    Vector128<byte> outZ8 = Sse2.Or(tmpHi, tmp7);
+                // zigzag the output before storing it. The re-ordering is:
+                //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
+                // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
+                // There's only two misplaced entries ([8] and [7]) that are crossing the
+                // reg's boundaries.
+                // We use pshufb instead of pshuflo/pshufhi.
+                Vector128<byte> tmpLo = Ssse3.Shuffle(out0.AsByte(), CstLo);
+                Vector128<byte> tmp7 = Ssse3.Shuffle(out0.AsByte(), Cst7);  // extract #7
+                Vector128<byte> tmpHi = Ssse3.Shuffle(out8.AsByte(), CstHi);
+                Vector128<byte> tmp8 = Ssse3.Shuffle(out8.AsByte(), Cst8);  // extract #8
+                Vector128<byte> outZ0 = Sse2.Or(tmpLo, tmp8);
+                Vector128<byte> outZ8 = Sse2.Or(tmpHi, tmp7);
+
+                fixed (short* outputPtr = output)
+                {
                     Sse2.Store(outputPtr, outZ0.AsInt16());
                     Sse2.Store(outputPtr + 8, outZ8.AsInt16());
-                    Vector128<sbyte> packedOutput = Sse2.PackSignedSaturate(outZ0.AsInt16(), outZ8.AsInt16());
-
-                    // Detect if all 'out' values are zeroes or not.
-                    Vector128<sbyte> cmpeq = Sse2.CompareEqual(packedOutput, Vector128<sbyte>.Zero);
-                    return Sse2.MoveMask(cmpeq) != 0xffff ? 1 : 0;
                 }
+
+                Vector128<sbyte> packedOutput = Sse2.PackSignedSaturate(outZ0.AsInt16(), outZ8.AsInt16());
+
+                // Detect if all 'out' values are zeroes or not.
+                Vector128<sbyte> cmpeq = Sse2.CompareEqual(packedOutput, Vector128<sbyte>.Zero);
+                return Sse2.MoveMask(cmpeq) != 0xffff ? 1 : 0;
 #pragma warning restore SA1503 // Braces should not be omitted
             }
             else

From 0c0812de82648be40a35dc63a9b6c914bdcbbbf7 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Mon, 8 Nov 2021 16:58:40 +0100
Subject: [PATCH 57/85] Avoid pinning input and output data

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index b300b7b5c2..6e25dc003c 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -600,12 +600,10 @@ public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix
                 input0 = Sse2.MultiplyLow(out0, q0.AsInt16());
                 input8 = Sse2.MultiplyLow(out8, q8.AsInt16());
 
-                fixed (short* inputPtr = input)
-                {
-                    // in = out * Q
-                    Sse2.Store(inputPtr, input0);
-                    Sse2.Store(inputPtr + 8, input8);
-                }
+                // in = out * Q
+                ref short inputRef = ref MemoryMarshal.GetReference(input);
+                Unsafe.As<short, Vector128<short>>(ref inputRef) = input0;
+                Unsafe.As<short, Vector128<short>>(ref Unsafe.Add(ref inputRef, 8)) = input8;
 
                 // zigzag the output before storing it. The re-ordering is:
                 //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
@@ -620,11 +618,9 @@ public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix
                 Vector128<byte> outZ0 = Sse2.Or(tmpLo, tmp8);
                 Vector128<byte> outZ8 = Sse2.Or(tmpHi, tmp7);
 
-                fixed (short* outputPtr = output)
-                {
-                    Sse2.Store(outputPtr, outZ0.AsInt16());
-                    Sse2.Store(outputPtr + 8, outZ8.AsInt16());
-                }
+                ref short outputRef = ref MemoryMarshal.GetReference(output);
+                Unsafe.As<short, Vector128<short>>(ref outputRef) = outZ0.AsInt16();
+                Unsafe.As<short, Vector128<short>>(ref Unsafe.Add(ref outputRef, 8)) = outZ8.AsInt16();
 
                 Vector128<sbyte> packedOutput = Sse2.PackSignedSaturate(outZ0.AsInt16(), outZ8.AsInt16());
 

From cffa4b0c366a3d80b7e5c315127ae0a27f1ddb8d Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Mon, 8 Nov 2021 17:00:18 +0100
Subject: [PATCH 58/85] Only test with and without HardwareIntrinsics

---
 tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
index 280a7902ae..d0cdfc1ded 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
@@ -44,13 +44,7 @@ private static void RunQuantizeBlockTest()
         public void QuantizeBlock_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.AllowAll);
 
         [Fact]
-        public void QuantizeBlock_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSE2);
-
-        [Fact]
-        public void QuantizeBlock_WithoutSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSSE3);
-
-        [Fact]
-        public void QuantizeBlock_WithoutSSE2AndSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableSSSE3);
+        public void QuantizeBlock_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableHWIntrinsic);
 #endif
     }
 }

From c9fc5cdb56a21deaf78ae4eb73a6e8270c951841 Mon Sep 17 00:00:00 2001
From: Berkan Diler <b.diler@gmx.de>
Date: Mon, 8 Nov 2021 18:33:24 +0100
Subject: [PATCH 59/85] Collapse AsSpan().Slice(..) calls into AsSpan(..)

---
 src/ImageSharp/Formats/Png/PngDecoderCore.cs                  | 2 +-
 src/ImageSharp/Formats/Webp/WebpDecoderCore.cs                | 2 +-
 src/ImageSharp/IO/ChunkedMemoryStream.cs                      | 4 ++--
 .../Processors/Transforms/Resize/ResizeKernelMap.cs           | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/ImageSharp/Formats/Png/PngDecoderCore.cs b/src/ImageSharp/Formats/Png/PngDecoderCore.cs
index 987dc150c2..cf3cd7eb14 100644
--- a/src/ImageSharp/Formats/Png/PngDecoderCore.cs
+++ b/src/ImageSharp/Formats/Png/PngDecoderCore.cs
@@ -1071,7 +1071,7 @@ private bool TryUncompressTextData(ReadOnlySpan<byte> compressedData, Encoding e
                 int bytesRead = inflateStream.CompressedStream.Read(this.buffer, 0, this.buffer.Length);
                 while (bytesRead != 0)
                 {
-                    uncompressedBytes.AddRange(this.buffer.AsSpan().Slice(0, bytesRead).ToArray());
+                    uncompressedBytes.AddRange(this.buffer.AsSpan(0, bytesRead).ToArray());
                     bytesRead = inflateStream.CompressedStream.Read(this.buffer, 0, this.buffer.Length);
                 }
 
diff --git a/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs b/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs
index 44a55a4c65..09071406c5 100644
--- a/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs
+++ b/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs
@@ -306,7 +306,7 @@ private WebpImageInfo ReadVp8Header(WebpFeatures features = null)
 
             // Check for VP8 magic bytes.
             this.currentStream.Read(this.buffer, 0, 3);
-            if (!this.buffer.AsSpan().Slice(0, 3).SequenceEqual(WebpConstants.Vp8HeaderMagicBytes))
+            if (!this.buffer.AsSpan(0, 3).SequenceEqual(WebpConstants.Vp8HeaderMagicBytes))
             {
                 WebpThrowHelper.ThrowImageFormatException("VP8 magic bytes not found");
             }
diff --git a/src/ImageSharp/IO/ChunkedMemoryStream.cs b/src/ImageSharp/IO/ChunkedMemoryStream.cs
index b9220c56ab..e28baf879d 100644
--- a/src/ImageSharp/IO/ChunkedMemoryStream.cs
+++ b/src/ImageSharp/IO/ChunkedMemoryStream.cs
@@ -243,7 +243,7 @@ public override int Read(byte[] buffer, int offset, int count)
             const string bufferMessage = "Offset subtracted from the buffer length is less than count.";
             Guard.IsFalse(buffer.Length - offset < count, nameof(buffer), bufferMessage);
 
-            return this.ReadImpl(buffer.AsSpan().Slice(offset, count));
+            return this.ReadImpl(buffer.AsSpan(offset, count));
         }
 
 #if SUPPORTS_SPAN_STREAM
@@ -359,7 +359,7 @@ public override void Write(byte[] buffer, int offset, int count)
             const string bufferMessage = "Offset subtracted from the buffer length is less than count.";
             Guard.IsFalse(buffer.Length - offset < count, nameof(buffer), bufferMessage);
 
-            this.WriteImpl(buffer.AsSpan().Slice(offset, count));
+            this.WriteImpl(buffer.AsSpan(offset, count));
         }
 
 #if SUPPORTS_SPAN_STREAM
diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
index a58c20f687..9cc4680602 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
@@ -216,7 +216,7 @@ private ResizeKernel BuildKernel<TResampler>(in TResampler sampler, int destRowI
 
             ResizeKernel kernel = this.CreateKernel(dataRowIndex, left, right);
 
-            Span<double> kernelValues = this.tempValues.AsSpan().Slice(0, kernel.Length);
+            Span<double> kernelValues = this.tempValues.AsSpan(0, kernel.Length);
             double sum = 0;
 
             for (int j = left; j <= right; j++)

From 670e2eeafc14b7c16757f1b909eb552a9e61b1ca Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Tue, 9 Nov 2021 11:43:19 +1100
Subject: [PATCH 60/85] Update ColorTests.CastTo.cs

---
 .../ImageSharp.Tests/Color/ColorTests.CastTo.cs | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs b/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
index af35d1f895..3003265ca6 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
@@ -90,16 +90,25 @@ public void Bgr24()
             }
 
             [Fact]
-            public void TPixel()
+            public void GenericPixel()
             {
-                var source = new RgbaVector(float.Epsilon, 2 * float.Epsilon, float.MaxValue, float.MinValue);
+                AssertGenericPixel(new RgbaVector(float.Epsilon, 2 * float.Epsilon, float.MaxValue, float.MinValue));
+                AssertGenericPixel(new Rgba64(1, 2, ushort.MaxValue, ushort.MaxValue - 1));
+                AssertGenericPixel(new Rgb48(1, 2, ushort.MaxValue - 1));
+                AssertGenericPixel(new La32(1, ushort.MaxValue - 1));
+                AssertGenericPixel(new L16(ushort.MaxValue - 1));
+                AssertGenericPixel(new Rgba32(1, 2, 255, 254));
+            }
 
+            private static void AssertGenericPixel<TPixel>(TPixel source)
+                where TPixel : unmanaged, IPixel<TPixel>
+            {
                 // Act:
                 var color = Color.FromPixel(source);
 
                 // Assert:
-                RgbaVector data = color.ToPixel<RgbaVector>();
-                Assert.Equal(source, data);
+                TPixel actual = color.ToPixel<TPixel>();
+                Assert.Equal(source, actual);
             }
         }
     }

From cb513a905c52e843440f14c70e40fe9192737e91 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 11:05:18 +0100
Subject: [PATCH 61/85] Use fixed sized arrays in Vp8Matrix

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 20 ++++----
 .../Formats/Webp/Lossy/Vp8Encoder.cs          |  8 +---
 .../Formats/Webp/Lossy/Vp8Matrix.cs           | 47 +++++--------------
 .../Formats/Webp/Lossy/Vp8SegmentInfo.cs      | 12 ++---
 .../Formats/WebP/QuantEncTests.cs             | 17 ++++---
 5 files changed, 41 insertions(+), 63 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 6e25dc003c..4c3a2ff5e3 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -541,18 +541,18 @@ public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix
                 // Load all inputs.
                 Vector128<short> input0 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input));
                 Vector128<short> input8 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input.Slice(8, 8)));
-                Vector128<ushort> iq0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.IQ.AsSpan(0, 8)));
-                Vector128<ushort> iq8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.IQ.AsSpan(8, 8)));
-                Vector128<ushort> q0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.Q.AsSpan(0, 8)));
-                Vector128<ushort> q8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.Q.AsSpan(8, 8)));
+                Vector128<ushort> iq0 = Unsafe.As<ushort, Vector128<ushort>>(ref mtx.IQ[0]);
+                Vector128<ushort> iq8 = Unsafe.As<ushort, Vector128<ushort>>(ref mtx.IQ[8]);
+                Vector128<ushort> q0 = Unsafe.As<ushort, Vector128<ushort>>(ref mtx.Q[0]);
+                Vector128<ushort> q8 = Unsafe.As<ushort, Vector128<ushort>>(ref mtx.Q[8]);
 
                 // coeff = abs(in)
                 Vector128<ushort> coeff0 = Ssse3.Abs(input0);
                 Vector128<ushort> coeff8 = Ssse3.Abs(input8);
 
                 // coeff = abs(in) + sharpen
-                Vector128<short> sharpen0 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(mtx.Sharpen.AsSpan(0, 8)));
-                Vector128<short> sharpen8 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(mtx.Sharpen.AsSpan(8, 8)));
+                Vector128<short> sharpen0 = Unsafe.As<short, Vector128<short>>(ref mtx.Sharpen[0]);
+                Vector128<short> sharpen8 = Unsafe.As<short, Vector128<short>>(ref mtx.Sharpen[8]);
                 Sse2.Add(coeff0.AsInt16(), sharpen0);
                 Sse2.Add(coeff8.AsInt16(), sharpen8);
 
@@ -569,10 +569,10 @@ public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix
                 Vector128<ushort> out12 = Sse2.UnpackHigh(coeffiQ8L, coeffiQ8H);
 
                 // out = (coeff * iQ + B)
-                Vector128<uint> bias00 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(0, 4)));
-                Vector128<uint> bias04 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(4, 4)));
-                Vector128<uint> bias08 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(8, 4)));
-                Vector128<uint> bias12 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(12, 4)));
+                Vector128<uint> bias00 = Unsafe.As<uint, Vector128<uint>>(ref mtx.Bias[0]);
+                Vector128<uint> bias04 = Unsafe.As<uint, Vector128<uint>>(ref mtx.Bias[4]);
+                Vector128<uint> bias08 = Unsafe.As<uint, Vector128<uint>>(ref mtx.Bias[8]);
+                Vector128<uint> bias12 = Unsafe.As<uint, Vector128<uint>>(ref mtx.Bias[12]);
                 out00 = Sse2.Add(out00.AsInt32(), bias00.AsInt32()).AsUInt16();
                 out04 = Sse2.Add(out04.AsInt32(), bias04.AsInt32()).AsUInt16();
                 out08 = Sse2.Add(out08.AsInt32(), bias08.AsInt32()).AsUInt16();
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
index 728574682f..8a4115d216 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
@@ -502,7 +502,7 @@ private void SetLoopParams(float q)
             this.ResetStats();
         }
 
-        private void AdjustFilterStrength()
+        private unsafe void AdjustFilterStrength()
         {
             if (this.filterStrength > 0)
             {
@@ -806,7 +806,7 @@ private void ResetStats()
             proba.NbSkip = 0;
         }
 
-        private void SetupMatrices(Vp8SegmentInfo[] dqm)
+        private unsafe void SetupMatrices(Vp8SegmentInfo[] dqm)
         {
             int tlambdaScale = this.method >= WebpEncodingMethod.Default ? this.spatialNoiseShaping : 0;
             for (int i = 0; i < dqm.Length; i++)
@@ -814,10 +814,6 @@ private void SetupMatrices(Vp8SegmentInfo[] dqm)
                 Vp8SegmentInfo m = dqm[i];
                 int q = m.Quant;
 
-                m.Y1 = new Vp8Matrix();
-                m.Y2 = new Vp8Matrix();
-                m.Uv = new Vp8Matrix();
-
                 m.Y1.Q[0] = WebpLookupTables.DcTable[Numerics.Clamp(q + this.DqY1Dc, 0, 127)];
                 m.Y1.Q[1] = WebpLookupTables.AcTable[Numerics.Clamp(q, 0, 127)];
 
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs
index e525e388b8..66c91e44ad 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs
@@ -3,7 +3,7 @@
 
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
-    internal class Vp8Matrix
+    internal unsafe struct Vp8Matrix
     {
         private static readonly int[][] BiasMatrices =
         {
@@ -23,50 +23,29 @@ internal class Vp8Matrix
         private const int SharpenBits = 11;
 
         /// <summary>
-        /// Initializes a new instance of the <see cref="Vp8Matrix"/> class.
+        /// The quantizer steps.
         /// </summary>
-        public Vp8Matrix()
-        {
-            this.Q = new ushort[16];
-            this.IQ = new ushort[16];
-            this.Bias = new uint[16];
-            this.ZThresh = new uint[16];
-            this.Sharpen = new short[16];
-        }
-
-        public Vp8Matrix(ushort[] q, ushort[] iq, uint[] bias, uint[] zThresh, short[] sharpen)
-        {
-            this.Q = q;
-            this.IQ = iq;
-            this.Bias = bias;
-            this.ZThresh = zThresh;
-            this.Sharpen = sharpen;
-        }
-
-        /// <summary>
-        /// Gets the quantizer steps.
-        /// </summary>
-        public ushort[] Q { get; }
+        public fixed ushort Q[16];
 
         /// <summary>
-        /// Gets the reciprocals, fixed point.
+        /// The reciprocals, fixed point.
         /// </summary>
-        public ushort[] IQ { get; }
+        public fixed ushort IQ[16];
 
         /// <summary>
-        /// Gets the rounding bias.
+        /// The rounding bias.
         /// </summary>
-        public uint[] Bias { get; }
+        public fixed uint Bias[16];
 
         /// <summary>
-        /// Gets the value below which a coefficient is zeroed.
+        /// The value below which a coefficient is zeroed.
         /// </summary>
-        public uint[] ZThresh { get; }
+        public fixed uint ZThresh[16];
 
         /// <summary>
-        /// Gets the frequency boosters for slight sharpening.
+        /// The frequency boosters for slight sharpening.
         /// </summary>
-        public short[] Sharpen { get; }
+        public fixed short Sharpen[16];
 
         /// <summary>
         /// Returns the average quantizer.
@@ -81,7 +60,7 @@ public int Expand(int type)
                 int isAcCoeff = i > 0 ? 1 : 0;
                 int bias = BiasMatrices[type][isAcCoeff];
                 this.IQ[i] = (ushort)((1 << WebpConstants.QFix) / this.Q[i]);
-                this.Bias[i] = (uint)this.BIAS(bias);
+                this.Bias[i] = (uint)BIAS(bias);
 
                 // zthresh is the exact value such that QUANTDIV(coeff, iQ, B) is:
                 //   * zero if coeff <= zthresh
@@ -115,6 +94,6 @@ public int Expand(int type)
             return (sum + 8) >> 4;
         }
 
-        private int BIAS(int b) => b << (WebpConstants.QFix - 8);
+        private static int BIAS(int b) => b << (WebpConstants.QFix - 8);
     }
 }
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs
index cf2a5c1775..71983055c0 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs
@@ -8,19 +8,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
     internal class Vp8SegmentInfo
     {
         /// <summary>
-        /// Gets or sets the quantization matrix y1.
+        /// The quantization matrix y1.
         /// </summary>
-        public Vp8Matrix Y1 { get; set; }
+        public Vp8Matrix Y1;
 
         /// <summary>
-        /// Gets or sets the quantization matrix y2.
+        /// The quantization matrix y2.
         /// </summary>
-        public Vp8Matrix Y2 { get; set; }
+        public Vp8Matrix Y2;
 
         /// <summary>
-        /// Gets or sets the quantization matrix uv.
+        /// The quantization matrix uv.
         /// </summary>
-        public Vp8Matrix Uv { get; set; }
+        public Vp8Matrix Uv;
 
         /// <summary>
         /// Gets or sets the quant-susceptibility, range [-127,127]. Zero is neutral. Lower values indicate a lower risk of blurriness.
diff --git a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
index d0cdfc1ded..7465c42cef 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
@@ -11,22 +11,25 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
     [Trait("Format", "Webp")]
     public class QuantEncTests
     {
-        private static void RunQuantizeBlockTest()
+        private static unsafe void RunQuantizeBlockTest()
         {
             // arrange
             short[] input = { 378, 777, -851, 888, 259, 148, 0, -111, -185, -185, -74, -37, 148, 74, 111, 74 };
             short[] output = new short[16];
             ushort[] q = { 42, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37 };
             ushort[] iq = { 3120, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542 };
-            uint[] bias =
-            {
-                49152, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296,
-                55296, 55296
-            };
+            uint[] bias = { 49152, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296 };
             uint[] zthresh = { 26, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 };
             short[] expectedOutput = { 9, 21, 7, -5, 4, -23, 24, 0, -5, 4, 2, -2, -3, -1, 3, 2 };
             int expectedResult = 1;
-            var vp8Matrix = new Vp8Matrix(q, iq, bias, zthresh, new short[16]);
+            Vp8Matrix vp8Matrix = default;
+            for (int i = 0; i < 16; i++)
+            {
+                vp8Matrix.Q[i] = q[i];
+                vp8Matrix.IQ[i] = iq[i];
+                vp8Matrix.Bias[i] = bias[i];
+                vp8Matrix.ZThresh[i] = zthresh[i];
+            }
 
             // act
             int actualResult = QuantEnc.QuantizeBlock(input, output, vp8Matrix);

From 3c9c1bb23eb63863fcac38ac4478f097d73e1e0f Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 11:21:18 +0100
Subject: [PATCH 62/85] Avoid pinning

---
 .../Formats/Webp/Lossy/LossyUtils.cs          | 48 +++++++++----------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 74448cf528..6de2989bda 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -4,6 +4,7 @@
 using System;
 using System.Buffers.Binary;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
@@ -814,33 +815,28 @@ public static void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp)
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse2.IsSupported)
             {
-#pragma warning disable SA1503 // Braces should not be omitted
                 tmp.Clear();
-                fixed (byte* inputPtr = input)
-                fixed (ushort* tmpPtr = tmp)
-                {
-                    Vector128<byte> a0 = Sse2.LoadVector128(inputPtr);
-                    Vector128<byte> a1 = Sse2.LoadVector128(inputPtr + WebpConstants.Bps);
-                    Vector128<byte> a2 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 2));
-                    Vector128<byte> a3 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 3));
-                    Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte
-                    Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8);
-                    Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8);
-                    Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8);
-                    Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte
-                    Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask);
-                    Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask);
-                    Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask);
-                    Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32());
-                    Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32());
-                    Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32());
-                    Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32());
-                    Vector128<int> e0 = Sse2.Add(d0, d1);
-                    Vector128<int> e1 = Sse2.Add(d2, d3);
-                    Vector128<int> f0 = Sse2.Add(e0, e1);
-                    Sse2.Store(tmpPtr, f0.AsUInt16());
-                }
-#pragma warning restore SA1503 // Braces should not be omitted
+                Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input));
+                Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16)));
+                Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 2, 16)));
+                Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 3, 16)));
+                Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte
+                Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8);
+                Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8);
+                Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8);
+                Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte
+                Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask);
+                Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask);
+                Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask);
+                Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32());
+                Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32());
+                Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32());
+                Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32());
+                Vector128<int> e0 = Sse2.Add(d0, d1);
+                Vector128<int> e1 = Sse2.Add(d2, d3);
+                Vector128<int> f0 = Sse2.Add(e0, e1);
+                ref ushort outputRef = ref MemoryMarshal.GetReference(tmp);
+                Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = f0.AsUInt16();
 
                 dc[0] = (uint)(tmp[1] + tmp[0]);
                 dc[1] = (uint)(tmp[3] + tmp[2]);

From 6e135cbd79f391f56ee69df0da2b8be505631491 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 12:38:41 +0100
Subject: [PATCH 63/85] Avoid pinning

---
 .../Formats/Webp/Lossy/LossyUtils.cs          | 219 +++++++++---------
 1 file changed, 107 insertions(+), 112 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index b8f232a43b..ee224e0b0b 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -4,6 +4,7 @@
 using System;
 using System.Buffers.Binary;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
@@ -614,120 +615,114 @@ public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ush
         {
             Span<int> sum = scratch.Slice(0, 4);
             sum.Clear();
-#pragma warning disable SA1503 // Braces should not be omitted
-            fixed (byte* inputAPtr = inputA)
-            fixed (byte* inputBPtr = inputB)
-            fixed (ushort* wPtr = w)
-            fixed (int* outputPtr = sum)
-            {
-                // Load and combine inputs.
-                Vector128<byte> ina0 = Sse2.LoadVector128(inputAPtr);
-                Vector128<byte> ina1 = Sse2.LoadVector128(inputAPtr + (WebpConstants.Bps * 1));
-                Vector128<byte> ina2 = Sse2.LoadVector128(inputAPtr + (WebpConstants.Bps * 2));
-                Vector128<long> ina3 = Sse2.LoadVector128((long*)(inputAPtr + (WebpConstants.Bps * 3)));
-                Vector128<byte> inb0 = Sse2.LoadVector128(inputBPtr);
-                Vector128<byte> inb1 = Sse2.LoadVector128(inputBPtr + (WebpConstants.Bps * 1));
-                Vector128<byte> inb2 = Sse2.LoadVector128(inputBPtr + (WebpConstants.Bps * 2));
-                Vector128<long> inb3 = Sse2.LoadVector128((long*)(inputBPtr + (WebpConstants.Bps * 3)));
-
-                // Combine inA and inB (we'll do two transforms in parallel).
-                Vector128<int> inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
-                Vector128<int> inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
-                Vector128<int> inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
-                Vector128<int> inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
-                Vector128<short> tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte());
-                Vector128<short> tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte());
-                Vector128<short> tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte());
-                Vector128<short> tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte());
-
-                // a00 a01 a02 a03   b00 b01 b02 b03
-                // a10 a11 a12 a13   b10 b11 b12 b13
-                // a20 a21 a22 a23   b20 b21 b22 b23
-                // a30 a31 a32 a33   b30 b31 b32 b33
-                // Vertical pass first to avoid a transpose (vertical and horizontal passes
-                // are commutative because w/kWeightY is symmetric) and subsequent transpose.
-                // Calculate a and b (two 4x4 at once).
-                Vector128<short> a0 = Sse2.Add(tmp0, tmp2);
-                Vector128<short> a1 = Sse2.Add(tmp1, tmp3);
-                Vector128<short> a2 = Sse2.Subtract(tmp1, tmp3);
-                Vector128<short> a3 = Sse2.Subtract(tmp0, tmp2);
-                Vector128<short> b0 = Sse2.Add(a0, a1);
-                Vector128<short> b1 = Sse2.Add(a3, a2);
-                Vector128<short> b2 = Sse2.Subtract(a3, a2);
-                Vector128<short> b3 = Sse2.Subtract(a0, a1);
-
-                // a00 a01 a02 a03   b00 b01 b02 b03
-                // a10 a11 a12 a13   b10 b11 b12 b13
-                // a20 a21 a22 a23   b20 b21 b22 b23
-                // a30 a31 a32 a33   b30 b31 b32 b33
-                // Transpose the two 4x4.
-                Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
-                Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
-                Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
-                Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);
-
-                // a00 a10 a01 a11   a02 a12 a03 a13
-                // a20 a30 a21 a31   a22 a32 a23 a33
-                // b00 b10 b01 b11   b02 b12 b03 b13
-                // b20 b30 b21 b31   b22 b32 b23 b33
-                Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
-                Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
-                Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
-                Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
-
-                // a00 a10 a20 a30 a01 a11 a21 a31
-                // b00 b10 b20 b30 b01 b11 b21 b31
-                // a02 a12 a22 a32 a03 a13 a23 a33
-                // b02 b12 a22 b32 b03 b13 b23 b33
-                Vector128<long> output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
-                Vector128<long> output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
-                Vector128<long> output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
-                Vector128<long> output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
-
-                // a00 a10 a20 a30   b00 b10 b20 b30
-                // a01 a11 a21 a31   b01 b11 b21 b31
-                // a02 a12 a22 a32   b02 b12 b22 b32
-                // a03 a13 a23 a33   b03 b13 b23 b33
-                // Horizontal pass and difference of weighted sums.
-                Vector128<ushort> w0 = Sse2.LoadVector128(wPtr);
-                Vector128<ushort> w8 = Sse2.LoadVector128(wPtr + 8);
-
-                // Calculate a and b (two 4x4 at once).
-                a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16());
-                a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16());
-                a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16());
-                a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16());
-                b0 = Sse2.Add(a0, a1);
-                b1 = Sse2.Add(a3, a2);
-                b2 = Sse2.Subtract(a3, a2);
-                b3 = Sse2.Subtract(a0, a1);
-
-                // Separate the transforms of inA and inB.
-                Vector128<long> ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64());
-                Vector128<long> ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64());
-                Vector128<long> bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64());
-                Vector128<long> bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64());
-
-                Vector128<ushort> ab0Abs = Ssse3.Abs(ab0.AsInt16());
-                Vector128<ushort> ab2Abs = Ssse3.Abs(ab2.AsInt16());
-                Vector128<ushort> b0Abs = Ssse3.Abs(bb0.AsInt16());
-                Vector128<ushort> bb2Abs = Ssse3.Abs(bb2.AsInt16());
-
-                // weighted sums.
-                Vector128<int> ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16());
-                Vector128<int> ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16());
-                Vector128<int> b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16());
-                Vector128<int> bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16());
-                Vector128<int> ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8);
-                Vector128<int> b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8);
-
-                // difference of weighted sums.
-                Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
-                Sse2.Store(outputPtr, result.AsInt32());
-            }
 
+            // Load and combine inputs.
+            Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
+            Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
+            Vector128<byte> ina2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 2, 16)));
+            Vector128<long> ina3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 3, 16))).AsInt64();
+            Vector128<byte> inb0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB));
+            Vector128<byte> inb1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps, 16)));
+            Vector128<byte> inb2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 2, 16)));
+            Vector128<long> inb3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 3, 16))).AsInt64();
+
+            // Combine inA and inB (we'll do two transforms in parallel).
+            Vector128<int> inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
+            Vector128<int> inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
+            Vector128<int> inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
+            Vector128<int> inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
+            Vector128<short> tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte());
+            Vector128<short> tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte());
+            Vector128<short> tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte());
+            Vector128<short> tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte());
+
+            // a00 a01 a02 a03   b00 b01 b02 b03
+            // a10 a11 a12 a13   b10 b11 b12 b13
+            // a20 a21 a22 a23   b20 b21 b22 b23
+            // a30 a31 a32 a33   b30 b31 b32 b33
+            // Vertical pass first to avoid a transpose (vertical and horizontal passes
+            // are commutative because w/kWeightY is symmetric) and subsequent transpose.
+            // Calculate a and b (two 4x4 at once).
+            Vector128<short> a0 = Sse2.Add(tmp0, tmp2);
+            Vector128<short> a1 = Sse2.Add(tmp1, tmp3);
+            Vector128<short> a2 = Sse2.Subtract(tmp1, tmp3);
+            Vector128<short> a3 = Sse2.Subtract(tmp0, tmp2);
+            Vector128<short> b0 = Sse2.Add(a0, a1);
+            Vector128<short> b1 = Sse2.Add(a3, a2);
+            Vector128<short> b2 = Sse2.Subtract(a3, a2);
+            Vector128<short> b3 = Sse2.Subtract(a0, a1);
+
+            // a00 a01 a02 a03   b00 b01 b02 b03
+            // a10 a11 a12 a13   b10 b11 b12 b13
+            // a20 a21 a22 a23   b20 b21 b22 b23
+            // a30 a31 a32 a33   b30 b31 b32 b33
+            // Transpose the two 4x4.
+            Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
+            Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
+            Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
+            Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);
+
+            // a00 a10 a01 a11   a02 a12 a03 a13
+            // a20 a30 a21 a31   a22 a32 a23 a33
+            // b00 b10 b01 b11   b02 b12 b03 b13
+            // b20 b30 b21 b31   b22 b32 b23 b33
+            Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
+            Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
+            Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
+            Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
+
+            // a00 a10 a20 a30 a01 a11 a21 a31
+            // b00 b10 b20 b30 b01 b11 b21 b31
+            // a02 a12 a22 a32 a03 a13 a23 a33
+            // b02 b12 a22 b32 b03 b13 b23 b33
+            Vector128<long> output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
+            Vector128<long> output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
+            Vector128<long> output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
+            Vector128<long> output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
+
+            // a00 a10 a20 a30   b00 b10 b20 b30
+            // a01 a11 a21 a31   b01 b11 b21 b31
+            // a02 a12 a22 a32   b02 b12 b22 b32
+            // a03 a13 a23 a33   b03 b13 b23 b33
+            // Horizontal pass and difference of weighted sums.
+            Vector128<ushort> w0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w));
+            Vector128<ushort> w8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w.Slice(8, 8)));
+
+            // Calculate a and b (two 4x4 at once).
+            a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16());
+            a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16());
+            a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16());
+            a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16());
+            b0 = Sse2.Add(a0, a1);
+            b1 = Sse2.Add(a3, a2);
+            b2 = Sse2.Subtract(a3, a2);
+            b3 = Sse2.Subtract(a0, a1);
+
+            // Separate the transforms of inA and inB.
+            Vector128<long> ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64());
+            Vector128<long> ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64());
+            Vector128<long> bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64());
+            Vector128<long> bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64());
+
+            Vector128<ushort> ab0Abs = Ssse3.Abs(ab0.AsInt16());
+            Vector128<ushort> ab2Abs = Ssse3.Abs(ab2.AsInt16());
+            Vector128<ushort> b0Abs = Ssse3.Abs(bb0.AsInt16());
+            Vector128<ushort> bb2Abs = Ssse3.Abs(bb2.AsInt16());
+
+            // weighted sums.
+            Vector128<int> ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16());
+            Vector128<int> ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16());
+            Vector128<int> b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16());
+            Vector128<int> bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16());
+            Vector128<int> ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8);
+            Vector128<int> b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8);
+
+            // difference of weighted sums.
+            Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
+
+            ref int outputRef = ref MemoryMarshal.GetReference(sum);
+            Unsafe.As<int, Vector128<int>>(ref outputRef) = result.AsInt32();
             return sum[3] + sum[2] + sum[1] + sum[0];
-#pragma warning restore SA1503 // Braces should not be omitted
         }
 #endif
 

From d6d1868343831184d94482895e5f4d3837e643cf Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 12:40:27 +0100
Subject: [PATCH 64/85] Test Hadamard transform only with and without
 HardwareIntrinsics

---
 tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
index 349a0c8fca..f8b488fde5 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
@@ -45,13 +45,7 @@ private static void RunHadamardTransformTest()
         public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
 
         [Fact]
-        public void HadamardTransform_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableSSE2);
-
-        [Fact]
-        public void HadamardTransform_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableSSE41);
-
-        [Fact]
-        public void HadamardTransform_WithoutSSE2AndSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableSSE41 | HwIntrinsics.DisableSSE2);
+        public void HadamardTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic);
 #endif
 
     }

From 99a3510e279a38a8c7c733d1c29f63fb3772d49d Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 12:53:54 +0100
Subject: [PATCH 65/85] Avoid pinning

---
 .../Formats/Webp/Lossy/LossyUtils.cs          | 72 +++++++++----------
 1 file changed, 34 insertions(+), 38 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 82e2214701..aa35f9673c 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -4,6 +4,7 @@
 using System;
 using System.Buffers.Binary;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Numerics;
 using System.Runtime.Intrinsics;
@@ -27,45 +28,40 @@ public static int Vp8Sse4X4(Span<byte> a, Span<byte> b)
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse2.IsSupported)
             {
-#pragma warning disable SA1503 // Braces should not be omitted
                 Span<int> tmp = stackalloc int[4];
-                fixed (byte* aPtr = a)
-                fixed (byte* bPtr = b)
-                fixed (int* tmpPtr = tmp)
-                {
-                    // Load values.
-                    Vector128<byte> a0 = Sse2.LoadVector128(aPtr);
-                    Vector128<byte> a1 = Sse2.LoadVector128(aPtr + WebpConstants.Bps);
-                    Vector128<byte> a2 = Sse2.LoadVector128(aPtr + (WebpConstants.Bps * 2));
-                    Vector128<byte> a3 = Sse2.LoadVector128(aPtr + (WebpConstants.Bps * 3));
-                    Vector128<byte> b0 = Sse2.LoadVector128(bPtr);
-                    Vector128<byte> b1 = Sse2.LoadVector128(bPtr + WebpConstants.Bps);
-                    Vector128<byte> b2 = Sse2.LoadVector128(bPtr + (WebpConstants.Bps * 2));
-                    Vector128<byte> b3 = Sse2.LoadVector128(bPtr + (WebpConstants.Bps * 3));
-
-                    // Combine pair of lines.
-                    Vector128<int> a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());
-                    Vector128<int> a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32());
-                    Vector128<int> b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32());
-                    Vector128<int> b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32());
-
-                    // Convert to 16b.
-                    Vector128<byte> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero);
-                    Vector128<byte> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero);
-                    Vector128<byte> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero);
-                    Vector128<byte> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero);
-
-                    // subtract, square and accumulate.
-                    Vector128<byte> d0 = Sse2.SubtractSaturate(a01s, b01s);
-                    Vector128<byte> d1 = Sse2.SubtractSaturate(a23s, b23s);
-                    Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0.AsInt16(), d0.AsInt16());
-                    Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1.AsInt16(), d1.AsInt16());
-                    Vector128<int> sum = Sse2.Add(e0, e1);
-
-                    Sse2.Store(tmpPtr, sum);
-                    return tmp[3] + tmp[2] + tmp[1] + tmp[0];
-                }
-#pragma warning restore SA1503 // Braces should not be omitted
+
+                // Load values.
+                Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a));
+                Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a.Slice(WebpConstants.Bps, 8)));
+                Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a.Slice(WebpConstants.Bps * 2, 8)));
+                Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a.Slice(WebpConstants.Bps * 3, 8)));
+                Vector128<byte> b0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(b));
+                Vector128<byte> b1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(b.Slice(WebpConstants.Bps, 8)));
+                Vector128<byte> b2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(b.Slice(WebpConstants.Bps * 2, 8)));
+                Vector128<byte> b3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(b.Slice(WebpConstants.Bps * 3, 8)));
+
+                // Combine pair of lines.
+                Vector128<int> a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());
+                Vector128<int> a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32());
+                Vector128<int> b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32());
+                Vector128<int> b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32());
+
+                // Convert to 16b.
+                Vector128<byte> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero);
+
+                // subtract, square and accumulate.
+                Vector128<byte> d0 = Sse2.SubtractSaturate(a01s, b01s);
+                Vector128<byte> d1 = Sse2.SubtractSaturate(a23s, b23s);
+                Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0.AsInt16(), d0.AsInt16());
+                Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1.AsInt16(), d1.AsInt16());
+                Vector128<int> sum = Sse2.Add(e0, e1);
+
+                ref int outputRef = ref MemoryMarshal.GetReference(tmp);
+                Unsafe.As<int, Vector128<int>>(ref outputRef) = sum;
+                return tmp[3] + tmp[2] + tmp[1] + tmp[0];
             }
             else
 #endif

From 42c2cf7a799af7c5a6b504ec6233fc6a7308c030 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 13:40:40 +0100
Subject: [PATCH 66/85] Disable SA1401 in file: Fields should be private

---
 src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs
index 71983055c0..2ce383d9e1 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs
@@ -10,6 +10,7 @@ internal class Vp8SegmentInfo
         /// <summary>
         /// Gets the quantization matrix y1.
         /// </summary>
+#pragma warning disable SA1401 // Fields should be private
         public Vp8Matrix Y1;
 
         /// <summary>
@@ -21,6 +22,7 @@ internal class Vp8SegmentInfo
         /// Gets the quantization matrix uv.
         /// </summary>
         public Vp8Matrix Uv;
+#pragma warning restore SA1401 // Fields should be private
 
         /// <summary>
         /// Gets or sets the quant-susceptibility, range [-127,127]. Zero is neutral. Lower values indicate a lower risk of blurriness.

From 8160a0eeb6a7bb5e8dc65ca1827a754d5a0e1e81 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 13:40:54 +0100
Subject: [PATCH 67/85] Pass Vp8Matrix as ref

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 34 +++++++++----------
 .../Formats/WebP/QuantEncTests.cs             |  2 +-
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 4c3a2ff5e3..97ef27d259 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -315,14 +315,14 @@ public static int ReconstructIntra16(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8M
             }
 
             Vp8Encoding.FTransformWht(tmp, dcTmp, scratch);
-            nz |= QuantizeBlock(dcTmp, rd.YDcLevels, dqm.Y2) << 24;
+            nz |= QuantizeBlock(dcTmp, rd.YDcLevels, ref dqm.Y2) << 24;
 
             for (n = 0; n < 16; n += 2)
             {
                 // Zero-out the first coeff, so that: a) nz is correct below, and
                 // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
                 tmp[n * 16] = tmp[(n + 1) * 16] = 0;
-                nz |= Quantize2Blocks(tmp.Slice(n * 16, 32), rd.YAcLevels.AsSpan(n * 16, 32), dqm.Y1) << n;
+                nz |= Quantize2Blocks(tmp.Slice(n * 16, 32), rd.YAcLevels.AsSpan(n * 16, 32), ref dqm.Y1) << n;
             }
 
             // Transform back.
@@ -343,7 +343,7 @@ public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span<
             tmp.Clear();
             scratch.Clear();
             Vp8Encoding.FTransform(src, reference, tmp, scratch);
-            int nz = QuantizeBlock(tmp, levels, dqm.Y1);
+            int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);
             Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
 
             return nz;
@@ -370,11 +370,11 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc
                     scratch);
             }
 
-            CorrectDcValues(it, dqm.Uv, tmp, rd);
+            CorrectDcValues(it, ref dqm.Uv, tmp, rd);
 
             for (n = 0; n < 8; n += 2)
             {
-                nz |= Quantize2Blocks(tmp.Slice(n * 16, 32), rd.UvLevels.AsSpan(n * 16, 32), dqm.Uv) << n;
+                nz |= Quantize2Blocks(tmp.Slice(n * 16, 32), rd.UvLevels.AsSpan(n * 16, 32), ref dqm.Uv) << n;
             }
 
             for (n = 0; n < 8; n += 2)
@@ -525,19 +525,18 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static int Quantize2Blocks(Span<short> input, Span<short> output, Vp8Matrix mtx)
+        public static int Quantize2Blocks(Span<short> input, Span<short> output, ref Vp8Matrix mtx)
         {
-            int nz = QuantizeBlock(input.Slice(0, 16), output.Slice(0, 16), mtx) << 0;
-            nz |= QuantizeBlock(input.Slice(1 * 16, 16), output.Slice(1 * 16, 16), mtx) << 1;
+            int nz = QuantizeBlock(input.Slice(0, 16), output.Slice(0, 16), ref mtx) << 0;
+            nz |= QuantizeBlock(input.Slice(1 * 16, 16), output.Slice(1 * 16, 16), ref mtx) << 1;
             return nz;
         }
 
-        public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix mtx)
+        public static int QuantizeBlock(Span<short> input, Span<short> output, ref Vp8Matrix mtx)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse41.IsSupported)
             {
-#pragma warning disable SA1503 // Braces should not be omitted
                 // Load all inputs.
                 Vector128<short> input0 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input));
                 Vector128<short> input8 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input.Slice(8, 8)));
@@ -624,10 +623,9 @@ public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix
 
                 Vector128<sbyte> packedOutput = Sse2.PackSignedSaturate(outZ0.AsInt16(), outZ8.AsInt16());
 
-                // Detect if all 'out' values are zeroes or not.
+                // Detect if all 'out' values are zeros or not.
                 Vector128<sbyte> cmpeq = Sse2.CompareEqual(packedOutput, Vector128<sbyte>.Zero);
                 return Sse2.MoveMask(cmpeq) != 0xffff ? 1 : 0;
-#pragma warning restore SA1503 // Braces should not be omitted
             }
             else
 #endif
@@ -675,7 +673,7 @@ public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix
 
         // Quantize as usual, but also compute and return the quantization error.
         // Error is already divided by DSHIFT.
-        public static int QuantizeSingle(Span<short> v, Vp8Matrix mtx)
+        public static int QuantizeSingle(Span<short> v, ref Vp8Matrix mtx)
         {
             int v0 = v[0];
             bool sign = v0 < 0;
@@ -696,7 +694,7 @@ public static int QuantizeSingle(Span<short> v, Vp8Matrix mtx)
             return (sign ? -v0 : v0) >> DSCALE;
         }
 
-        public static void CorrectDcValues(Vp8EncIterator it, Vp8Matrix mtx, Span<short> tmp, Vp8ModeScore rd)
+        public static void CorrectDcValues(Vp8EncIterator it, ref Vp8Matrix mtx, Span<short> tmp, Vp8ModeScore rd)
         {
 #pragma warning disable SA1005 // Single line comments should begin with single space
             //         | top[0] | top[1]
@@ -713,13 +711,13 @@ public static void CorrectDcValues(Vp8EncIterator it, Vp8Matrix mtx, Span<short>
                 Span<sbyte> left = it.LeftDerr.AsSpan(ch, 2);
                 Span<short> c = tmp.Slice(ch * 4 * 16, 4 * 16);
                 c[0] += (short)(((C1 * top[0]) + (C2 * left[0])) >> (DSHIFT - DSCALE));
-                int err0 = QuantizeSingle(c, mtx);
+                int err0 = QuantizeSingle(c, ref mtx);
                 c[1 * 16] += (short)(((C1 * top[1]) + (C2 * err0)) >> (DSHIFT - DSCALE));
-                int err1 = QuantizeSingle(c.Slice(1 * 16), mtx);
+                int err1 = QuantizeSingle(c.Slice(1 * 16), ref mtx);
                 c[2 * 16] += (short)(((C1 * err0) + (C2 * left[1])) >> (DSHIFT - DSCALE));
-                int err2 = QuantizeSingle(c.Slice(2 * 16), mtx);
+                int err2 = QuantizeSingle(c.Slice(2 * 16), ref mtx);
                 c[3 * 16] += (short)(((C1 * err1) + (C2 * err2)) >> (DSHIFT - DSCALE));
-                int err3 = QuantizeSingle(c.Slice(3 * 16), mtx);
+                int err3 = QuantizeSingle(c.Slice(3 * 16), ref mtx);
 
                 rd.Derr[ch, 0] = err1;
                 rd.Derr[ch, 1] = err2;
diff --git a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
index 7465c42cef..55738199b7 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
@@ -32,7 +32,7 @@ private static unsafe void RunQuantizeBlockTest()
             }
 
             // act
-            int actualResult = QuantEnc.QuantizeBlock(input, output, vp8Matrix);
+            int actualResult = QuantEnc.QuantizeBlock(input, output, ref vp8Matrix);
 
             // assert
             Assert.True(output.SequenceEqual(expectedOutput));

From 1418e53bfbb719c36d57f4ac46317ca990d2fba2 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 14:58:31 +0100
Subject: [PATCH 68/85] Remove unneeded Clear of tmp buffer

---
 src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 408f6f066f..7c262a30ee 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -947,7 +947,6 @@ public static void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp)
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse2.IsSupported)
             {
-                tmp.Clear();
                 Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input));
                 Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16)));
                 Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 2, 16)));

From 3cfa040b2099a5c91c8b1e15e5f2fd4c440a6f77 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 15:38:20 +0100
Subject: [PATCH 69/85] Use Ssse3.HorizontalAdd

---
 src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs    | 14 +++++++-------
 .../Formats/Webp/Lossy/Vp8EncIterator.cs           |  2 +-
 .../Formats/WebP/LossyUtilsTests.cs                |  5 ++---
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 7c262a30ee..5b27af821d 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -942,7 +942,7 @@ public static void HFilter8i(Span<byte> u, Span<byte> v, int offset, int stride,
             FilterLoop24(v, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh);
         }
 
-        public static void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp)
+        public static void Mean16x4(Span<byte> input, Span<uint> dc)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse2.IsSupported)
@@ -966,13 +966,13 @@ public static void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp)
                 Vector128<int> e0 = Sse2.Add(d0, d1);
                 Vector128<int> e1 = Sse2.Add(d2, d3);
                 Vector128<int> f0 = Sse2.Add(e0, e1);
-                ref ushort outputRef = ref MemoryMarshal.GetReference(tmp);
-                Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = f0.AsUInt16();
+                Vector128<short> hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16());
+                Vector64<short> lower = hadd.GetLower();
 
-                dc[0] = (uint)(tmp[1] + tmp[0]);
-                dc[1] = (uint)(tmp[3] + tmp[2]);
-                dc[2] = (uint)(tmp[5] + tmp[4]);
-                dc[3] = (uint)(tmp[7] + tmp[6]);
+                dc[0] = (uint)lower.GetElement(0);
+                dc[1] = (uint)lower.GetElement(1);
+                dc[2] = (uint)lower.GetElement(2);
+                dc[3] = (uint)lower.GetElement(3);
             }
             else
 #endif
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
index 57e18832ed..6279aef656 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
@@ -363,7 +363,7 @@ public int FastMbAnalyze(int quality)
             uint m2;
             for (k = 0; k < 16; k += 4)
             {
-                LossyUtils.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4), tmp);
+                LossyUtils.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4));
             }
 
             for (m = 0, m2 = 0, k = 0; k < 16; k++)
diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
index 16b8e11660..09727293ce 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
@@ -25,11 +25,10 @@ private static void RunMean16x4Test()
                 173, 175, 166, 155, 155, 159, 159, 158
             };
             uint[] dc = new uint[4];
-            ushort[] tmp = new ushort[8];
             uint[] expectedDc = { 1940, 2139, 2252, 1813 };
 
             // act
-            LossyUtils.Mean16x4(input, dc, tmp);
+            LossyUtils.Mean16x4(input, dc);
 
             // assert
             Assert.True(dc.SequenceEqual(expectedDc));
@@ -73,7 +72,7 @@ private static void RunHadamardTransformTest()
         public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);
 
         [Fact]
-        public void Mean16x4_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableSSE2);
+        public void Mean16x4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic);
 
         [Fact]
         public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);

From 84732bf14722ef50e01f1fd21c6c86e61a77eae2 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 15:39:16 +0100
Subject: [PATCH 70/85] Reverse access to bgr

---
 src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
index 24143785ab..a9cf876c80 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
@@ -307,9 +307,9 @@ public static uint LoadUv(byte u, byte v) =>
         [MethodImpl(InliningOptions.ShortMethod)]
         public static void YuvToBgr(int y, int u, int v, Span<byte> bgr)
         {
-            bgr[0] = (byte)YuvToB(y, u);
-            bgr[1] = (byte)YuvToG(y, u, v);
             bgr[2] = (byte)YuvToR(y, v);
+            bgr[1] = (byte)YuvToG(y, u, v);
+            bgr[0] = (byte)YuvToB(y, u);
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]

From 50013d70f28c2d67e1a7e96e61174460e67fbc7f Mon Sep 17 00:00:00 2001
From: Brian Popow <38701097+brianpopow@users.noreply.github.com>
Date: Tue, 9 Nov 2021 15:51:02 +0100
Subject: [PATCH 71/85] Update src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

Reverse access to dc

Co-authored-by: James Jackson-South <james_south@hotmail.com>
---
 src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 5b27af821d..e6a4e61701 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -969,10 +969,10 @@ public static void Mean16x4(Span<byte> input, Span<uint> dc)
                 Vector128<short> hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16());
                 Vector64<short> lower = hadd.GetLower();
 
-                dc[0] = (uint)lower.GetElement(0);
-                dc[1] = (uint)lower.GetElement(1);
-                dc[2] = (uint)lower.GetElement(2);
                 dc[3] = (uint)lower.GetElement(3);
+                dc[2] = (uint)lower.GetElement(2);
+                dc[1] = (uint)lower.GetElement(1);
+                dc[0] = (uint)lower.GetElement(0);
             }
             else
 #endif

From f0cb89e811be0fefc6a5a4d2f76797e7a2d8822c Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 16:36:42 +0100
Subject: [PATCH 72/85] Change IsSupported check from SSE2 to Ssse3

---
 src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index e6a4e61701..4ef9c56947 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -945,7 +945,7 @@ public static void HFilter8i(Span<byte> u, Span<byte> v, int offset, int stride,
         public static void Mean16x4(Span<byte> input, Span<uint> dc)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
-            if (Sse2.IsSupported)
+            if (Ssse3.IsSupported)
             {
                 Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input));
                 Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16)));

From 1452ba00836cca274719844100259606750d56b7 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 16:40:55 +0100
Subject: [PATCH 73/85] Remove unneeded GetLower

---
 src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 4ef9c56947..ac3b1d3806 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -967,12 +967,11 @@ public static void Mean16x4(Span<byte> input, Span<uint> dc)
                 Vector128<int> e1 = Sse2.Add(d2, d3);
                 Vector128<int> f0 = Sse2.Add(e0, e1);
                 Vector128<short> hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16());
-                Vector64<short> lower = hadd.GetLower();
 
-                dc[3] = (uint)lower.GetElement(3);
-                dc[2] = (uint)lower.GetElement(2);
-                dc[1] = (uint)lower.GetElement(1);
-                dc[0] = (uint)lower.GetElement(0);
+                dc[3] = (uint)hadd.GetElement(3);
+                dc[2] = (uint)hadd.GetElement(2);
+                dc[1] = (uint)hadd.GetElement(1);
+                dc[0] = (uint)hadd.GetElement(0);
             }
             else
 #endif

From de3140bbc29f4914425564538c849731b531dbeb Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 16:58:48 +0100
Subject: [PATCH 74/85] Use Numerics.ReduceSum(sum)

---
 src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index c1af2a4534..5b7d4d8981 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -27,8 +27,6 @@ public static int Vp8Sse4X4(Span<byte> a, Span<byte> b)
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse2.IsSupported)
             {
-                Span<int> tmp = stackalloc int[4];
-
                 // Load values.
                 Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a));
                 Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a.Slice(WebpConstants.Bps, 8)));
@@ -58,9 +56,7 @@ public static int Vp8Sse4X4(Span<byte> a, Span<byte> b)
                 Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1.AsInt16(), d1.AsInt16());
                 Vector128<int> sum = Sse2.Add(e0, e1);
 
-                ref int outputRef = ref MemoryMarshal.GetReference(tmp);
-                Unsafe.As<int, Vector128<int>>(ref outputRef) = sum;
-                return tmp[3] + tmp[2] + tmp[1] + tmp[0];
+                return Numerics.ReduceSum(sum);
             }
             else
 #endif
@@ -658,9 +654,6 @@ public static int TTransform(Span<byte> input, Span<ushort> w, Span<int> scratch
         /// </summary>
         public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w, Span<int> scratch)
         {
-            Span<int> sum = scratch.Slice(0, 4);
-            sum.Clear();
-
             // Load and combine inputs.
             Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
             Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
@@ -765,9 +758,7 @@ public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ush
             // difference of weighted sums.
             Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
 
-            ref int outputRef = ref MemoryMarshal.GetReference(sum);
-            Unsafe.As<int, Vector128<int>>(ref outputRef) = result.AsInt32();
-            return sum[3] + sum[2] + sum[1] + sum[0];
+            return Numerics.ReduceSum(result);
         }
 #endif
 

From 80a826f506ae94372b488c099969abd95dc6d16e Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 17:28:30 +0100
Subject: [PATCH 75/85] Remove unneeded Clear

---
 src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 5b7d4d8981..febca037b5 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -13,7 +13,7 @@
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
-    internal static unsafe class LossyUtils
+    internal static class LossyUtils
     {
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
@@ -771,7 +771,6 @@ public static void TransformTwo(Span<short> src, Span<byte> dst, Span<int> scrat
         public static void TransformOne(Span<short> src, Span<byte> dst, Span<int> scratch)
         {
             Span<int> tmp = scratch.Slice(0, 16);
-            tmp.Clear();
             int tmpOffset = 0;
             for (int srcOffset = 0; srcOffset < 4; srcOffset++)
             {

From 5abd7740e81d8d54bd24db235c3f90e1e5d02803 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 18:05:14 +0100
Subject: [PATCH 76/85] Add Vp8Sse4X4 sse tests

---
 .../Formats/WebP/LossyUtilsTests.cs           | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
index f8b488fde5..15b312835d 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
@@ -10,6 +10,35 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
     [Trait("Format", "Webp")]
     public class LossyUtilsTests
     {
+        private static void RunVp8Sse4X4Test()
+        {
+            byte[] a =
+            {
+                27, 27, 28, 29, 29, 28, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129,
+                129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 29, 29, 28,
+                28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 26,
+                26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128,
+                128, 128, 128, 128, 128, 128, 128, 28, 27, 27, 26, 26, 27, 27, 28, 27, 28, 28, 29, 29, 28, 28, 27,
+                129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128
+            };
+
+            byte[] b =
+            {
+                26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204,
+                204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+                28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26,
+                26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+                204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+                204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204
+            };
+
+            int expected = 27;
+
+            int actual = LossyUtils.Vp8Sse4X4(a, b);
+
+            Assert.Equal(expected, actual);
+        }
+
         private static void RunHadamardTransformTest()
         {
             byte[] a =
@@ -37,10 +66,19 @@ private static void RunHadamardTransformTest()
             Assert.Equal(expected, actual);
         }
 
+        [Fact]
+        public void Vp8Sse4X4_Works() => RunVp8Sse4X4Test();
+
         [Fact]
         public void HadamardTransform_Works() => RunHadamardTransformTest();
 
 #if SUPPORTS_RUNTIME_INTRINSICS
+        [Fact]
+        public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic);
+
         [Fact]
         public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
 

From 5ead84416dfc37e7fa41a36a9d58e15ac85d4232 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 18:56:30 +0100
Subject: [PATCH 77/85] Use Array.Clear to reset the arrays

---
 .../Formats/Webp/Lossy/Vp8ModeScore.cs          | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs
index 1c92a9d2d9..69841b557e 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs
@@ -97,18 +97,11 @@ public Vp8ModeScore()
 
         public void Clear()
         {
-            this.YDcLevels.AsSpan().Clear();
-            this.YAcLevels.AsSpan().Clear();
-            this.UvLevels.AsSpan().Clear();
-            this.ModesI4.AsSpan().Clear();
-
-            for (int i = 0; i < 2; i++)
-            {
-                for (int j = 0; j < 3; j++)
-                {
-                    this.Derr[i, j] = 0;
-                }
-            }
+            Array.Clear(this.YDcLevels, 0, this.YDcLevels.Length);
+            Array.Clear(this.YAcLevels, 0, this.YAcLevels.Length);
+            Array.Clear(this.UvLevels, 0, this.UvLevels.Length);
+            Array.Clear(this.ModesI4, 0, this.ModesI4.Length);
+            Array.Clear(this.Derr, 0, this.Derr.Length);
         }
 
         public void InitScore()

From 7d8225b59a633b08b51e74bbb960d4d52b420a84 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 19:38:12 +0100
Subject: [PATCH 78/85] Use UnpackLow to set the dc values

---
 src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index ac3b1d3806..3064ccc030 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -13,7 +13,7 @@
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
-    internal static unsafe class LossyUtils
+    internal static class LossyUtils
     {
 #if SUPPORTS_RUNTIME_INTRINSICS
         private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
@@ -967,11 +967,10 @@ public static void Mean16x4(Span<byte> input, Span<uint> dc)
                 Vector128<int> e1 = Sse2.Add(d2, d3);
                 Vector128<int> f0 = Sse2.Add(e0, e1);
                 Vector128<short> hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16());
+                Vector128<uint> wide = Sse2.UnpackLow(hadd, Vector128<short>.Zero).AsUInt32();
 
-                dc[3] = (uint)hadd.GetElement(3);
-                dc[2] = (uint)hadd.GetElement(2);
-                dc[1] = (uint)hadd.GetElement(1);
-                dc[0] = (uint)hadd.GetElement(0);
+                ref uint outputRef = ref MemoryMarshal.GetReference(dc);
+                Unsafe.As<uint, Vector128<uint>>(ref outputRef) = wide;
             }
             else
 #endif

From 7312b1a8389c1824409205a5bbfd4ad14224d9c3 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 19:43:49 +0100
Subject: [PATCH 79/85] Don't use slice

---
 .../Formats/Webp/Lossy/LossyUtils.cs           | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index febca037b5..19a71c3e56 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -28,14 +28,16 @@ public static int Vp8Sse4X4(Span<byte> a, Span<byte> b)
             if (Sse2.IsSupported)
             {
                 // Load values.
-                Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a));
-                Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a.Slice(WebpConstants.Bps, 8)));
-                Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a.Slice(WebpConstants.Bps * 2, 8)));
-                Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a.Slice(WebpConstants.Bps * 3, 8)));
-                Vector128<byte> b0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(b));
-                Vector128<byte> b1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(b.Slice(WebpConstants.Bps, 8)));
-                Vector128<byte> b2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(b.Slice(WebpConstants.Bps * 2, 8)));
-                Vector128<byte> b3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(b.Slice(WebpConstants.Bps * 3, 8)));
+                ref byte aRef = ref MemoryMarshal.GetReference(a);
+                Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref aRef);
+                Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps));
+                Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2));
+                Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3));
+                ref byte bRef = ref MemoryMarshal.GetReference(b);
+                Vector128<byte> b0 = Unsafe.As<byte, Vector128<byte>>(ref bRef);
+                Vector128<byte> b1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps));
+                Vector128<byte> b2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 2));
+                Vector128<byte> b3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3));
 
                 // Combine pair of lines.
                 Vector128<int> a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());

From 3dd7c8ea41709173759b02eff4c51268eb2c9f33 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 20:55:23 +0100
Subject: [PATCH 80/85] Remove unnecessary Clear() and scratch buffer

---
 src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs   | 4 ++--
 src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 19a71c3e56..cb839559fa 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -127,7 +127,7 @@ public static int Vp8Disto4X4(Span<byte> a, Span<byte> b, Span<ushort> w, Span<i
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse41.IsSupported)
             {
-                int diffSum = TTransformSse41(a, b, w, scratch);
+                int diffSum = TTransformSse41(a, b, w);
                 return Math.Abs(diffSum) >> 5;
             }
             else
@@ -654,7 +654,7 @@ public static int TTransform(Span<byte> input, Span<ushort> w, Span<int> scratch
         /// Returns the weighted sum of the absolute value of transformed coefficients.
         /// w[] contains a row-major 4 by 4 symmetric matrix.
         /// </summary>
-        public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w, Span<int> scratch)
+        public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w)
         {
             // Load and combine inputs.
             Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
index 7192fa2d05..6e724e4758 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
@@ -49,7 +49,6 @@ public void CollectHistogram(Span<byte> reference, Span<byte> pred, int startBlo
             this.distribution.AsSpan().Clear();
             for (j = startBlock; j < endBlock; j++)
             {
-                this.output.AsSpan().Clear();
                 this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output);
 
                 // Convert coefficients to bin.

From 5630b25733e98b004b6a0bfe8996cbac47b6c304 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 21:58:52 +0100
Subject: [PATCH 81/85] Remove more unnecessary Clear calls

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs    | 4 ----
 src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs | 3 ---
 2 files changed, 7 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 97ef27d259..d0baa260cc 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -340,8 +340,6 @@ public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span<
             Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
             Span<short> tmp = it.Scratch2.AsSpan(0, 16);
             Span<int> scratch = it.Scratch3.AsSpan(0, 16);
-            tmp.Clear();
-            scratch.Clear();
             Vp8Encoding.FTransform(src, reference, tmp, scratch);
             int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);
             Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
@@ -357,8 +355,6 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc
             int n;
             Span<short> tmp = it.Scratch2.AsSpan(0, 8 * 16);
             Span<int> scratch = it.Scratch3.AsSpan(0, 16);
-            tmp.Clear();
-            scratch.Clear();
 
             for (n = 0; n < 8; n += 2)
             {
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index 0567a0f27d..af7e8eaa36 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -81,7 +81,6 @@ public static void ITransformOne(Span<byte> reference, Span<short> input, Span<b
         {
             int i;
             Span<int> tmp = scratch.Slice(0, 16);
-            tmp.Clear();
             for (i = 0; i < 4; i++)
             {
                 // vertical pass.
@@ -124,7 +123,6 @@ public static void FTransform(Span<byte> src, Span<byte> reference, Span<short>
         {
             int i;
             Span<int> tmp = scratch.Slice(0, 16);
-            tmp.Clear();
 
             int srcIdx = 0;
             int refIdx = 0;
@@ -163,7 +161,6 @@ public static void FTransform(Span<byte> src, Span<byte> reference, Span<short>
         public static void FTransformWht(Span<short> input, Span<short> output, Span<int> scratch)
         {
             Span<int> tmp = scratch.Slice(0, 16);
-            tmp.Clear();
 
             int i;
             int inputIdx = 0;

From 7e20c5daaadefdd3c1073088bc74f1adf0d3436b Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Wed, 10 Nov 2021 12:10:46 +0100
Subject: [PATCH 82/85] Rename Vp8Sse methods

---
 src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs     | 13 ++++++++-----
 .../Formats/WebP/LossyUtilsTests.cs                 |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index d019b5cd54..a10ec6eabb 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -19,14 +19,17 @@ internal static class LossyUtils
         private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
 #endif
 
+        // Note: method name in libwebp reference implementation is called VP8SSE16x16.
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
+        public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 16);
 
+        // Note: method name in libwebp reference implementation is called VP8SSE16x8.
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static int Vp8Sse16X8(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 8);
+        public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 8);
 
+        // Note: method name in libwebp reference implementation is called VP8SSE4x4.
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static int Vp8Sse4X4(Span<byte> a, Span<byte> b)
+        public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse2.IsSupported)
@@ -67,12 +70,12 @@ public static int Vp8Sse4X4(Span<byte> a, Span<byte> b)
             else
 #endif
             {
-                return GetSse(a, b, 4, 4);
+                return Vp8_SseNxN(a, b, 4, 4);
             }
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static int GetSse(Span<byte> a, Span<byte> b, int w, int h)
+        public static int Vp8_SseNxN(Span<byte> a, Span<byte> b, int w, int h)
         {
             int count = 0;
             int aOffset = 0;
diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
index 9d7545c321..d176a5933d 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
@@ -35,7 +35,7 @@ private static void RunVp8Sse4X4Test()
 
             int expected = 27;
 
-            int actual = LossyUtils.Vp8Sse4X4(a, b);
+            int actual = LossyUtils.Vp8_Sse4X4(a, b);
 
             Assert.Equal(expected, actual);
 		}

From 1997d595e7d496c031e861b8f094a3ba05f94fd0 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Wed, 10 Nov 2021 12:14:08 +0100
Subject: [PATCH 83/85] Fix build error due to renaming

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index d0baa260cc..38ed80590d 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -66,7 +66,7 @@ public static void PickBestIntra16(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Se
                 rdCur.Nz = (uint)ReconstructIntra16(it, dqm, rdCur, tmpDst, mode);
 
                 // Measure RD-score.
-                rdCur.D = LossyUtils.Vp8Sse16X16(src, tmpDst);
+                rdCur.D = LossyUtils.Vp8_Sse16X16(src, tmpDst);
                 rdCur.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto16X16(src, tmpDst, WeightY, scratch)) : 0;
                 rdCur.H = WebpConstants.Vp8FixedCostsI16[mode];
                 rdCur.R = it.GetCostLuma16(rdCur, proba, res);
@@ -160,7 +160,7 @@ public static bool PickBestIntra4(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Seg
                     rdTmp.Nz = (uint)ReconstructIntra4(it, dqm, tmpLevels, src, tmpDst, mode);
 
                     // Compute RD-score.
-                    rdTmp.D = LossyUtils.Vp8Sse4X4(src, tmpDst);
+                    rdTmp.D = LossyUtils.Vp8_Sse4X4(src, tmpDst);
                     rdTmp.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto4X4(src, tmpDst, WeightY, scratch)) : 0;
                     rdTmp.H = modeCosts[mode];
 
@@ -251,7 +251,7 @@ public static void PickBestUv(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Segment
                 rdUv.Nz = (uint)ReconstructUv(it, dqm, rdUv, tmpDst, mode);
 
                 // Compute RD-score
-                rdUv.D = LossyUtils.Vp8Sse16X8(src, tmpDst);
+                rdUv.D = LossyUtils.Vp8_Sse16X8(src, tmpDst);
                 rdUv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
                 rdUv.H = WebpConstants.Vp8FixedCostsUv[mode];
                 rdUv.R = it.GetCostUv(rdUv, proba, res);
@@ -407,7 +407,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
                 for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
                 {
                     Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I16ModeOffsets[mode]);
-                    long score = (LossyUtils.Vp8Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16);
+                    long score = (LossyUtils.Vp8_Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16);
 
                     if (mode > 0 && WebpConstants.Vp8FixedCostsI16[mode] > bitLimit)
                     {
@@ -454,7 +454,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
                     for (mode = 0; mode < WebpConstants.NumBModes; ++mode)
                     {
                         Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
-                        long score = (LossyUtils.Vp8Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4);
+                        long score = (LossyUtils.Vp8_Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4);
                         if (score < bestI4Score)
                         {
                             bestI4Mode = mode;
@@ -503,7 +503,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
                 for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
                 {
                     Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8UvModeOffsets[mode]);
-                    long score = (LossyUtils.Vp8Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv);
+                    long score = (LossyUtils.Vp8_Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv);
                     if (score < bestUvScore)
                     {
                         bestMode = mode;

From 55040a094b97a2941a6de5452b93d407a1af7f89 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Thu, 11 Nov 2021 12:52:07 +1100
Subject: [PATCH 84/85] Update codecov and config

---
 ImageSharp.sln        |  5 +++--
 codecov.yml           | 11 +++++++++++
 shared-infrastructure |  2 +-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/ImageSharp.sln b/ImageSharp.sln
index c188d93150..f16f98ac59 100644
--- a/ImageSharp.sln
+++ b/ImageSharp.sln
@@ -1,7 +1,7 @@
 ﻿
 Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Version 16
-VisualStudioVersion = 16.0.28902.138
+# Visual Studio Version 17
+VisualStudioVersion = 17.0.31903.59
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "_root", "_root", "{C317F1B1-D75E-4C6D-83EB-80367343E0D7}"
 	ProjectSection(SolutionItems) = preProject
@@ -13,6 +13,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "_root", "_root", "{C317F1B1
 		ci-build.ps1 = ci-build.ps1
 		ci-pack.ps1 = ci-pack.ps1
 		ci-test.ps1 = ci-test.ps1
+		codecov.yml = codecov.yml
 		Directory.Build.props = Directory.Build.props
 		Directory.Build.targets = Directory.Build.targets
 		LICENSE = LICENSE
diff --git a/codecov.yml b/codecov.yml
index 833fc0a51a..310eefb8c2 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -9,3 +9,14 @@ codecov:
     # Avoid Report Expired
     # https://docs.codecov.io/docs/codecov-yaml#section-expired-reports
     max_report_age: off
+
+coverage:
+  # Use integer precision
+  # https://docs.codecov.com/docs/codecovyml-reference#coverageprecision
+  precision: 0
+
+  # Explicitly control coverage status checks
+  # https://docs.codecov.com/docs/commit-status#disabling-a-status
+  status:
+    project: on
+    patch: off
diff --git a/shared-infrastructure b/shared-infrastructure
index a042aba176..ac1f5ee0ca 160000
--- a/shared-infrastructure
+++ b/shared-infrastructure
@@ -1 +1 @@
-Subproject commit a042aba176cdb840d800c6ed4cfe41a54fb7b1e3
+Subproject commit ac1f5ee0ca70c070ecdda8771198a052623ac247

From 3ac8b2b713f97d2840e022dd4eebfc5ba2738cf9 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Thu, 11 Nov 2021 13:22:12 +1100
Subject: [PATCH 85/85] Use shared config

---
 shared-infrastructure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shared-infrastructure b/shared-infrastructure
index ac1f5ee0ca..33cb12ca77 160000
--- a/shared-infrastructure
+++ b/shared-infrastructure
@@ -1 +1 @@
-Subproject commit ac1f5ee0ca70c070ecdda8771198a052623ac247
+Subproject commit 33cb12ca77f919b44de56f344d2627cc2a108c3a