From cd1b77a88f3418ac5a3f6edcc5e0572746e56438 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 14 Aug 2024 23:19:01 +1000 Subject: [PATCH 1/8] Reimplement @Sergio0694 work. --- src/ImageSharp/Common/Helpers/Numerics.cs | 47 ++++++++ .../Common/Helpers/Vector128Utilities.cs | 38 +++++++ .../Common/Helpers/Vector256Utilities.cs | 38 +++++++ .../Transforms/Resize/ResizeKernel.cs | 107 +++++++++--------- .../ResizeKernelMap.PeriodicKernelMap.cs | 2 +- .../Transforms/Resize/ResizeKernelMap.cs | 74 ++++++------ ...ResizeKernelMapTests.ReferenceKernelMap.cs | 36 +++--- .../Transforms/ResizeKernelMapTests.cs | 26 ++++- 8 files changed, 257 insertions(+), 111 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index ca14ae4c38..ced2be2e0c 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -1097,4 +1097,51 @@ public static nuint Vector512Count(this Span span) public static nuint Vector512Count(int length) where TVector : struct => (uint)length / (uint)Vector512.Count; + + /// + /// Normalizes the values in a given . + /// + /// The sequence of values to normalize. + /// The sum of the values in . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Normalize(Span span, float sum) + { + if (Vector256.IsHardwareAccelerated) + { + ref float startRef = ref MemoryMarshal.GetReference(span); + ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~7); + Vector256 sum256 = Vector256.Create(sum); + + while (Unsafe.IsAddressLessThan(ref startRef, ref endRef)) + { + Unsafe.As>(ref startRef) /= sum256; + startRef = ref Unsafe.Add(ref startRef, (nuint)8); + } + + if ((span.Length & 7) >= 4) + { + Unsafe.As>(ref startRef) /= sum256.GetLower(); + startRef = ref Unsafe.Add(ref startRef, (nuint)4); + } + + endRef = ref Unsafe.Add(ref startRef, span.Length & 3); + + while (Unsafe.IsAddressLessThan(ref startRef, ref endRef)) + { + startRef /= sum; + startRef = ref Unsafe.Add(ref startRef, (nuint)1); + } + } + else + { + ref float startRef = ref MemoryMarshal.GetReference(span); + ref float endRef = ref Unsafe.Add(ref startRef, span.Length); + + while (Unsafe.IsAddressLessThan(ref startRef, ref endRef)) + { + startRef /= sum; + startRef = ref Unsafe.Add(ref startRef, (nuint)1); + } + } + } } diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index b6dd319f06..009c6e9581 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -245,6 +245,44 @@ public static Vector128 PackSignedSaturate(Vector128 left, Vector128 return default; } + /// + /// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the + /// product of corresponding elements in and added to the + /// corresponding element in . + /// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single + /// fused operation for better performance and precision. + /// + /// The first vector of single-precision floating-point numbers to be multiplied. + /// The second vector of single-precision floating-point numbers to be multiplied. + /// The vector of single-precision floating-point numbers to be added to the product of + /// and . + /// + /// A where each element is the result of multiplying the corresponding elements + /// of and , and then adding the corresponding element from . 
+ /// + /// + /// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed using + /// . This approach can result + /// in slightly different results compared to performing the multiplication and addition separately due to + /// differences in how floating-point + /// rounding is handled. + /// + /// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead + /// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy + /// is critical. + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 MultiplyAdd(Vector128 a, Vector128 b, Vector128 c) + { + if (Fma.IsSupported) + { + return Fma.MultiplyAdd(a, b, c); + } + + return (a * b) + c; + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 6e8c0d1de4..754d6dcb8b 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -110,6 +110,44 @@ public static Vector256 ConvertToInt32RoundToEven(Vector256 vector) return Vector256.ConvertToInt32(val_2p23_f32 | sign); } + /// + /// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the + /// product of corresponding elements in and added to the + /// corresponding element in . + /// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single + /// fused operation for better performance and precision. + /// + /// The first vector of single-precision floating-point numbers to be multiplied. + /// The second vector of single-precision floating-point numbers to be multiplied. + /// The vector of single-precision floating-point numbers to be added to the product of + /// and . + /// + /// A where each element is the result of multiplying the corresponding elements + /// of and , and then adding the corresponding element from . + /// + /// + /// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed using + /// . This approach can result + /// in slightly different results compared to performing the multiplication and addition separately due to + /// differences in how floating-point + /// rounding is handled. + /// + /// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead + /// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy + /// is critical. 
+ /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 MultiplyAdd(Vector256 a, Vector256 b, Vector256 c) + { + if (Fma.IsSupported) + { + return Fma.MultiplyAdd(a, b, c); + } + + return (a * b) + c; + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 51a739d35e..3545bae3f7 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -5,7 +5,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Processing.Processors.Transforms; @@ -14,6 +14,10 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms; /// internal readonly unsafe struct ResizeKernel { + /// + /// The buffer with the convolution factors. + /// Note that when FMA is supported, this is of size 4x that reported in . + /// private readonly float* bufferPtr; /// @@ -53,7 +57,15 @@ public int Length public Span Values { [MethodImpl(InliningOptions.ShortMethod)] - get => new(this.bufferPtr, this.Length); + get + { + if (Vector256.IsHardwareAccelerated) + { + return new(this.bufferPtr, this.Length * 4); + } + + return new(this.bufferPtr, this.Length); + } } /// @@ -68,70 +80,42 @@ public Vector4 Convolve(Span rowSpan) [MethodImpl(InliningOptions.ShortMethod)] public Vector4 ConvolveCore(ref Vector4 rowStartRef) { - if (Avx2.IsSupported && Fma.IsSupported) + if (Vector256.IsHardwareAccelerated) { float* bufferStart = this.bufferPtr; - float* bufferEnd = bufferStart + (this.Length & ~3); + ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3); Vector256 result256_0 = Vector256.Zero; Vector256 result256_1 = Vector256.Zero; - ReadOnlySpan maskBytes = new byte[] - { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, - }; - Vector256 mask = Unsafe.ReadUnaligned>(ref MemoryMarshal.GetReference(maskBytes)); - while (bufferStart < bufferEnd) + while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef)) { - // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps - // for the FMA operation, and execute it directly on the target register and reading directly from - // memory for the first parameter. This skips initializing a SIMD register, and an extra copy. - // The code below should compile in the following assembly on .NET 5 x64: - // - // vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _] - // vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b] - // vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0 - // - // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212. - // Additionally, we're also unrolling two computations per each loop iterations to leverage the - // fact that most CPUs have two ports to schedule multiply operations for FMA instructions. 
- result256_0 = Fma.MultiplyAdd( - Unsafe.As>(ref rowStartRef), - Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), - result256_0); - - result256_1 = Fma.MultiplyAdd( - Unsafe.As>(ref Unsafe.Add(ref rowStartRef, 2)), - Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask), - result256_1); - - bufferStart += 4; - rowStartRef = ref Unsafe.Add(ref rowStartRef, 4); + Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); + Vector256 pixels256_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)2)); + + result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0); + result256_1 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart + 8), pixels256_1, result256_1); + + bufferStart += 16; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); } - result256_0 = Avx.Add(result256_0, result256_1); + result256_0 += result256_1; if ((this.Length & 3) >= 2) { - result256_0 = Fma.MultiplyAdd( - Unsafe.As>(ref rowStartRef), - Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), - result256_0); + Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); + result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0); - bufferStart += 2; - rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); + bufferStart += 8; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); } - Vector128 result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper()); + Vector128 result128 = result256_0.GetLower() + result256_0.GetUpper(); if ((this.Length & 1) != 0) { - result128 = Fma.MultiplyAdd( - Unsafe.As>(ref rowStartRef), - Vector128.Create(*bufferStart), - result128); + Vector128 pixels128 = Unsafe.As>(ref rowStartRef); + result128 = Vector128Utilities.MultiplyAdd(Vector128.Load(bufferStart), pixels128, result128); } return *(Vector4*)&result128; @@ -149,7 +133,7 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) result += rowStartRef * *bufferStart; bufferStart++; - rowStartRef = ref Unsafe.Add(ref rowStartRef, 1); + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)1); } return result; @@ -164,13 +148,30 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) internal ResizeKernel AlterLeftValue(int left) => new(left, this.bufferPtr, this.Length); - internal void Fill(Span values) + internal void FillOrCopyAndExpand(Span values) { DebugGuard.IsTrue(values.Length == this.Length, nameof(values), "ResizeKernel.Fill: values.Length != this.Length!"); - for (int i = 0; i < this.Length; i++) + if (Vector256.IsHardwareAccelerated) { - this.Values[i] = (float)values[i]; + Vector4* bufferStart = (Vector4*)this.bufferPtr; + ref float valuesStart = ref MemoryMarshal.GetReference(values); + ref float valuesEnd = ref Unsafe.Add(ref valuesStart, values.Length); + + while (Unsafe.IsAddressLessThan(ref valuesStart, ref valuesEnd)) + { + *bufferStart = new Vector4(valuesStart); + + bufferStart++; + valuesStart = ref Unsafe.Add(ref valuesStart, (nuint)1); + } + } + else + { + for (int i = 0; i < this.Length; i++) + { + this.Values[i] = (float)values[i]; + } } } } diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs index ee1ada43ad..b39f6de2a5 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs +++ 
b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs @@ -54,7 +54,7 @@ protected internal override void Initialize(in TResampler sampler) int bottomStartDest = this.DestinationLength - this.cornerInterval; for (int i = startOfFirstRepeatedMosaic; i < bottomStartDest; i++) { - double center = ((i + .5) * this.ratio) - .5; + float center = (float)(((i + .5) * this.ratio) - .5); int left = (int)TolerantMath.Ceiling(center - this.radius); ResizeKernel kernel = this.kernels[i - this.period]; this.kernels[i] = kernel.AlterLeftValue(left); diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs index c1907bb520..cf74de1fcd 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; using SixLabors.ImageSharp.Memory; namespace SixLabors.ImageSharp.Processing.Processors.Transforms; @@ -33,7 +34,7 @@ internal partial class ResizeKernelMap : IDisposable private bool isDisposed; // To avoid both GC allocations, and MemoryAllocator ceremony: - private readonly double[] tempValues; + private readonly float[] tempValues; private ResizeKernelMap( MemoryAllocator memoryAllocator, @@ -50,10 +51,19 @@ private ResizeKernelMap( this.sourceLength = sourceLength; this.DestinationLength = destinationLength; this.MaxDiameter = (radius * 2) + 1; - this.data = memoryAllocator.Allocate2D(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true, AllocationOptions.Clean); + + if (Vector256.IsHardwareAccelerated) + { + this.data = memoryAllocator.Allocate2D(this.MaxDiameter * 4, bufferHeight, preferContiguosImageBuffers: true); + } + else + { + this.data = memoryAllocator.Allocate2D(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true); + } + this.pinHandle = this.data.DangerousGetSingleMemory().Pin(); this.kernels = new ResizeKernel[destinationLength]; - this.tempValues = new double[this.MaxDiameter]; + this.tempValues = new float[this.MaxDiameter]; } /// @@ -155,23 +165,23 @@ public static ResizeKernelMap Calculate( bool hasAtLeast2Periods = 2 * (cornerInterval + period) < destinationSize; ResizeKernelMap result = hasAtLeast2Periods - ? new PeriodicKernelMap( - memoryAllocator, - sourceSize, - destinationSize, - ratio, - scale, - radius, - period, - cornerInterval) - : new ResizeKernelMap( - memoryAllocator, - sourceSize, - destinationSize, - destinationSize, - ratio, - scale, - radius); + ? new PeriodicKernelMap( + memoryAllocator, + sourceSize, + destinationSize, + ratio, + scale, + radius, + period, + cornerInterval) + : new ResizeKernelMap( + memoryAllocator, + sourceSize, + destinationSize, + destinationSize, + ratio, + scale, + radius); result.Initialize(in sampler); @@ -198,7 +208,8 @@ protected internal virtual void Initialize(in TResampler sampler) private ResizeKernel BuildKernel(in TResampler sampler, int destRowIndex, int dataRowIndex) where TResampler : struct, IResampler { - double center = ((destRowIndex + .5) * this.ratio) - .5; + float center = (float)(((destRowIndex + .5) * this.ratio) - .5); + float scale = (float)this.scale; // Keep inside bounds. 
int left = (int)TolerantMath.Ceiling(center - this.radius); @@ -214,30 +225,25 @@ private ResizeKernel BuildKernel(in TResampler sampler, int destRowI } ResizeKernel kernel = this.CreateKernel(dataRowIndex, left, right); - - Span kernelValues = this.tempValues.AsSpan(0, kernel.Length); - double sum = 0; + Span kernelValues = this.tempValues.AsSpan(0, kernel.Length); + ref float kernelStart = ref MemoryMarshal.GetReference(kernelValues); + float sum = 0; for (int j = left; j <= right; j++) { - double value = sampler.GetValue((float)((j - center) / this.scale)); + float value = sampler.GetValue((j - center) / scale); sum += value; - - kernelValues[j - left] = value; + kernelStart = value; + kernelStart = ref Unsafe.Add(ref kernelStart, 1); } // Normalize, best to do it here rather than in the pixel loop later on. if (sum > 0) { - for (int j = 0; j < kernel.Length; j++) - { - // weights[w] = weights[w] / sum: - ref double kRef = ref kernelValues[j]; - kRef /= sum; - } + Numerics.Normalize(kernelValues, sum); } - kernel.Fill(kernelValues); + kernel.FillOrCopyAndExpand(kernelValues); return kernel; } diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs index 290a3b37ac..72142cbdc3 100644 --- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs +++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs @@ -16,9 +16,7 @@ internal class ReferenceKernelMap private readonly ReferenceKernel[] kernels; public ReferenceKernelMap(ReferenceKernel[] kernels) - { - this.kernels = kernels; - } + => this.kernels = kernels; public int DestinationSize => this.kernels.Length; @@ -28,22 +26,23 @@ public static ReferenceKernelMap Calculate(in TResampler sampler, in where TResampler : struct, IResampler { double ratio = (double)sourceSize / destinationSize; - double scale = ratio; + double scaleD = ratio; - if (scale < 1F) + if (scaleD < 1) { - scale = 1F; + scaleD = 1; } TolerantMath tolerantMath = TolerantMath.Default; - double radius = tolerantMath.Ceiling(scale * sampler.Radius); + double radius = tolerantMath.Ceiling(scaleD * sampler.Radius); - var result = new List(); + List result = []; + float scale = (float)scaleD; for (int i = 0; i < destinationSize; i++) { - double center = ((i + .5) * ratio) - .5; + float center = (float)(((i + .5) * ratio) - .5); // Keep inside bounds. int left = (int)tolerantMath.Ceiling(center - radius); @@ -58,15 +57,14 @@ public static ReferenceKernelMap Calculate(in TResampler sampler, in right = sourceSize - 1; } - double sum = 0; + float sum = 0; - double[] values = new double[right - left + 1]; + float[] values = new float[right - left + 1]; for (int j = left; j <= right; j++) { - double weight = sampler.GetValue((float)((j - center) / scale)); + float weight = sampler.GetValue((j - center) / scale); sum += weight; - values[j - left] = weight; } @@ -78,16 +76,14 @@ public static ReferenceKernelMap Calculate(in TResampler sampler, in } } - float[] floatVals = values.Select(v => (float)v).ToArray(); - - result.Add(new ReferenceKernel(left, floatVals)); + result.Add(new ReferenceKernel(left, values)); } - return new ReferenceKernelMap(result.ToArray()); + return new ReferenceKernelMap([.. 
result]); } } - internal struct ReferenceKernel + internal readonly struct ReferenceKernel { public ReferenceKernel(int left, float[] values) { @@ -102,8 +98,6 @@ public ReferenceKernel(int left, float[] values) public int Length => this.Values.Length; public static implicit operator ReferenceKernel(ResizeKernel orig) - { - return new ReferenceKernel(orig.StartIndex, orig.Values.ToArray()); - } + => new(orig.StartIndex, orig.Values.ToArray()); } } diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs index c6da46ee2f..337f8c75dc 100644 --- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs +++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs @@ -1,6 +1,7 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. +using System.Runtime.Intrinsics; using System.Text; using SixLabors.ImageSharp.Processing; using SixLabors.ImageSharp.Processing.Processors.Transforms; @@ -124,7 +125,6 @@ private void VerifyKernelMapContentIsCorrect(TResampler resampler, i this.Output.WriteLine($"Expected KernelMap:\n{PrintKernelMap(referenceMap)}\n"); this.Output.WriteLine($"Actual KernelMap:\n{PrintKernelMap(kernelMap)}\n"); #endif - var comparer = new ApproximateFloatComparer(1e-6f); for (int i = 0; i < kernelMap.DestinationLength; i++) { @@ -139,7 +139,29 @@ private void VerifyKernelMapContentIsCorrect(TResampler resampler, i referenceKernel.Left == kernel.StartIndex, $"referenceKernel.Left != kernel.Left: {referenceKernel.Left} != {kernel.StartIndex}"); float[] expectedValues = referenceKernel.Values; - Span actualValues = kernel.Values; + Span actualValues; + + ApproximateFloatComparer comparer; + if (Vector256.IsHardwareAccelerated) + { + comparer = new ApproximateFloatComparer(1e-4f); + + Assert.Equal(expectedValues.Length, kernel.Values.Length / 4); + + int actualLength = referenceKernel.Length / 4; + + actualValues = new float[expectedValues.Length]; + + for (int j = 0; j < expectedValues.Length; j++) + { + actualValues[j] = kernel.Values[j * 4]; + } + } + else + { + comparer = new ApproximateFloatComparer(1e-6f); + actualValues = kernel.Values; + } Assert.Equal(expectedValues.Length, actualValues.Length); From 36fefc6059980e0b00d4e949cdf64a7016c25dce Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 15 Aug 2024 15:54:28 +1000 Subject: [PATCH 2/8] Add Vector512 support --- src/ImageSharp/Common/Helpers/Numerics.cs | 34 +++++++++- .../Common/Helpers/Vector128Utilities.cs | 2 +- .../Common/Helpers/Vector256Utilities.cs | 2 +- .../Common/Helpers/Vector512Utilities.cs | 32 ++++++++++ .../Transforms/Resize/ResizeKernel.cs | 63 +++++++++++++++++-- 5 files changed, 124 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index ced2be2e0c..e8f50b3eeb 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -1106,7 +1106,39 @@ public static nuint Vector512Count(int length) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void Normalize(Span span, float sum) { - if (Vector256.IsHardwareAccelerated) + if (Vector512.IsHardwareAccelerated) + { + ref float startRef = ref MemoryMarshal.GetReference(span); + ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~15); + Vector512 sum512 = Vector512.Create(sum); + + while (Unsafe.IsAddressLessThan(ref startRef, 
ref endRef)) + { + Unsafe.As>(ref startRef) /= sum512; + startRef = ref Unsafe.Add(ref startRef, (nuint)16); + } + + if ((span.Length & 15) >= 8) + { + Unsafe.As>(ref startRef) /= sum512.GetLower(); + startRef = ref Unsafe.Add(ref startRef, (nuint)8); + } + + if ((span.Length & 7) >= 4) + { + Unsafe.As>(ref startRef) /= sum512.GetLower().GetLower(); + startRef = ref Unsafe.Add(ref startRef, (nuint)4); + } + + endRef = ref Unsafe.Add(ref startRef, span.Length & 3); + + while (Unsafe.IsAddressLessThan(ref startRef, ref endRef)) + { + startRef /= sum; + startRef = ref Unsafe.Add(ref startRef, (nuint)1); + } + } + else if (Vector256.IsHardwareAccelerated) { ref float startRef = ref MemoryMarshal.GetReference(span); ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~7); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 009c6e9581..07cfe02850 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -273,7 +273,7 @@ public static Vector128 PackSignedSaturate(Vector128 left, Vector128 /// /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 MultiplyAdd(Vector128 a, Vector128 b, Vector128 c) + public static Vector128 MultiplyAddEstimate(Vector128 a, Vector128 b, Vector128 c) { if (Fma.IsSupported) { diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 754d6dcb8b..082e4683b0 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -138,7 +138,7 @@ public static Vector256 ConvertToInt32RoundToEven(Vector256 vector) /// /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 MultiplyAdd(Vector256 a, Vector256 b, Vector256 c) + public static Vector256 MultiplyAddEstimate(Vector256 a, Vector256 b, Vector256 c) { if (Fma.IsSupported) { diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 0165af90ef..3325ad1aeb 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -110,6 +110,38 @@ public static Vector512 ConvertToInt32RoundToEven(Vector512 vector) return Vector512.ConvertToInt32(val_2p23_f32 | sign); } + /// + /// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the + /// product of corresponding elements in and added to the + /// corresponding element in . + /// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single + /// fused operation for better performance and precision. + /// + /// The first vector of single-precision floating-point numbers to be multiplied. + /// The second vector of single-precision floating-point numbers to be multiplied. + /// The vector of single-precision floating-point numbers to be added to the product of + /// and . + /// + /// A where each element is the result of multiplying the corresponding elements + /// of and , and then adding the corresponding element from . + /// + /// + /// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed using + /// against the upper and lower + /// buts. 
This approach can result in slightly different results compared to performing the multiplication and + /// addition separately due to differences in how floating-point rounding is handled. + /// + /// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead + /// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy + /// is critical. + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector512 MultiplyAddEstimate(Vector512 a, Vector512 b, Vector512 c) + => Vector512.Create( + Vector256Utilities.MultiplyAddEstimate(a.GetLower(), b.GetLower(), c.GetLower()), + Vector256Utilities.MultiplyAddEstimate(a.GetUpper(), b.GetUpper(), c.GetUpper())); + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 3545bae3f7..41afec892c 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -80,7 +80,58 @@ public Vector4 Convolve(Span rowSpan) [MethodImpl(InliningOptions.ShortMethod)] public Vector4 ConvolveCore(ref Vector4 rowStartRef) { - if (Vector256.IsHardwareAccelerated) + if (Vector512.IsHardwareAccelerated) + { + float* bufferStart = this.bufferPtr; + ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~7); + Vector512 result512_0 = Vector512.Zero; + Vector512 result512_1 = Vector512.Zero; + + while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef)) + { + Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); + Vector512 pixels512_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)4)); + + result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); + result512_1 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart + 16), pixels512_1, result512_1); + + bufferStart += 32; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)8); + } + + result512_0 += result512_1; + + if ((this.Length & 7) >= 4) + { + Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); + result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); + + bufferStart += 16; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); + } + + Vector256 result256 = result512_0.GetLower() + result512_0.GetUpper(); + + if ((this.Length & 3) >= 2) + { + Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); + result256 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256); + + bufferStart += 8; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); + } + + Vector128 result128 = result256.GetLower() + result256.GetUpper(); + + if ((this.Length & 1) != 0) + { + Vector128 pixels128 = Unsafe.As>(ref rowStartRef); + result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); + } + + return *(Vector4*)&result128; + } + else if (Vector256.IsHardwareAccelerated) { float* bufferStart = this.bufferPtr; ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3); @@ -92,8 +143,8 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); Vector256 pixels256_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)2)); - 
result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0); - result256_1 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart + 8), pixels256_1, result256_1); + result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); + result256_1 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart + 8), pixels256_1, result256_1); bufferStart += 16; rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); @@ -104,7 +155,7 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) if ((this.Length & 3) >= 2) { Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); - result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0); + result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); bufferStart += 8; rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); @@ -115,7 +166,7 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) if ((this.Length & 1) != 0) { Vector128 pixels128 = Unsafe.As>(ref rowStartRef); - result128 = Vector128Utilities.MultiplyAdd(Vector128.Load(bufferStart), pixels128, result128); + result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); } return *(Vector4*)&result128; @@ -170,7 +221,7 @@ internal void FillOrCopyAndExpand(Span values) { for (int i = 0; i < this.Length; i++) { - this.Values[i] = (float)values[i]; + this.Values[i] = values[i]; } } } From 4728b97d85b7ad66d3d7975942ed9b99af3ad2fd Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 15 Aug 2024 16:44:41 +1000 Subject: [PATCH 3/8] Use dedicated property --- .../Transforms/Resize/ResizeKernel.cs | 150 ++++++++++-------- .../Transforms/ResizeKernelMapTests.cs | 3 +- 2 files changed, 84 insertions(+), 69 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 41afec892c..7a70caa3c6 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -23,6 +23,9 @@ internal readonly unsafe struct ResizeKernel /// /// Initializes a new instance of the struct. /// + /// The starting index for the destination row. + /// The pointer to the buffer with the convolution factors. + /// The length of the kernel. [MethodImpl(InliningOptions.ShortMethod)] internal ResizeKernel(int startIndex, float* bufferPtr, int length) { @@ -31,6 +34,15 @@ internal ResizeKernel(int startIndex, float* bufferPtr, int length) this.Length = length; } + /// + /// Gets a value indicating whether vectorization is supported. + /// + public static bool SupportsVectorization + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Vector256.IsHardwareAccelerated; + } + /// /// Gets the start index for the destination row. 
/// @@ -80,96 +92,99 @@ public Vector4 Convolve(Span rowSpan) [MethodImpl(InliningOptions.ShortMethod)] public Vector4 ConvolveCore(ref Vector4 rowStartRef) { - if (Vector512.IsHardwareAccelerated) + if (SupportsVectorization) { - float* bufferStart = this.bufferPtr; - ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~7); - Vector512 result512_0 = Vector512.Zero; - Vector512 result512_1 = Vector512.Zero; - - while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef)) + if (Vector512.IsHardwareAccelerated) { - Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); - Vector512 pixels512_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)4)); + float* bufferStart = this.bufferPtr; + ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~7); + Vector512 result512_0 = Vector512.Zero; + Vector512 result512_1 = Vector512.Zero; - result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); - result512_1 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart + 16), pixels512_1, result512_1); + while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef)) + { + Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); + Vector512 pixels512_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)4)); - bufferStart += 32; - rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)8); - } + result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); + result512_1 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart + 16), pixels512_1, result512_1); - result512_0 += result512_1; + bufferStart += 32; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)8); + } - if ((this.Length & 7) >= 4) - { - Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); - result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); + result512_0 += result512_1; - bufferStart += 16; - rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); - } + if ((this.Length & 7) >= 4) + { + Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); + result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); - Vector256 result256 = result512_0.GetLower() + result512_0.GetUpper(); + bufferStart += 16; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); + } - if ((this.Length & 3) >= 2) - { - Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); - result256 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256); + Vector256 result256 = result512_0.GetLower() + result512_0.GetUpper(); - bufferStart += 8; - rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); - } + if ((this.Length & 3) >= 2) + { + Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); + result256 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256); - Vector128 result128 = result256.GetLower() + result256.GetUpper(); + bufferStart += 8; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); + } - if ((this.Length & 1) != 0) - { - Vector128 pixels128 = Unsafe.As>(ref rowStartRef); - result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); - } + Vector128 result128 = result256.GetLower() + result256.GetUpper(); - return *(Vector4*)&result128; - } - else if (Vector256.IsHardwareAccelerated) - { - float* bufferStart = this.bufferPtr; - ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, 
this.Length & ~3); - Vector256 result256_0 = Vector256.Zero; - Vector256 result256_1 = Vector256.Zero; + if ((this.Length & 1) != 0) + { + Vector128 pixels128 = Unsafe.As>(ref rowStartRef); + result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); + } - while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef)) + return *(Vector4*)&result128; + } + else { - Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); - Vector256 pixels256_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)2)); + float* bufferStart = this.bufferPtr; + ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3); + Vector256 result256_0 = Vector256.Zero; + Vector256 result256_1 = Vector256.Zero; - result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); - result256_1 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart + 8), pixels256_1, result256_1); + while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef)) + { + Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); + Vector256 pixels256_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)2)); - bufferStart += 16; - rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); - } + result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); + result256_1 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart + 8), pixels256_1, result256_1); - result256_0 += result256_1; + bufferStart += 16; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); + } - if ((this.Length & 3) >= 2) - { - Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); - result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); + result256_0 += result256_1; - bufferStart += 8; - rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); - } + if ((this.Length & 3) >= 2) + { + Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); + result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); - Vector128 result128 = result256_0.GetLower() + result256_0.GetUpper(); + bufferStart += 8; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); + } - if ((this.Length & 1) != 0) - { - Vector128 pixels128 = Unsafe.As>(ref rowStartRef); - result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); - } + Vector128 result128 = result256_0.GetLower() + result256_0.GetUpper(); + + if ((this.Length & 1) != 0) + { + Vector128 pixels128 = Unsafe.As>(ref rowStartRef); + result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); + } - return *(Vector4*)&result128; + return *(Vector4*)&result128; + } } else { @@ -195,6 +210,7 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) /// Copy the contents of altering /// to the value . /// + /// The new value for . [MethodImpl(InliningOptions.ShortMethod)] internal ResizeKernel AlterLeftValue(int left) => new(left, this.bufferPtr, this.Length); diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs index 337f8c75dc..6d0de65c42 100644 --- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs +++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs @@ -1,7 +1,6 @@ // Copyright (c) Six Labors. 
// Licensed under the Six Labors Split License. -using System.Runtime.Intrinsics; using System.Text; using SixLabors.ImageSharp.Processing; using SixLabors.ImageSharp.Processing.Processors.Transforms; @@ -142,7 +141,7 @@ private void VerifyKernelMapContentIsCorrect(TResampler resampler, i Span actualValues; ApproximateFloatComparer comparer; - if (Vector256.IsHardwareAccelerated) + if (ResizeKernel.SupportsVectorization) { comparer = new ApproximateFloatComparer(1e-4f); From 8c19a979eab6ddb9f1bd10400ab6aaca3aec561d Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 15 Aug 2024 17:24:14 +1000 Subject: [PATCH 4/8] Update ResizeKernelMap.cs --- .../Processing/Processors/Transforms/Resize/ResizeKernelMap.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs index cf74de1fcd..b52054d553 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs @@ -5,7 +5,6 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using System.Runtime.Intrinsics; using SixLabors.ImageSharp.Memory; namespace SixLabors.ImageSharp.Processing.Processors.Transforms; @@ -52,7 +51,7 @@ private ResizeKernelMap( this.DestinationLength = destinationLength; this.MaxDiameter = (radius * 2) + 1; - if (Vector256.IsHardwareAccelerated) + if (ResizeKernel.SupportsVectorization) { this.data = memoryAllocator.Allocate2D(this.MaxDiameter * 4, bufferHeight, preferContiguosImageBuffers: true); } From 7840665f04a5c1f5cc726082947de973202061c4 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 15 Aug 2024 20:41:42 +1000 Subject: [PATCH 5/8] Don't use FMA for 512 --- src/ImageSharp/Common/Helpers/Vector512Utilities.cs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 3325ad1aeb..8a9ba6aa44 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -138,9 +138,10 @@ public static Vector512 ConvertToInt32RoundToEven(Vector512 vector) /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 MultiplyAddEstimate(Vector512 a, Vector512 b, Vector512 c) - => Vector512.Create( - Vector256Utilities.MultiplyAddEstimate(a.GetLower(), b.GetLower(), c.GetLower()), - Vector256Utilities.MultiplyAddEstimate(a.GetUpper(), b.GetUpper(), c.GetUpper())); + + // Don't actually use FMA as it requires many more instruction to extract the + // upper and lower parts of the vector and then recombine them. 
=> (a * b) + c; [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } From 58f6afb3cd7edc3ab0763af6ad3c767caa7499be Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 22 Aug 2024 21:20:07 +1000 Subject: [PATCH 6/8] Update src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs Co-authored-by: Clinton Ingram --- .../Processing/Processors/Transforms/Resize/ResizeKernel.cs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 7a70caa3c6..6c1f7217a7 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -235,10 +235,7 @@ internal void FillOrCopyAndExpand(Span values) } else { - for (int i = 0; i < this.Length; i++) - { - this.Values[i] = values[i]; - } + values.CopyTo(this.Values); } } } From 0594035f945a14835daee6591550b315651a1ded Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 22 Aug 2024 21:20:17 +1000 Subject: [PATCH 7/8] Update src/ImageSharp/Common/Helpers/Numerics.cs Co-authored-by: Clinton Ingram --- src/ImageSharp/Common/Helpers/Numerics.cs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index e8f50b3eeb..621229ab0b 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -1166,13 +1166,9 @@ public static void Normalize(Span span, float sum) } else { - ref float startRef = ref MemoryMarshal.GetReference(span); - ref float endRef = ref Unsafe.Add(ref startRef, span.Length); - - while (Unsafe.IsAddressLessThan(ref startRef, ref endRef)) + for (int i = 0; i < span.Length; i++) { - startRef /= sum; - startRef = ref Unsafe.Add(ref startRef, (nuint)1); + span[i] /= sum; } } } From 72813ee5714dd6cc5731e5104615825c563932d6 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 22 Aug 2024 22:24:07 +1000 Subject: [PATCH 8/8] use Avx512F.FusedMultiplyAdd --- src/ImageSharp/Common/Helpers/Vector512Utilities.cs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 8a9ba6aa44..bcc3c9fa92 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -3,6 +3,7 @@ using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -138,9 +139,14 @@ public static Vector512 ConvertToInt32RoundToEven(Vector512 vector) /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 MultiplyAddEstimate(Vector512 a, Vector512 b, Vector512 c) + { + if (Avx512F.IsSupported) + { + return Avx512F.FusedMultiplyAdd(a, b, c); + } - // Don't actually use FMA as it requires many more instruction to extract the - // upper and lower parts of the vector and then recombine them. - => (a * b) + c; + return (a * b) + c; + } [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException();
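
For reference, the coefficient layout introduced by FillOrCopyAndExpand and consumed by ConvolveCore in the patches above can be summarised with a minimal scalar sketch. Each kernel weight is broadcast into a four-float (Vector4-sized) slot so the Vector256/Vector512 paths can load coefficients with the same stride as the RGBA pixels, and the accumulated result is an ordinary weighted sum. The class and method names below are assumptions chosen for the sketch only; they are not part of ImageSharp or of this patch series.

    using System;
    using System.Numerics;

    internal static class ExpandedKernelSketch
    {
        // Mirrors the idea behind FillOrCopyAndExpand:
        // [w0, w1, ...] -> [w0,w0,w0,w0, w1,w1,w1,w1, ...]
        // so one 256-bit load lines up with the weights for two Vector4 (RGBA) pixels.
        public static float[] Expand(ReadOnlySpan<float> weights)
        {
            float[] expanded = new float[weights.Length * 4];
            for (int i = 0; i < weights.Length; i++)
            {
                expanded.AsSpan(i * 4, 4).Fill(weights[i]);
            }

            return expanded;
        }

        // Scalar equivalent of ConvolveCore: a weighted sum over the source pixels,
        // which the vectorized paths accumulate two or four pixels at a time.
        public static Vector4 Convolve(ReadOnlySpan<Vector4> row, ReadOnlySpan<float> weights)
        {
            Vector4 result = Vector4.Zero;
            for (int i = 0; i < weights.Length; i++)
            {
                result += weights[i] * row[i];
            }

            return result;
        }
    }

This also makes the contract of the MultiplyAddEstimate helpers concrete: with or without FMA/AVX-512 support, each step must compute (a * b) + c, and the hardware paths may differ only in intermediate rounding.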