From cd1b77a88f3418ac5a3f6edcc5e0572746e56438 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 14 Aug 2024 23:19:01 +1000 Subject: [PATCH 1/8] Reimplement @Sergio0694 work. --- src/ImageSharp/Common/Helpers/Numerics.cs | 47 ++++++++ .../Common/Helpers/Vector128Utilities.cs | 38 +++++++ .../Common/Helpers/Vector256Utilities.cs | 38 +++++++ .../Transforms/Resize/ResizeKernel.cs | 107 +++++++++--------- .../ResizeKernelMap.PeriodicKernelMap.cs | 2 +- .../Transforms/Resize/ResizeKernelMap.cs | 74 ++++++------ ...ResizeKernelMapTests.ReferenceKernelMap.cs | 36 +++--- .../Transforms/ResizeKernelMapTests.cs | 26 ++++- 8 files changed, 257 insertions(+), 111 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index ca14ae4c38..ced2be2e0c 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -1097,4 +1097,51 @@ public static nuint Vector512Count(this Span span) public static nuint Vector512Count(int length) where TVector : struct => (uint)length / (uint)Vector512.Count; + + /// + /// Normalizes the values in a given . + /// + /// The sequence of values to normalize. + /// The sum of the values in . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Normalize(Span span, float sum) + { + if (Vector256.IsHardwareAccelerated) + { + ref float startRef = ref MemoryMarshal.GetReference(span); + ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~7); + Vector256 sum256 = Vector256.Create(sum); + + while (Unsafe.IsAddressLessThan(ref startRef, ref endRef)) + { + Unsafe.As>(ref startRef) /= sum256; + startRef = ref Unsafe.Add(ref startRef, (nuint)8); + } + + if ((span.Length & 7) >= 4) + { + Unsafe.As>(ref startRef) /= sum256.GetLower(); + startRef = ref Unsafe.Add(ref startRef, (nuint)4); + } + + endRef = ref Unsafe.Add(ref startRef, span.Length & 3); + + while (Unsafe.IsAddressLessThan(ref startRef, ref endRef)) + { + startRef /= sum; + startRef = ref Unsafe.Add(ref startRef, (nuint)1); + } + } + else + { + ref float startRef = ref MemoryMarshal.GetReference(span); + ref float endRef = ref Unsafe.Add(ref startRef, span.Length); + + while (Unsafe.IsAddressLessThan(ref startRef, ref endRef)) + { + startRef /= sum; + startRef = ref Unsafe.Add(ref startRef, (nuint)1); + } + } + } } diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index b6dd319f06..009c6e9581 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -245,6 +245,44 @@ public static Vector128 PackSignedSaturate(Vector128 left, Vector128 return default; } + /// + /// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the + /// product of corresponding elements in and added to the + /// corresponding element in . + /// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single + /// fused operation for better performance and precision. + /// + /// The first vector of single-precision floating-point numbers to be multiplied. + /// The second vector of single-precision floating-point numbers to be multiplied. + /// The vector of single-precision floating-point numbers to be added to the product of + /// and . + /// + /// A where each element is the result of multiplying the corresponding elements + /// of and , and then adding the corresponding element from . 
+ /// + /// + /// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed using + /// . This approach can result + /// in slightly different results compared to performing the multiplication and addition separately due to + /// differences in how floating-point + /// rounding is handled. + /// + /// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead + /// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy + /// is critical. + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 MultiplyAdd(Vector128 a, Vector128 b, Vector128 c) + { + if (Fma.IsSupported) + { + return Fma.MultiplyAdd(a, b, c); + } + + return (a * b) + c; + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 6e8c0d1de4..754d6dcb8b 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -110,6 +110,44 @@ public static Vector256 ConvertToInt32RoundToEven(Vector256 vector) return Vector256.ConvertToInt32(val_2p23_f32 | sign); } + /// + /// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the + /// product of corresponding elements in and added to the + /// corresponding element in . + /// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single + /// fused operation for better performance and precision. + /// + /// The first vector of single-precision floating-point numbers to be multiplied. + /// The second vector of single-precision floating-point numbers to be multiplied. + /// The vector of single-precision floating-point numbers to be added to the product of + /// and . + /// + /// A where each element is the result of multiplying the corresponding elements + /// of and , and then adding the corresponding element from . + /// + /// + /// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed using + /// . This approach can result + /// in slightly different results compared to performing the multiplication and addition separately due to + /// differences in how floating-point + /// rounding is handled. + /// + /// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead + /// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy + /// is critical. 
+ /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 MultiplyAdd(Vector256 a, Vector256 b, Vector256 c) + { + if (Fma.IsSupported) + { + return Fma.MultiplyAdd(a, b, c); + } + + return (a * b) + c; + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 51a739d35e..3545bae3f7 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -5,7 +5,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Processing.Processors.Transforms; @@ -14,6 +14,10 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms; /// internal readonly unsafe struct ResizeKernel { + /// + /// The buffer with the convolution factors. + /// Note that when FMA is supported, this is of size 4x that reported in . + /// private readonly float* bufferPtr; /// @@ -53,7 +57,15 @@ public int Length public Span Values { [MethodImpl(InliningOptions.ShortMethod)] - get => new(this.bufferPtr, this.Length); + get + { + if (Vector256.IsHardwareAccelerated) + { + return new(this.bufferPtr, this.Length * 4); + } + + return new(this.bufferPtr, this.Length); + } } /// @@ -68,70 +80,42 @@ public Vector4 Convolve(Span rowSpan) [MethodImpl(InliningOptions.ShortMethod)] public Vector4 ConvolveCore(ref Vector4 rowStartRef) { - if (Avx2.IsSupported && Fma.IsSupported) + if (Vector256.IsHardwareAccelerated) { float* bufferStart = this.bufferPtr; - float* bufferEnd = bufferStart + (this.Length & ~3); + ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3); Vector256 result256_0 = Vector256.Zero; Vector256 result256_1 = Vector256.Zero; - ReadOnlySpan maskBytes = new byte[] - { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, - }; - Vector256 mask = Unsafe.ReadUnaligned>(ref MemoryMarshal.GetReference(maskBytes)); - while (bufferStart < bufferEnd) + while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef)) { - // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps - // for the FMA operation, and execute it directly on the target register and reading directly from - // memory for the first parameter. This skips initializing a SIMD register, and an extra copy. - // The code below should compile in the following assembly on .NET 5 x64: - // - // vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _] - // vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b] - // vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0 - // - // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212. - // Additionally, we're also unrolling two computations per each loop iterations to leverage the - // fact that most CPUs have two ports to schedule multiply operations for FMA instructions. 
- result256_0 = Fma.MultiplyAdd( - Unsafe.As>(ref rowStartRef), - Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), - result256_0); - - result256_1 = Fma.MultiplyAdd( - Unsafe.As>(ref Unsafe.Add(ref rowStartRef, 2)), - Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask), - result256_1); - - bufferStart += 4; - rowStartRef = ref Unsafe.Add(ref rowStartRef, 4); + Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); + Vector256 pixels256_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)2)); + + result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0); + result256_1 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart + 8), pixels256_1, result256_1); + + bufferStart += 16; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); } - result256_0 = Avx.Add(result256_0, result256_1); + result256_0 += result256_1; if ((this.Length & 3) >= 2) { - result256_0 = Fma.MultiplyAdd( - Unsafe.As>(ref rowStartRef), - Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), - result256_0); + Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); + result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0); - bufferStart += 2; - rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); + bufferStart += 8; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); } - Vector128 result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper()); + Vector128 result128 = result256_0.GetLower() + result256_0.GetUpper(); if ((this.Length & 1) != 0) { - result128 = Fma.MultiplyAdd( - Unsafe.As>(ref rowStartRef), - Vector128.Create(*bufferStart), - result128); + Vector128 pixels128 = Unsafe.As>(ref rowStartRef); + result128 = Vector128Utilities.MultiplyAdd(Vector128.Load(bufferStart), pixels128, result128); } return *(Vector4*)&result128; @@ -149,7 +133,7 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) result += rowStartRef * *bufferStart; bufferStart++; - rowStartRef = ref Unsafe.Add(ref rowStartRef, 1); + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)1); } return result; @@ -164,13 +148,30 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) internal ResizeKernel AlterLeftValue(int left) => new(left, this.bufferPtr, this.Length); - internal void Fill(Span values) + internal void FillOrCopyAndExpand(Span values) { DebugGuard.IsTrue(values.Length == this.Length, nameof(values), "ResizeKernel.Fill: values.Length != this.Length!"); - for (int i = 0; i < this.Length; i++) + if (Vector256.IsHardwareAccelerated) { - this.Values[i] = (float)values[i]; + Vector4* bufferStart = (Vector4*)this.bufferPtr; + ref float valuesStart = ref MemoryMarshal.GetReference(values); + ref float valuesEnd = ref Unsafe.Add(ref valuesStart, values.Length); + + while (Unsafe.IsAddressLessThan(ref valuesStart, ref valuesEnd)) + { + *bufferStart = new Vector4(valuesStart); + + bufferStart++; + valuesStart = ref Unsafe.Add(ref valuesStart, (nuint)1); + } + } + else + { + for (int i = 0; i < this.Length; i++) + { + this.Values[i] = (float)values[i]; + } } } } diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs index ee1ada43ad..b39f6de2a5 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs +++ 
b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs @@ -54,7 +54,7 @@ protected internal override void Initialize(in TResampler sampler) int bottomStartDest = this.DestinationLength - this.cornerInterval; for (int i = startOfFirstRepeatedMosaic; i < bottomStartDest; i++) { - double center = ((i + .5) * this.ratio) - .5; + float center = (float)(((i + .5) * this.ratio) - .5); int left = (int)TolerantMath.Ceiling(center - this.radius); ResizeKernel kernel = this.kernels[i - this.period]; this.kernels[i] = kernel.AlterLeftValue(left); diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs index c1907bb520..cf74de1fcd 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; using SixLabors.ImageSharp.Memory; namespace SixLabors.ImageSharp.Processing.Processors.Transforms; @@ -33,7 +34,7 @@ internal partial class ResizeKernelMap : IDisposable private bool isDisposed; // To avoid both GC allocations, and MemoryAllocator ceremony: - private readonly double[] tempValues; + private readonly float[] tempValues; private ResizeKernelMap( MemoryAllocator memoryAllocator, @@ -50,10 +51,19 @@ private ResizeKernelMap( this.sourceLength = sourceLength; this.DestinationLength = destinationLength; this.MaxDiameter = (radius * 2) + 1; - this.data = memoryAllocator.Allocate2D(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true, AllocationOptions.Clean); + + if (Vector256.IsHardwareAccelerated) + { + this.data = memoryAllocator.Allocate2D(this.MaxDiameter * 4, bufferHeight, preferContiguosImageBuffers: true); + } + else + { + this.data = memoryAllocator.Allocate2D(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true); + } + this.pinHandle = this.data.DangerousGetSingleMemory().Pin(); this.kernels = new ResizeKernel[destinationLength]; - this.tempValues = new double[this.MaxDiameter]; + this.tempValues = new float[this.MaxDiameter]; } /// @@ -155,23 +165,23 @@ public static ResizeKernelMap Calculate( bool hasAtLeast2Periods = 2 * (cornerInterval + period) < destinationSize; ResizeKernelMap result = hasAtLeast2Periods - ? new PeriodicKernelMap( - memoryAllocator, - sourceSize, - destinationSize, - ratio, - scale, - radius, - period, - cornerInterval) - : new ResizeKernelMap( - memoryAllocator, - sourceSize, - destinationSize, - destinationSize, - ratio, - scale, - radius); + ? new PeriodicKernelMap( + memoryAllocator, + sourceSize, + destinationSize, + ratio, + scale, + radius, + period, + cornerInterval) + : new ResizeKernelMap( + memoryAllocator, + sourceSize, + destinationSize, + destinationSize, + ratio, + scale, + radius); result.Initialize(in sampler); @@ -198,7 +208,8 @@ protected internal virtual void Initialize(in TResampler sampler) private ResizeKernel BuildKernel(in TResampler sampler, int destRowIndex, int dataRowIndex) where TResampler : struct, IResampler { - double center = ((destRowIndex + .5) * this.ratio) - .5; + float center = (float)(((destRowIndex + .5) * this.ratio) - .5); + float scale = (float)this.scale; // Keep inside bounds. 
int left = (int)TolerantMath.Ceiling(center - this.radius); @@ -214,30 +225,25 @@ private ResizeKernel BuildKernel(in TResampler sampler, int destRowI } ResizeKernel kernel = this.CreateKernel(dataRowIndex, left, right); - - Span kernelValues = this.tempValues.AsSpan(0, kernel.Length); - double sum = 0; + Span kernelValues = this.tempValues.AsSpan(0, kernel.Length); + ref float kernelStart = ref MemoryMarshal.GetReference(kernelValues); + float sum = 0; for (int j = left; j <= right; j++) { - double value = sampler.GetValue((float)((j - center) / this.scale)); + float value = sampler.GetValue((j - center) / scale); sum += value; - - kernelValues[j - left] = value; + kernelStart = value; + kernelStart = ref Unsafe.Add(ref kernelStart, 1); } // Normalize, best to do it here rather than in the pixel loop later on. if (sum > 0) { - for (int j = 0; j < kernel.Length; j++) - { - // weights[w] = weights[w] / sum: - ref double kRef = ref kernelValues[j]; - kRef /= sum; - } + Numerics.Normalize(kernelValues, sum); } - kernel.Fill(kernelValues); + kernel.FillOrCopyAndExpand(kernelValues); return kernel; } diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs index 290a3b37ac..72142cbdc3 100644 --- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs +++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs @@ -16,9 +16,7 @@ internal class ReferenceKernelMap private readonly ReferenceKernel[] kernels; public ReferenceKernelMap(ReferenceKernel[] kernels) - { - this.kernels = kernels; - } + => this.kernels = kernels; public int DestinationSize => this.kernels.Length; @@ -28,22 +26,23 @@ public static ReferenceKernelMap Calculate(in TResampler sampler, in where TResampler : struct, IResampler { double ratio = (double)sourceSize / destinationSize; - double scale = ratio; + double scaleD = ratio; - if (scale < 1F) + if (scaleD < 1) { - scale = 1F; + scaleD = 1; } TolerantMath tolerantMath = TolerantMath.Default; - double radius = tolerantMath.Ceiling(scale * sampler.Radius); + double radius = tolerantMath.Ceiling(scaleD * sampler.Radius); - var result = new List(); + List result = []; + float scale = (float)scaleD; for (int i = 0; i < destinationSize; i++) { - double center = ((i + .5) * ratio) - .5; + float center = (float)(((i + .5) * ratio) - .5); // Keep inside bounds. int left = (int)tolerantMath.Ceiling(center - radius); @@ -58,15 +57,14 @@ public static ReferenceKernelMap Calculate(in TResampler sampler, in right = sourceSize - 1; } - double sum = 0; + float sum = 0; - double[] values = new double[right - left + 1]; + float[] values = new float[right - left + 1]; for (int j = left; j <= right; j++) { - double weight = sampler.GetValue((float)((j - center) / scale)); + float weight = sampler.GetValue((j - center) / scale); sum += weight; - values[j - left] = weight; } @@ -78,16 +76,14 @@ public static ReferenceKernelMap Calculate(in TResampler sampler, in } } - float[] floatVals = values.Select(v => (float)v).ToArray(); - - result.Add(new ReferenceKernel(left, floatVals)); + result.Add(new ReferenceKernel(left, values)); } - return new ReferenceKernelMap(result.ToArray()); + return new ReferenceKernelMap([.. 
result]); } } - internal struct ReferenceKernel + internal readonly struct ReferenceKernel { public ReferenceKernel(int left, float[] values) { @@ -102,8 +98,6 @@ public ReferenceKernel(int left, float[] values) public int Length => this.Values.Length; public static implicit operator ReferenceKernel(ResizeKernel orig) - { - return new ReferenceKernel(orig.StartIndex, orig.Values.ToArray()); - } + => new(orig.StartIndex, orig.Values.ToArray()); } } diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs index c6da46ee2f..337f8c75dc 100644 --- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs +++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs @@ -1,6 +1,7 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. +using System.Runtime.Intrinsics; using System.Text; using SixLabors.ImageSharp.Processing; using SixLabors.ImageSharp.Processing.Processors.Transforms; @@ -124,7 +125,6 @@ private void VerifyKernelMapContentIsCorrect(TResampler resampler, i this.Output.WriteLine($"Expected KernelMap:\n{PrintKernelMap(referenceMap)}\n"); this.Output.WriteLine($"Actual KernelMap:\n{PrintKernelMap(kernelMap)}\n"); #endif - var comparer = new ApproximateFloatComparer(1e-6f); for (int i = 0; i < kernelMap.DestinationLength; i++) { @@ -139,7 +139,29 @@ private void VerifyKernelMapContentIsCorrect(TResampler resampler, i referenceKernel.Left == kernel.StartIndex, $"referenceKernel.Left != kernel.Left: {referenceKernel.Left} != {kernel.StartIndex}"); float[] expectedValues = referenceKernel.Values; - Span actualValues = kernel.Values; + Span actualValues; + + ApproximateFloatComparer comparer; + if (Vector256.IsHardwareAccelerated) + { + comparer = new ApproximateFloatComparer(1e-4f); + + Assert.Equal(expectedValues.Length, kernel.Values.Length / 4); + + int actualLength = referenceKernel.Length / 4; + + actualValues = new float[expectedValues.Length]; + + for (int j = 0; j < expectedValues.Length; j++) + { + actualValues[j] = kernel.Values[j * 4]; + } + } + else + { + comparer = new ApproximateFloatComparer(1e-6f); + actualValues = kernel.Values; + } Assert.Equal(expectedValues.Length, actualValues.Length); From 36fefc6059980e0b00d4e949cdf64a7016c25dce Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 15 Aug 2024 15:54:28 +1000 Subject: [PATCH 2/8] Add Vector512 support --- src/ImageSharp/Common/Helpers/Numerics.cs | 34 +++++++++- .../Common/Helpers/Vector128Utilities.cs | 2 +- .../Common/Helpers/Vector256Utilities.cs | 2 +- .../Common/Helpers/Vector512Utilities.cs | 32 ++++++++++ .../Transforms/Resize/ResizeKernel.cs | 63 +++++++++++++++++-- 5 files changed, 124 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index ced2be2e0c..e8f50b3eeb 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -1106,7 +1106,39 @@ public static nuint Vector512Count(int length) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void Normalize(Span span, float sum) { - if (Vector256.IsHardwareAccelerated) + if (Vector512.IsHardwareAccelerated) + { + ref float startRef = ref MemoryMarshal.GetReference(span); + ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~15); + Vector512 sum512 = Vector512.Create(sum); + + while (Unsafe.IsAddressLessThan(ref startRef, 
ref endRef)) + { + Unsafe.As>(ref startRef) /= sum512; + startRef = ref Unsafe.Add(ref startRef, (nuint)16); + } + + if ((span.Length & 15) >= 8) + { + Unsafe.As>(ref startRef) /= sum512.GetLower(); + startRef = ref Unsafe.Add(ref startRef, (nuint)8); + } + + if ((span.Length & 7) >= 4) + { + Unsafe.As>(ref startRef) /= sum512.GetLower().GetLower(); + startRef = ref Unsafe.Add(ref startRef, (nuint)4); + } + + endRef = ref Unsafe.Add(ref startRef, span.Length & 3); + + while (Unsafe.IsAddressLessThan(ref startRef, ref endRef)) + { + startRef /= sum; + startRef = ref Unsafe.Add(ref startRef, (nuint)1); + } + } + else if (Vector256.IsHardwareAccelerated) { ref float startRef = ref MemoryMarshal.GetReference(span); ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~7); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 009c6e9581..07cfe02850 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -273,7 +273,7 @@ public static Vector128 PackSignedSaturate(Vector128 left, Vector128 /// /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 MultiplyAdd(Vector128 a, Vector128 b, Vector128 c) + public static Vector128 MultiplyAddEstimate(Vector128 a, Vector128 b, Vector128 c) { if (Fma.IsSupported) { diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 754d6dcb8b..082e4683b0 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -138,7 +138,7 @@ public static Vector256 ConvertToInt32RoundToEven(Vector256 vector) /// /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 MultiplyAdd(Vector256 a, Vector256 b, Vector256 c) + public static Vector256 MultiplyAddEstimate(Vector256 a, Vector256 b, Vector256 c) { if (Fma.IsSupported) { diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 0165af90ef..3325ad1aeb 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -110,6 +110,38 @@ public static Vector512 ConvertToInt32RoundToEven(Vector512 vector) return Vector512.ConvertToInt32(val_2p23_f32 | sign); } + /// + /// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the + /// product of corresponding elements in and added to the + /// corresponding element in . + /// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single + /// fused operation for better performance and precision. + /// + /// The first vector of single-precision floating-point numbers to be multiplied. + /// The second vector of single-precision floating-point numbers to be multiplied. + /// The vector of single-precision floating-point numbers to be added to the product of + /// and . + /// + /// A where each element is the result of multiplying the corresponding elements + /// of and , and then adding the corresponding element from . + /// + /// + /// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed using + /// against the upper and lower + /// buts. 
This approach can result in slightly different results compared to performing the multiplication and + /// addition separately due to differences in how floating-point rounding is handled. + /// + /// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead + /// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy + /// is critical. + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector512 MultiplyAddEstimate(Vector512 a, Vector512 b, Vector512 c) + => Vector512.Create( + Vector256Utilities.MultiplyAddEstimate(a.GetLower(), b.GetLower(), c.GetLower()), + Vector256Utilities.MultiplyAddEstimate(a.GetUpper(), b.GetUpper(), c.GetUpper())); + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 3545bae3f7..41afec892c 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -80,7 +80,58 @@ public Vector4 Convolve(Span rowSpan) [MethodImpl(InliningOptions.ShortMethod)] public Vector4 ConvolveCore(ref Vector4 rowStartRef) { - if (Vector256.IsHardwareAccelerated) + if (Vector512.IsHardwareAccelerated) + { + float* bufferStart = this.bufferPtr; + ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~7); + Vector512 result512_0 = Vector512.Zero; + Vector512 result512_1 = Vector512.Zero; + + while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef)) + { + Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); + Vector512 pixels512_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)4)); + + result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); + result512_1 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart + 16), pixels512_1, result512_1); + + bufferStart += 32; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)8); + } + + result512_0 += result512_1; + + if ((this.Length & 7) >= 4) + { + Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); + result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); + + bufferStart += 16; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); + } + + Vector256 result256 = result512_0.GetLower() + result512_0.GetUpper(); + + if ((this.Length & 3) >= 2) + { + Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); + result256 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256); + + bufferStart += 8; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); + } + + Vector128 result128 = result256.GetLower() + result256.GetUpper(); + + if ((this.Length & 1) != 0) + { + Vector128 pixels128 = Unsafe.As>(ref rowStartRef); + result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); + } + + return *(Vector4*)&result128; + } + else if (Vector256.IsHardwareAccelerated) { float* bufferStart = this.bufferPtr; ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3); @@ -92,8 +143,8 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); Vector256 pixels256_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)2)); - 
result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0); - result256_1 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart + 8), pixels256_1, result256_1); + result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); + result256_1 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart + 8), pixels256_1, result256_1); bufferStart += 16; rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); @@ -104,7 +155,7 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) if ((this.Length & 3) >= 2) { Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); - result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0); + result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); bufferStart += 8; rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); @@ -115,7 +166,7 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) if ((this.Length & 1) != 0) { Vector128 pixels128 = Unsafe.As>(ref rowStartRef); - result128 = Vector128Utilities.MultiplyAdd(Vector128.Load(bufferStart), pixels128, result128); + result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); } return *(Vector4*)&result128; @@ -170,7 +221,7 @@ internal void FillOrCopyAndExpand(Span values) { for (int i = 0; i < this.Length; i++) { - this.Values[i] = (float)values[i]; + this.Values[i] = values[i]; } } } From 4728b97d85b7ad66d3d7975942ed9b99af3ad2fd Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 15 Aug 2024 16:44:41 +1000 Subject: [PATCH 3/8] Use dedicated property --- .../Transforms/Resize/ResizeKernel.cs | 150 ++++++++++-------- .../Transforms/ResizeKernelMapTests.cs | 3 +- 2 files changed, 84 insertions(+), 69 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 41afec892c..7a70caa3c6 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -23,6 +23,9 @@ internal readonly unsafe struct ResizeKernel /// /// Initializes a new instance of the struct. /// + /// The starting index for the destination row. + /// The pointer to the buffer with the convolution factors. + /// The length of the kernel. [MethodImpl(InliningOptions.ShortMethod)] internal ResizeKernel(int startIndex, float* bufferPtr, int length) { @@ -31,6 +34,15 @@ internal ResizeKernel(int startIndex, float* bufferPtr, int length) this.Length = length; } + /// + /// Gets a value indicating whether vectorization is supported. + /// + public static bool SupportsVectorization + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Vector256.IsHardwareAccelerated; + } + /// /// Gets the start index for the destination row. 
/// @@ -80,96 +92,99 @@ public Vector4 Convolve(Span rowSpan) [MethodImpl(InliningOptions.ShortMethod)] public Vector4 ConvolveCore(ref Vector4 rowStartRef) { - if (Vector512.IsHardwareAccelerated) + if (SupportsVectorization) { - float* bufferStart = this.bufferPtr; - ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~7); - Vector512 result512_0 = Vector512.Zero; - Vector512 result512_1 = Vector512.Zero; - - while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef)) + if (Vector512.IsHardwareAccelerated) { - Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); - Vector512 pixels512_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)4)); + float* bufferStart = this.bufferPtr; + ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~7); + Vector512 result512_0 = Vector512.Zero; + Vector512 result512_1 = Vector512.Zero; - result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); - result512_1 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart + 16), pixels512_1, result512_1); + while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef)) + { + Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); + Vector512 pixels512_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)4)); - bufferStart += 32; - rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)8); - } + result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); + result512_1 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart + 16), pixels512_1, result512_1); - result512_0 += result512_1; + bufferStart += 32; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)8); + } - if ((this.Length & 7) >= 4) - { - Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); - result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); + result512_0 += result512_1; - bufferStart += 16; - rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); - } + if ((this.Length & 7) >= 4) + { + Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); + result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); - Vector256 result256 = result512_0.GetLower() + result512_0.GetUpper(); + bufferStart += 16; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); + } - if ((this.Length & 3) >= 2) - { - Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); - result256 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256); + Vector256 result256 = result512_0.GetLower() + result512_0.GetUpper(); - bufferStart += 8; - rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); - } + if ((this.Length & 3) >= 2) + { + Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); + result256 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256); - Vector128 result128 = result256.GetLower() + result256.GetUpper(); + bufferStart += 8; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); + } - if ((this.Length & 1) != 0) - { - Vector128 pixels128 = Unsafe.As>(ref rowStartRef); - result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); - } + Vector128 result128 = result256.GetLower() + result256.GetUpper(); - return *(Vector4*)&result128; - } - else if (Vector256.IsHardwareAccelerated) - { - float* bufferStart = this.bufferPtr; - ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, 
this.Length & ~3); - Vector256 result256_0 = Vector256.Zero; - Vector256 result256_1 = Vector256.Zero; + if ((this.Length & 1) != 0) + { + Vector128 pixels128 = Unsafe.As>(ref rowStartRef); + result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); + } - while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef)) + return *(Vector4*)&result128; + } + else { - Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); - Vector256 pixels256_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)2)); + float* bufferStart = this.bufferPtr; + ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3); + Vector256 result256_0 = Vector256.Zero; + Vector256 result256_1 = Vector256.Zero; - result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); - result256_1 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart + 8), pixels256_1, result256_1); + while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef)) + { + Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); + Vector256 pixels256_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)2)); - bufferStart += 16; - rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); - } + result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); + result256_1 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart + 8), pixels256_1, result256_1); - result256_0 += result256_1; + bufferStart += 16; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); + } - if ((this.Length & 3) >= 2) - { - Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); - result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); + result256_0 += result256_1; - bufferStart += 8; - rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); - } + if ((this.Length & 3) >= 2) + { + Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); + result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); - Vector128 result128 = result256_0.GetLower() + result256_0.GetUpper(); + bufferStart += 8; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); + } - if ((this.Length & 1) != 0) - { - Vector128 pixels128 = Unsafe.As>(ref rowStartRef); - result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); - } + Vector128 result128 = result256_0.GetLower() + result256_0.GetUpper(); + + if ((this.Length & 1) != 0) + { + Vector128 pixels128 = Unsafe.As>(ref rowStartRef); + result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); + } - return *(Vector4*)&result128; + return *(Vector4*)&result128; + } } else { @@ -195,6 +210,7 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef) /// Copy the contents of altering /// to the value . /// + /// The new value for . [MethodImpl(InliningOptions.ShortMethod)] internal ResizeKernel AlterLeftValue(int left) => new(left, this.bufferPtr, this.Length); diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs index 337f8c75dc..6d0de65c42 100644 --- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs +++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs @@ -1,7 +1,6 @@ // Copyright (c) Six Labors. 
// Licensed under the Six Labors Split License. -using System.Runtime.Intrinsics; using System.Text; using SixLabors.ImageSharp.Processing; using SixLabors.ImageSharp.Processing.Processors.Transforms; @@ -142,7 +141,7 @@ private void VerifyKernelMapContentIsCorrect(TResampler resampler, i Span actualValues; ApproximateFloatComparer comparer; - if (Vector256.IsHardwareAccelerated) + if (ResizeKernel.SupportsVectorization) { comparer = new ApproximateFloatComparer(1e-4f); From 8c19a979eab6ddb9f1bd10400ab6aaca3aec561d Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 15 Aug 2024 17:24:14 +1000 Subject: [PATCH 4/8] Update ResizeKernelMap.cs --- .../Processing/Processors/Transforms/Resize/ResizeKernelMap.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs index cf74de1fcd..b52054d553 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs @@ -5,7 +5,6 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using System.Runtime.Intrinsics; using SixLabors.ImageSharp.Memory; namespace SixLabors.ImageSharp.Processing.Processors.Transforms; @@ -52,7 +51,7 @@ private ResizeKernelMap( this.DestinationLength = destinationLength; this.MaxDiameter = (radius * 2) + 1; - if (Vector256.IsHardwareAccelerated) + if (ResizeKernel.SupportsVectorization) { this.data = memoryAllocator.Allocate2D(this.MaxDiameter * 4, bufferHeight, preferContiguosImageBuffers: true); } From 7840665f04a5c1f5cc726082947de973202061c4 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 15 Aug 2024 20:41:42 +1000 Subject: [PATCH 5/8] Don't use FMA for 512 --- src/ImageSharp/Common/Helpers/Vector512Utilities.cs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 3325ad1aeb..8a9ba6aa44 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -138,9 +138,10 @@ public static Vector512 ConvertToInt32RoundToEven(Vector512 vector) /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 MultiplyAddEstimate(Vector512 a, Vector512 b, Vector512 c) - => Vector512.Create( - Vector256Utilities.MultiplyAddEstimate(a.GetLower(), b.GetLower(), c.GetLower()), - Vector256Utilities.MultiplyAddEstimate(a.GetUpper(), b.GetUpper(), c.GetUpper())); + + // Don't actually use FMA as it requires many more instruction to extract the + // upper and lower parts of the vector and then recombine them. 
=> (a * b) + c; [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } From 58f6afb3cd7edc3ab0763af6ad3c767caa7499be Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 22 Aug 2024 21:20:07 +1000 Subject: [PATCH 6/8] Update src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs Co-authored-by: Clinton Ingram --- .../Processing/Processors/Transforms/Resize/ResizeKernel.cs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 7a70caa3c6..6c1f7217a7 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -235,10 +235,7 @@ internal void FillOrCopyAndExpand(Span values) } else { - for (int i = 0; i < this.Length; i++) - { - this.Values[i] = values[i]; - } + values.CopyTo(this.Values); } } } From 0594035f945a14835daee6591550b315651a1ded Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 22 Aug 2024 21:20:17 +1000 Subject: [PATCH 7/8] Update src/ImageSharp/Common/Helpers/Numerics.cs Co-authored-by: Clinton Ingram --- src/ImageSharp/Common/Helpers/Numerics.cs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index e8f50b3eeb..621229ab0b 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -1166,13 +1166,9 @@ public static void Normalize(Span span, float sum) } else { - ref float startRef = ref MemoryMarshal.GetReference(span); - ref float endRef = ref Unsafe.Add(ref startRef, span.Length); - - while (Unsafe.IsAddressLessThan(ref startRef, ref endRef)) + for (int i = 0; i < span.Length; i++) { - startRef /= sum; - startRef = ref Unsafe.Add(ref startRef, (nuint)1); + span[i] /= sum; } } } From 72813ee5714dd6cc5731e5104615825c563932d6 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 22 Aug 2024 22:24:07 +1000 Subject: [PATCH 8/8] use Avx512F.FusedMultiplyAdd --- src/ImageSharp/Common/Helpers/Vector512Utilities.cs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 8a9ba6aa44..bcc3c9fa92 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -3,6 +3,7 @@ using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -138,9 +139,14 @@ public static Vector512 ConvertToInt32RoundToEven(Vector512 vector) /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 MultiplyAddEstimate(Vector512 a, Vector512 b, Vector512 c) + { + if (Avx512F.IsSupported) + { + return Avx512F.FusedMultiplyAdd(a, b, c); + } - // Don't actually use FMA as it requires many more instruction to extract the - // upper and lower parts of the vector and then recombine them. - => (a * b) + c; + return (a * b) + c; + } [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException();
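
For reference, the coefficient layout introduced by FillOrCopyAndExpand and consumed by ConvolveCore in the patches above can be summarised with a minimal scalar sketch. Each kernel weight is broadcast into a four-float (Vector4-sized) slot so the Vector256/Vector512 paths can load coefficients with the same stride as the RGBA pixels, and the accumulated result is an ordinary weighted sum. The class and method names below are assumptions chosen for the sketch only; they are not part of ImageSharp or of this patch series.

    using System;
    using System.Numerics;

    internal static class ExpandedKernelSketch
    {
        // Mirrors the idea behind FillOrCopyAndExpand:
        // [w0, w1, ...] -> [w0,w0,w0,w0, w1,w1,w1,w1, ...]
        // so one 256-bit load lines up with the weights for two Vector4 (RGBA) pixels.
        public static float[] Expand(ReadOnlySpan<float> weights)
        {
            float[] expanded = new float[weights.Length * 4];
            for (int i = 0; i < weights.Length; i++)
            {
                expanded.AsSpan(i * 4, 4).Fill(weights[i]);
            }

            return expanded;
        }

        // Scalar equivalent of ConvolveCore: a weighted sum over the source pixels,
        // which the vectorized paths accumulate two or four pixels at a time.
        public static Vector4 Convolve(ReadOnlySpan<Vector4> row, ReadOnlySpan<float> weights)
        {
            Vector4 result = Vector4.Zero;
            for (int i = 0; i < weights.Length; i++)
            {
                result += weights[i] * row[i];
            }

            return result;
        }
    }

This also makes the contract of the MultiplyAddEstimate helpers concrete: with or without FMA/AVX-512 support, each step must compute (a * b) + c, and the hardware paths may differ only in intermediate rounding.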