diff --git a/CMakeLists.txt b/CMakeLists.txt index eb3d358..3780085 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,6 @@ set(SPARKYUV_SOURCES src/ChannelLength.cpp src/YCbCrP16.cpp src/ChannelsReformat.cpp - src/Scale.cpp src/NV12Flyer.cpp src/NV16Flyer.cpp src/NV24Flyer.cpp @@ -44,7 +43,6 @@ set(SPARKYUV_SOURCES src/Rotate.cpp src/FastGaussian.cpp src/FastGaussian.h - src/GaussianBlur.cpp src/FastGaussianNeon.cpp src/FastGaussianNeon.h) @@ -53,8 +51,7 @@ set(HWY_SOURCES highway/hwy/nanobenchmark.cc highway/hwy/per_target.cc highway/hwy/timer.cc highway/hwy/abort.cc src/Eotf.cpp - src/Eotf-inl.h - src/GaussianBlur-inl.h) + src/Eotf-inl.h) if (BUILD_SHARED) add_library(sparkyuv SHARED ${SPARKYUV_SOURCES} ${HWY_SOURCES}) diff --git a/include/sparkyuv-basic.h b/include/sparkyuv-basic.h index 0671c05..659eecf 100644 --- a/include/sparkyuv-basic.h +++ b/include/sparkyuv-basic.h @@ -451,129 +451,6 @@ void FastGaussianNextBlurBGRAF16(uint16_t *data, uint32_t stride, uint32_t width void FastGaussianNextBlurBGRF16(uint16_t *data, uint32_t stride, uint32_t width, uint32_t height, int radius); #endif -/** - * Gaussian Blur. - * Not approximation just a gaussian blur, use when antialias or clear gaussian methods is needed. - * In-place use allowed - */ - -void GaussianBlurRGBA(const uint8_t *src, uint32_t srcStride, - uint8_t *dst, uint32_t dstStride, - uint32_t width, uint32_t height, - int kernelSize, float sigma); -void GaussianBlurRGB(const uint8_t *src, uint32_t srcStride, - uint8_t *dst, uint32_t dstStride, - uint32_t width, uint32_t height, - int kernelSize, float sigma); -void GaussianBlurChannel(const uint8_t *src, uint32_t srcStride, - uint8_t *dst, uint32_t dstStride, - uint32_t width, uint32_t height, - int kernelSize, float sigma); - -void GaussianBlurRGBA16(const uint16_t *src, uint32_t srcStride, - uint16_t *dst, uint32_t dstStride, - uint32_t width, uint32_t height, - int kernelSize, float sigma); -void GaussianBlurRGB16(const uint16_t *src, uint32_t srcStride, - uint16_t *dst, uint32_t dstStride, - uint32_t width, uint32_t height, - int kernelSize, float sigma); -void GaussianBlurChannel16(const uint16_t *src, uint32_t srcStride, - uint16_t *dst, uint32_t dstStride, - uint32_t width, uint32_t height, - int kernelSize, float sigma); - -void GaussianBlurRGBAF16(const uint16_t *src, uint32_t srcStride, - uint16_t *dst, uint32_t dstStride, - uint32_t width, uint32_t height, - int kernelSize, float sigma); -void GaussianBlurRGBF16(const uint16_t *src, uint32_t srcStride, - uint16_t *dst, uint32_t dstStride, - uint32_t width, uint32_t height, - int kernelSize, float sigma); -void GaussianBlurChannelF16(const uint16_t *src, uint32_t srcStride, - uint16_t *dst, uint32_t dstStride, - uint32_t width, uint32_t height, - int kernelSize, float sigma); - -void GaussianBlurRGBAF32(const float *src, uint32_t srcStride, - float *dst, uint32_t dstStride, - uint32_t width, uint32_t height, - int kernelSize, float sigma); -void GaussianBlurRGBF32(const float *src, uint32_t srcStride, - float *dst, uint32_t dstStride, - uint32_t width, uint32_t height, - int kernelSize, float sigma); -void GaussianBlurChannelF32(const float *src, uint32_t srcStride, - float *dst, uint32_t dstStride, - uint32_t width, uint32_t height, - int kernelSize, float sigma); - -/** - * Scaling functions - */ - -// Mark scale U8 - -void ScaleRGB(const uint8_t *input, uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint8_t *output, uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - SparkYuvSampler option); -void ScaleRGBA(const uint8_t *input, uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint8_t *output, uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - SparkYuvSampler option); -void ScaleChannel(const uint8_t *input, uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint8_t *output, uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - SparkYuvSampler option); -// Mark scale F16 - - -void ScaleRGBF16(const uint16_t *input, uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint16_t *output, uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - SparkYuvSampler option); -void ScaleRGBAF16(const uint16_t *input, uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint16_t *output, uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - SparkYuvSampler option); - -void ScaleChannelF16(const uint16_t *input, uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint16_t *output, uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - SparkYuvSampler option); - -// Mark: Scale RGBA1010102 - -void ScaleRGBA1010102(const uint8_t *input, uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint8_t *output, uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - SparkYuvSampler option); - -void ScaleRGBA16(const uint16_t *input, uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint16_t *output, uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - int depth, SparkYuvSampler option); -void ScaleRGB16(const uint16_t *input, uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint16_t *output, uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - int depth, SparkYuvSampler option); -void ScaleChannel16(const uint16_t *input, uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint16_t *output, uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - int depth, SparkYuvSampler option); - /** * Convert from U8 to F16 */ diff --git a/src/FastGaussianNeon.cpp b/src/FastGaussianNeon.cpp index 6891f9a..777fa48 100644 --- a/src/FastGaussianNeon.cpp +++ b/src/FastGaussianNeon.cpp @@ -76,7 +76,6 @@ void VerticalGaussianPassRGBANeon(uint8_t *data, src[px + 2] = vget_lane_u8(p8, 2); int32x4_t bufferValue1 = vld1q_s32(reinterpret_cast(&buffer[arrIndex][0])); - int32x4_t bufferValue2 = vld1q_s32(reinterpret_cast(&buffer[dArrIndex][0])); bufferValue2 = vshlq_n_s32(bufferValue2, 1); diff --git a/src/GaussianBlur-inl.h b/src/GaussianBlur-inl.h deleted file mode 100644 index 2ab523b..0000000 --- a/src/GaussianBlur-inl.h +++ /dev/null @@ -1,697 +0,0 @@ -// Copyright (C) 2024 Radzivon Bartoshyk -// -// This file belongs to sparkyuv project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#if defined(SPARKYUV_FLIP_INL_H) == defined(HWY_TARGET_TOGGLE) -#ifdef SPARKYUV_FLIP_INL_H -#undef SPARKYUV_FLIP_INL_H -#else -#define SPARKYUV_FLIP_INL_H -#endif - -#include "hwy/highway.h" -#include "yuv-inl.h" -#include "sparkyuv-internal.h" -#include "math/gaussian.h" -#include "hwy/aligned_allocator.h" -#include "concurrency.hpp" -#include "TypeSupport.h" - -HWY_BEFORE_NAMESPACE(); -namespace sparkyuv::HWY_NAMESPACE { - -using namespace hwy; -using namespace hwy::HWY_NAMESPACE; - -template::type = 0, - typename std::enable_if::value, int>::type = 0> -void -GaussianBlurHorizontalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, - T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, - const uint32_t startY, const uint32_t endY, - const uint32_t width, const uint32_t /* height */, - const float *mKernel, const int kernelSize) { - const int halfOfKernel = kernelSize / 2; - const bool isEven = kernelSize % 2 == 0; - const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel; - - auto mDst = reinterpret_cast(mDestination); - int maxWidth = static_cast(width) - 1; - int sZero = 0; - - const FixedTag d8x16; - const Half dh8; - const Rebind d16; - const FixedTag d8x4; - const FixedTag d32; - const FixedTag df; - using VF = Vec; - - for (uint32_t y = startY; y < endY; ++y) { - auto dst = reinterpret_cast(mDst + dstStride * y); - for (uint32_t x = 0; x < width; ++x) { - int r = -halfOfKernel; - VF acc = Zero(df); - auto localSource = reinterpret_cast(reinterpret_cast(mSource) + y * srcStride); - auto kx = static_cast(x); - - for (; r + 4 <= maxKernel && kx + r + 4 < width; r += 4) { - int sourcePX = std::clamp(kx + r, sZero, maxWidth) * 4; - auto vx = LoadU(d8x16, &localSource[sourcePX]); - auto i1 = ConvertTo(df, PromoteLowerTo(d32, LowerHalf(vx))); - auto i2 = ConvertTo(df, PromoteLowerTo(d32, UpperHalf(dh8, vx))); - auto i3 = ConvertTo(df, PromoteUpperTo(d32, PromoteTo(d16, LowerHalf(vx)))); - auto i4 = ConvertTo(df, PromoteUpperTo(d32, PromoteTo(d16, UpperHalf(dh8, vx)))); - - float weight1 = mKernel[halfOfKernel + r]; - acc = MulAdd(i1, Set(df, weight1), acc); - - float weight2 = mKernel[halfOfKernel + r + 1]; - acc = MulAdd(i2, Set(df, weight2), acc); - - float weight3 = mKernel[halfOfKernel + r + 2]; - acc = MulAdd(i3, Set(df, weight3), acc); - - float weight4 = mKernel[halfOfKernel + r + 3]; - acc = MulAdd(i4, Set(df, weight4), acc); - } - - for (; r <= maxKernel; ++r) { - float weight = mKernel[halfOfKernel + r]; - int sourcePX = std::clamp(kx + r, sZero, maxWidth) * 4; - auto vx = ConvertTo(df, PromoteTo(d32, LoadU(d8x4, &localSource[sourcePX]))); - acc = MulAdd(vx, Set(df, weight), acc); - } - acc = Round(acc); - auto newPX = DemoteTo(d8x4, ConvertTo(d32, acc)); - StoreU(newPX, d8x4, dst); - dst += 4; - } - } -} - -template::type = 0, - typename std::enable_if::value, int>::type = 0, - ENABLE_TYPE_IS_F16(T)> -void -GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, - T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, - const uint32_t startY, const uint32_t endY, - const uint32_t width, const uint32_t height, - const float *mKernel, const int kernelSize) { - const int halfOfKernel = kernelSize / 2; - const bool isEven = kernelSize % 2 == 0; - const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel; - - auto mDst = reinterpret_cast(mDestination); - int64_t maxHeight = static_cast(height) - 1; - - const FixedTag d16x8; - const FixedTag d16x4; - const FixedTag df16; - const FixedTag df16x4; - const FixedTag df; - using VF = Vec; - - for (uint32_t y = startY; y < endY; ++y) { - auto dst = reinterpret_cast(mDst + dstStride * y); - for (uint32_t x = 0; x < width; ++x) { - int r = -halfOfKernel; - - VF accumulator = Zero(df); - - auto kx = static_cast(x) * 4; - - for (; r <= maxKernel; ++r) { - uint32_t shiftX = std::clamp(static_cast(y) + static_cast(r), - static_cast(0), - static_cast(maxHeight)); - auto localSource = reinterpret_cast(reinterpret_cast(mSource) + shiftX * srcStride); - float weight = mKernel[halfOfKernel + r]; - VF pixelData; -#if SPARKYUV_ALLOW_FLOAT16 - const auto pxf16 = LoadU(df16x4, &localSource[kx]); - pixelData = PromoteTo(df, pxf16); -#else - const auto pxf16 = BitCast(df16x4, LoadU(d16x4, reinterpret_cast(&localSource[kx]))); - pixelData = PromoteTo(df, pxf16); -#endif - accumulator = MulAdd(pixelData, Set(df, weight), accumulator); - } - -#if SPARKYUV_ALLOW_FLOAT16 - StoreU(DemoteTo(df16x4, accumulator), df16x4, dst); -#else - auto duStore = BitCast(d16x4, DemoteTo(df16x4, accumulator)); - StoreU(duStore, d16x4, reinterpret_cast(dst)); -#endif - - dst += 4; - } - } -} - -template::type = 0, - typename std::enable_if::value, int>::type = 0, - ENABLE_TYPE_IS_F16(T)> -void -GaussianBlurHorizontalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, - T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, - const uint32_t startY, const uint32_t endY, - const uint32_t width, const uint32_t /* height */, - const float *mKernel, const int kernelSize) { - const int halfOfKernel = kernelSize / 2; - const bool isEven = kernelSize % 2 == 0; - const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel; - - auto mDst = reinterpret_cast(mDestination); - int maxWidth = static_cast(width) - 1; - int sZero = 0; - - const FixedTag d16x8; - const FixedTag d16x4; - const FixedTag df16; - const FixedTag df16x4; - const FixedTag df; - using VF = Vec; - - for (uint32_t y = startY; y < endY; ++y) { - auto dst = reinterpret_cast(mDst + dstStride * y); - auto localSource = reinterpret_cast(reinterpret_cast(mSource) + y * srcStride); - for (uint32_t x = 0; x < width; ++x) { - int r = -halfOfKernel; - - VF accumulator = Zero(df); - auto kx = static_cast(x); - - for (; r + 2 <= maxKernel && kx + x + 2 < width; r += 2) { - int sourcePX = std::clamp(kx + r, sZero, maxWidth) * 4; - auto movedSrc = localSource + sourcePX; - const float weight1 = mKernel[halfOfKernel + r]; - const float weight2 = mKernel[halfOfKernel + r + 1]; - VF pixelData1; - VF pixelData2; - -#if SPARKYUV_ALLOW_FLOAT16 - const auto pxf16 = LoadU(df16, movedSrc); - pixelData1 = PromoteLowerTo(df, pxf16); - pixelData2 = PromoteUpperTo(df, pxf16); -#else - const auto pxf16 = BitCast(df16, LoadU(d16x8, reinterpret_cast(movedSrc))); - pixelData1 = PromoteLowerTo(df, pxf16); - pixelData2 = PromoteUpperTo(df, pxf16); -#endif - - accumulator = MulAdd(pixelData1, Set(df, weight1), accumulator); - accumulator = MulAdd(pixelData2, Set(df, weight2), accumulator); - } - - for (; r <= maxKernel; ++r) { - float weight = mKernel[halfOfKernel + r]; - int sourcePX = std::clamp(kx + r, sZero, maxWidth) * 4; - auto movedSrc = localSource + sourcePX; - VF pixelData; -#if SPARKYUV_ALLOW_FLOAT16 - const auto pxf16 = LoadU(df16x4, movedSrc); - pixelData = PromoteTo(df, pxf16); -#else - const auto pxf16 = BitCast(df16x4, LoadU(d16x4, reinterpret_cast(movedSrc))); - pixelData = PromoteTo(df, pxf16); -#endif - accumulator = MulAdd(pixelData, Set(df, weight), accumulator); - } - -#if SPARKYUV_ALLOW_FLOAT16 - StoreU(DemoteTo(df16x4, accumulator), df16x4, dst); -#else - auto duStore = BitCast(d16x4, DemoteTo(df16x4, accumulator)); - StoreU(duStore, d16x4, reinterpret_cast(dst)); -#endif - - dst += 4; - } - } -} - -template::type = 0, - typename std::enable_if::value, int>::type = 0, - ENABLE_TYPE_IS_F16(T)> -void -GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, - T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, - const uint32_t startY, const uint32_t endY, - const uint32_t width, const uint32_t height, - const float *mKernel, const int kernelSize) { - const int halfOfKernel = kernelSize / 2; - const bool isEven = kernelSize % 2 == 0; - const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel; - - auto mDst = reinterpret_cast(mDestination); - int64_t maxHeight = static_cast(height) - 1; - - const FixedTag d8x4; - const FixedTag d32; - const FixedTag df; - using VF = Vec; - - for (uint32_t y = startY; y < endY; ++y) { - auto dst = reinterpret_cast(mDst + dstStride * y); - for (uint32_t x = 0; x < width; ++x) { - int r = -halfOfKernel; - VF acc = Zero(df); - - for (; r <= maxKernel; ++r) { - uint32_t shiftX = std::clamp(static_cast(y) + static_cast(r), - static_cast(0), - static_cast(maxHeight)); - auto localSource = reinterpret_cast(reinterpret_cast(mSource) + shiftX * srcStride); - float weight = mKernel[halfOfKernel + r]; - uint32_t sourcePX = x * 4; - auto vx = ConvertTo(df, PromoteTo(d32, LoadU(d8x4, &localSource[sourcePX]))); - acc = MulAdd(vx, Set(df, weight), acc); - } - acc = Round(acc); - auto newPX = DemoteTo(d8x4, ConvertTo(d32, acc)); - StoreU(newPX, d8x4, dst); - dst += 4; - } - } -} - -template::type = 0, - typename std::enable_if::value, int>::type = 0, - ENABLE_TYPE_IS_NOT_F16(T)> -void -GaussianBlurHorizontalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, - T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, - const uint32_t startY, const uint32_t endY, - const uint32_t width, const uint32_t /* height */, - const float *mKernel, const int kernelSize) { - const int halfOfKernel = kernelSize / 2; - const bool isEven = kernelSize % 2 == 0; - const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel; - - auto mDst = reinterpret_cast(mDestination); - int maxWidth = static_cast(width) - 1; - int sZero = 0; - - for (uint32_t y = startY; y < endY; ++y) { - auto dst = reinterpret_cast(mDst + dstStride * y); - for (uint32_t x = 0; x < width; ++x) { - int r = -halfOfKernel; - float accumulator1 = 0.f; - float accumulator2 = 0.f; - float accumulator3 = 0.f; - float accumulator4 = 0.f; - auto localSource = reinterpret_cast(reinterpret_cast(mSource) + y * srcStride); - auto kx = static_cast(x); - for (; r <= maxKernel; ++r) { - float weight = mKernel[halfOfKernel + r]; - int sourcePX = std::clamp(kx + r, sZero, maxWidth) * 4; - auto movedSrc = localSource + sourcePX; - // Stupid workaround to avoid errors where hwy f16 not really properly works - accumulator1 += LoadFloat(&movedSrc[0]) * weight; - accumulator2 += LoadFloat(&movedSrc[1]) * weight; - accumulator3 += LoadFloat(&movedSrc[2]) * weight; - accumulator4 += LoadFloat(&movedSrc[3]) * weight; - } - if (!std::is_same::value) { - StoreRoundedFloat(&dst[0], accumulator1); - StoreRoundedFloat(&dst[1], accumulator2); - StoreRoundedFloat(&dst[2], accumulator3); - StoreRoundedFloat(&dst[3], accumulator4); - } else { - StoreFloat(&dst[0], accumulator1); - StoreFloat(&dst[1], accumulator2); - StoreFloat(&dst[2], accumulator3); - StoreFloat(&dst[3], accumulator4); - } - dst += 4; - } - } -} - -template::type = 0> -void -GaussianBlurHorizontalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, - T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, - const uint32_t startY, const uint32_t endY, - const uint32_t width, const uint32_t /* height */, - const float *mKernel, const int kernelSize) { - const int halfOfKernel = kernelSize / 2; - const bool isEven = kernelSize % 2 == 0; - const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel; - - auto mDst = reinterpret_cast(mDestination); - int maxWidth = static_cast(width) - 1; - int sZero = 0; - - for (uint32_t y = startY; y < endY; ++y) { - auto dst = reinterpret_cast(mDst + dstStride * y); - for (uint32_t x = 0; x < width; ++x) { - int r = -halfOfKernel; - float accumulator1 = 0.f; - float accumulator2 = 0.f; - float accumulator3 = 0.f; - auto localSource = reinterpret_cast(reinterpret_cast(mSource) + y * srcStride); - auto kx = static_cast(x); - for (; r <= maxKernel; ++r) { - float weight = mKernel[halfOfKernel + r]; - int sourcePX = std::clamp(kx + r, sZero, maxWidth) * 3; - accumulator1 += LoadFloat(&localSource[sourcePX]) * weight; - accumulator2 += LoadFloat(&localSource[sourcePX + 1]) * weight; - accumulator3 += LoadFloat(&localSource[sourcePX + 2]) * weight; - } - if (!std::is_same::value) { - StoreRoundedFloat(&dst[0], accumulator1); - StoreRoundedFloat(&dst[1], accumulator2); - StoreRoundedFloat(&dst[2], accumulator3); - } else { - StoreFloat(&dst[0], accumulator1); - StoreFloat(&dst[1], accumulator2); - StoreFloat(&dst[2], accumulator3); - } - dst += 3; - } - } -} - -template::type = 0> -void -GaussianBlurHorizontalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, - T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, - const uint32_t startY, const uint32_t endY, - const uint32_t width, const uint32_t /* height */, - const float *mKernel, const int kernelSize) { - const int halfOfKernel = kernelSize / 2; - const bool isEven = kernelSize % 2 == 0; - const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel; - - auto mDst = reinterpret_cast(mDestination); - int maxWidth = static_cast(width) - 1; - int sZero = 0; - - for (uint32_t y = startY; y < endY; ++y) { - auto dst = reinterpret_cast(mDst + dstStride * y); - for (uint32_t x = 0; x < width; ++x) { - int r = -halfOfKernel; - float accumulator = 0.f; - auto localSource = reinterpret_cast(reinterpret_cast(mSource) + y * srcStride); - auto kx = static_cast(x); - for (; r <= maxKernel; ++r) { - accumulator += LoadFloat(&localSource[std::clamp(kx + r, sZero, maxWidth)]) * mKernel[halfOfKernel + r]; - } - if (!std::is_same::value) { - StoreRoundedFloat(&dst[0], accumulator); - } else { - StoreFloat(&dst[0], accumulator); - } - dst += 1; - } - } -} - -template::type = 0> -void -GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, - T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, - const uint32_t startY, const uint32_t endY, - const uint32_t width, const uint32_t height, - const float *mKernel, const int kernelSize) { - const int halfOfKernel = kernelSize / 2; - const bool isEven = kernelSize % 2 == 0; - const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel; - - auto mDst = reinterpret_cast(mDestination); - int64_t maxHeight = static_cast(height) - 1; - - for (uint32_t y = startY; y < endY; ++y) { - auto dst = reinterpret_cast(mDst + dstStride * y); - for (uint32_t x = 0; x < width; ++x) { - int r = -halfOfKernel; - float accumulator = 0.f; - auto kx = static_cast(x); - for (; r <= maxKernel; ++r) { - uint32_t shiftX = std::clamp(static_cast(y) + static_cast(r), - static_cast(0), - static_cast(maxHeight)); - auto localSource = reinterpret_cast(reinterpret_cast(mSource) + shiftX * srcStride); - // Stupid workaround to avoid errors where hwy f16 not really properly works - accumulator += LoadFloat(&localSource[kx]) * mKernel[halfOfKernel + r]; - } - if (!std::is_same::value) { - StoreRoundedFloat(&dst[0], accumulator); - } else { - StoreFloat(&dst[0], accumulator); - } - dst += 1; - } - } -} - -template::type = 0> -void -GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, - T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, - const uint32_t startY, const uint32_t endY, - const uint32_t width, const uint32_t height, - const float *mKernel, const int kernelSize) { - const int halfOfKernel = kernelSize / 2; - const bool isEven = kernelSize % 2 == 0; - const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel; - - auto mDst = reinterpret_cast(mDestination); - int64_t maxHeight = static_cast(height) - 1; - - for (uint32_t y = startY; y < endY; ++y) { - auto dst = reinterpret_cast(mDst + dstStride * y); - for (uint32_t x = 0; x < width; ++x) { - int r = -halfOfKernel; - float accumulator = 0.f; - float accumulator1 = 0.f; - float accumulator2 = 0.f; - auto kx = static_cast(x) * 3; - for (; r <= maxKernel; ++r) { - uint32_t shiftX = std::clamp(static_cast(y) + static_cast(r), - static_cast(0), - static_cast(maxHeight)); - auto localSource = reinterpret_cast(reinterpret_cast(mSource) + shiftX * srcStride); - float weight = mKernel[halfOfKernel + r]; - accumulator += LoadFloat(&localSource[kx]) * weight; - accumulator1 += LoadFloat(&localSource[kx + 1]) * weight; - accumulator2 += LoadFloat(&localSource[kx + 2]) * weight; - } - if (!std::is_same::value) { - StoreRoundedFloat(&dst[0], accumulator); - StoreRoundedFloat(&dst[1], accumulator1); - StoreRoundedFloat(&dst[2], accumulator2); - } else { - StoreFloat(&dst[0], accumulator); - StoreFloat(&dst[1], accumulator1); - StoreFloat(&dst[2], accumulator2); - } - dst += 3; - } - } -} - -template::type = 0, - typename std::enable_if::value, int>::type = 0, - HWY_IF_NOT_F16(T)> -void -GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, - T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, - const uint32_t startY, const uint32_t endY, - const uint32_t width, const uint32_t height, - const float *mKernel, const int kernelSize) { - const int halfOfKernel = kernelSize / 2; - const bool isEven = kernelSize % 2 == 0; - const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel; - - auto mDst = reinterpret_cast(mDestination); - int64_t maxHeight = static_cast(height) - 1; - - for (uint32_t y = startY; y < endY; ++y) { - auto dst = reinterpret_cast(mDst + dstStride * y); - for (uint32_t x = 0; x < width; ++x) { - int r = -halfOfKernel; - float accumulator = 0.f; - float accumulator1 = 0.f; - float accumulator2 = 0.f; - float accumulator3 = 0.f; - auto kx = static_cast(x) * 4; - for (; r <= maxKernel; ++r) { - uint32_t shiftX = std::clamp(static_cast(y) + static_cast(r), - static_cast(0), - static_cast(maxHeight)); - auto localSource = reinterpret_cast(reinterpret_cast(mSource) + shiftX * srcStride); - float weight = mKernel[halfOfKernel + r]; - // Stupid workaround to avoid errors where hwy f16 not really properly works - accumulator += LoadFloat(&localSource[kx]) * weight; - accumulator1 += LoadFloat(&localSource[kx + 1]) * weight; - accumulator2 += LoadFloat(&localSource[kx + 2]) * weight; - accumulator3 += LoadFloat(&localSource[kx + 3]) * weight; - } - if (!std::is_same::value) { - StoreRoundedFloat(&dst[0], accumulator); - StoreRoundedFloat(&dst[1], accumulator1); - StoreRoundedFloat(&dst[2], accumulator2); - StoreRoundedFloat(&dst[3], accumulator3); - } else { - StoreFloat(&dst[0], accumulator); - StoreFloat(&dst[1], accumulator1); - StoreFloat(&dst[2], accumulator2); - StoreFloat(&dst[3], accumulator3); - } - dst += 4; - } - } -} - -template::type = 0, - typename std::enable_if::value, int>::type = 0> -void -GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, - T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, - const uint32_t startY, const uint32_t endY, - const uint32_t width, const uint32_t height, - const float *mKernel, const int kernelSize) { - const int halfOfKernel = kernelSize / 2; - const bool isEven = kernelSize % 2 == 0; - const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel; - - auto mDst = reinterpret_cast(mDestination); - int64_t maxHeight = static_cast(height) - 1; - - const FixedTag d8x4; - const FixedTag d32; - const FixedTag df; - using VF = Vec; - - for (uint32_t y = startY; y < endY; ++y) { - auto dst = reinterpret_cast(mDst + dstStride * y); - for (uint32_t x = 0; x < width; ++x) { - int r = -halfOfKernel; - VF acc = Zero(df); - - for (; r <= maxKernel; ++r) { - uint32_t shiftX = std::clamp(static_cast(y) + static_cast(r), - static_cast(0), - static_cast(maxHeight)); - auto localSource = reinterpret_cast(reinterpret_cast(mSource) + shiftX * srcStride); - float weight = mKernel[halfOfKernel + r]; - uint32_t sourcePX = x * 4; - auto vx = ConvertTo(df, PromoteTo(d32, LoadU(d8x4, &localSource[sourcePX]))); - acc = MulAdd(vx, Set(df, weight), acc); - } - acc = Round(acc); - auto newPX = DemoteTo(d8x4, ConvertTo(d32, acc)); - StoreU(newPX, d8x4, dst); - dst += 4; - } - } -} - -template -void -GaussianBlurImpl(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, - T *SPARKYUV_RESTRICT mDestination, const uint32_t newStride, - const uint32_t width, const uint32_t height, const int kernelSize, - const float sigma) { - const auto kernel = Get1DGaussianKernel(kernelSize, sigma); - const auto transient = hwy::AllocateAligned(newStride * height); - const auto threadCount = concurrency::getThreadCounts(width, height); - const auto alignedKernel = hwy::AllocateAligned(kernel.size()); - std::copy(kernel.begin(), kernel.end(), alignedKernel.get()); - concurrency::parallel_for_segment(threadCount, height, [&](int start, int end) { - GaussianBlurHorizontalPass(mSource, - srcStride, - reinterpret_cast(transient.get()), - newStride, - start, - end, - width, - height, - reinterpret_cast(alignedKernel.get()), - kernel.size()); - }); - - concurrency::parallel_for_segment(threadCount, height, [&](int start, int end) { - GaussianBlurVerticalPass(reinterpret_cast(transient.get()), - newStride, - mDestination, - newStride, - start, - end, - width, - height, - reinterpret_cast(alignedKernel.get()), - kernel.size()); - }); -} - -#define GAUSSIAN_BLUR_DECLARATION_R(srcPixel, storageType, surfaceType) \ - void GaussianBlur##srcPixel##HWY(const storageType *SPARKYUV_RESTRICT src, const uint32_t srcStride,\ - storageType *SPARKYUV_RESTRICT dst, const uint32_t dstStride,\ - const uint32_t width, const uint32_t height, \ - const int kernelSize, const float sigma) {\ - GaussianBlurImpl(src, srcStride, dst, dstStride,\ - width, height, kernelSize, sigma); \ - } - -GAUSSIAN_BLUR_DECLARATION_R(RGBA, uint8_t, CHANNELS_4) -GAUSSIAN_BLUR_DECLARATION_R(RGB, uint8_t, CHANNELS_3) -GAUSSIAN_BLUR_DECLARATION_R(Channel, uint8_t, CHANNEL) - -GAUSSIAN_BLUR_DECLARATION_R(RGBA16, uint16_t, CHANNELS_4) -GAUSSIAN_BLUR_DECLARATION_R(RGB16, uint16_t, CHANNELS_3) -GAUSSIAN_BLUR_DECLARATION_R(Channel16, uint16_t, CHANNEL) - -GAUSSIAN_BLUR_DECLARATION_R(RGBAF32, float, CHANNELS_4) -GAUSSIAN_BLUR_DECLARATION_R(RGBF32, float, CHANNELS_3) -GAUSSIAN_BLUR_DECLARATION_R(ChannelF32, float, CHANNEL) - -#undef GAUSSIAN_BLUR_DECLARATION_R - -#define GAUSSIAN_BLUR_DECLARATION_R_F16(srcPixel, surfaceType) \ - void GaussianBlur##srcPixel##HWY(const uint16_t *SPARKYUV_RESTRICT src, const uint32_t srcStride,\ - uint16_t *SPARKYUV_RESTRICT dst, const uint32_t dstStride,\ - const uint32_t width, const uint32_t height, \ - const int kernelSize, const float sigma) {\ - GaussianBlurImpl(reinterpret_cast(src), \ - srcStride, reinterpret_cast(dst), dstStride, width, height, kernelSize, sigma); \ - } - -GAUSSIAN_BLUR_DECLARATION_R_F16(RGBAF16, CHANNELS_4) -GAUSSIAN_BLUR_DECLARATION_R_F16(RGBF16, CHANNELS_3) -GAUSSIAN_BLUR_DECLARATION_R_F16(ChannelF16, CHANNEL) - -#undef GAUSSIAN_BLUR_DECLARATION_R_F16 - -} -HWY_AFTER_NAMESPACE(); - -#endif \ No newline at end of file diff --git a/src/GaussianBlur.cpp b/src/GaussianBlur.cpp deleted file mode 100644 index 3a7c46d..0000000 --- a/src/GaussianBlur.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (C) 2024 Radzivon Bartoshyk -// -// This file belongs to sparkyuv project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "sparkyuv.h" - -#undef HWY_TARGET_INCLUDE -#define HWY_TARGET_INCLUDE "src/GaussianBlur.cpp" - -#include "hwy/foreach_target.h" -#include "hwy/highway.h" -#include "yuv-inl.h" -#include "GaussianBlur-inl.h" - -#if HWY_ONCE -namespace sparkyuv { -#define GAUSSIAN_BLUR_DECLARE_EXPORT(srcPixel) HWY_EXPORT(GaussianBlur##srcPixel##HWY); -GAUSSIAN_BLUR_DECLARE_EXPORT(RGBA) -GAUSSIAN_BLUR_DECLARE_EXPORT(RGB) -GAUSSIAN_BLUR_DECLARE_EXPORT(Channel) -GAUSSIAN_BLUR_DECLARE_EXPORT(RGBA16) -GAUSSIAN_BLUR_DECLARE_EXPORT(RGB16) -GAUSSIAN_BLUR_DECLARE_EXPORT(Channel16) -GAUSSIAN_BLUR_DECLARE_EXPORT(RGBAF16) -GAUSSIAN_BLUR_DECLARE_EXPORT(RGBF16) -GAUSSIAN_BLUR_DECLARE_EXPORT(ChannelF16) -GAUSSIAN_BLUR_DECLARE_EXPORT(RGBAF32) -GAUSSIAN_BLUR_DECLARE_EXPORT(RGBF32) -GAUSSIAN_BLUR_DECLARE_EXPORT(ChannelF32) -#undef GAUSSIAN_BLUR_DECLARE_EXPORT - -#define GAUSSIAN_BLUR_DECLARATION_E(srcPixel, storageType) \ - void GaussianBlur##srcPixel(const storageType *SPARKYUV_RESTRICT src, const uint32_t srcStride,\ - storageType *SPARKYUV_RESTRICT dst, const uint32_t dstStride,\ - const uint32_t width, const uint32_t height, \ - const int kernelSize, const float sigma) {\ - HWY_DYNAMIC_DISPATCH(GaussianBlur##srcPixel##HWY)(src, srcStride, dst, dstStride,\ - width, height, kernelSize, sigma); \ - } - -GAUSSIAN_BLUR_DECLARATION_E(RGBA, uint8_t) -GAUSSIAN_BLUR_DECLARATION_E(RGB, uint8_t) -GAUSSIAN_BLUR_DECLARATION_E(Channel, uint8_t) - -GAUSSIAN_BLUR_DECLARATION_E(RGBA16, uint16_t) -GAUSSIAN_BLUR_DECLARATION_E(RGB16, uint16_t) -GAUSSIAN_BLUR_DECLARATION_E(Channel16, uint16_t) - -GAUSSIAN_BLUR_DECLARATION_E(RGBAF32, float) -GAUSSIAN_BLUR_DECLARATION_E(RGBF32, float) -GAUSSIAN_BLUR_DECLARATION_E(ChannelF32, float) - -#undef GAUSSIAN_BLUR_DECLARATION_E - -#define GAUSSIAN_BLUR_DECLARATION_R_F16(srcPixel, surfaceType) \ - void GaussianBlur##srcPixel(const uint16_t *SPARKYUV_RESTRICT src, const uint32_t srcStride,\ - uint16_t *SPARKYUV_RESTRICT dst, const uint32_t dstStride,\ - const uint32_t width, const uint32_t height, \ - const int kernelSize, const float sigma) {\ - HWY_DYNAMIC_DISPATCH(GaussianBlur##srcPixel##HWY)(src, srcStride, dst, dstStride, \ - width, height, kernelSize, sigma); \ - } - -GAUSSIAN_BLUR_DECLARATION_R_F16(RGBAF16, CHANNELS_4) -GAUSSIAN_BLUR_DECLARATION_R_F16(RGBF16, CHANNELS_3) -GAUSSIAN_BLUR_DECLARATION_R_F16(ChannelF16, CHANNEL) - -#undef GAUSSIAN_BLUR_DECLARATION_R_F16 -} -#endif \ No newline at end of file diff --git a/src/Scale.cpp b/src/Scale.cpp deleted file mode 100644 index 2cb8a94..0000000 --- a/src/Scale.cpp +++ /dev/null @@ -1,729 +0,0 @@ -// Copyright (C) 2024 Radzivon Bartoshyk -// -// This file belongs to sparkyuv project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#undef HWY_TARGET_INCLUDE -#define HWY_TARGET_INCLUDE "src/Scale.cpp" - -#include "hwy/foreach_target.h" -#include "hwy/highway.h" -#include "yuv-inl.h" -#include "sampler/NearestRowSampler-inl.hpp" -#include "sampler/BilinearRowSampler-inl.hpp" -#include "sampler/Window4RowSampler-inl.hpp" -#include "sampler/Window6RowSampler-inl.hpp" -#include "sampler/BoxRowSampler-inl.h" -#include "concurrency.hpp" - -HWY_BEFORE_NAMESPACE(); -namespace sparkyuv::HWY_NAMESPACE { - -template -void ScaleRGB16OrChannelHWY(const uint16_t *input, const uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint16_t *output, const uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - const int depth, const SparkYuvSampler option) { - - auto src = reinterpret_cast(input); - - std::unique_ptr> sampler; - switch (option) { - case box: { - sampler = std::make_unique>(src, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case hermite: { - sampler = std::make_unique>(src, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight, - depth); - } - break; - case catmullRom: { - sampler = std::make_unique>(src, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight, - depth); - } - break; - case bSpline: { - sampler = std::make_unique>(src, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight, - depth); - } - break; - case cubic: { - sampler = std::make_unique>(src, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight, depth); - } - break; - case bicubic: { - sampler = std::make_unique>(src, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight, - depth); - } - break; - case mitchell: { - sampler = std::make_unique>(src, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight, - depth); - } - break; - case lanczos: { - sampler = std::make_unique>(src, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight, - depth); - } - break; - case bilinear: { - sampler = std::make_unique>(src, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - default: { - sampler = std::make_unique>(src, srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - } - - const int threadCount = std::clamp(std::min(static_cast(std::thread::hardware_concurrency()), - static_cast(outputHeight * outputWidth / (256 * 256))), - static_cast(1), static_cast(12)); - - concurrency::parallel_for(threadCount, outputHeight, [&](int iterationId) { - sampler->sample(iterationId); - }); -} - -template -void ScaleRGB8OrChannelHWY(const uint8_t *input, - const uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint8_t *output, - const uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - const SparkYuvSampler option) { - - auto src8 = reinterpret_cast(input); - - std::unique_ptr> sampler; - switch (option) { - case box: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case hermite: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case catmullRom: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case bSpline: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case cubic: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case bicubic: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case mitchell: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case lanczos: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case bilinear: { - if (Components == 4) { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } else { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - } - break; - default: { - sampler = std::make_unique>(src8, srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - } - - const int threadCount = std::clamp(std::min(static_cast(std::thread::hardware_concurrency()), - static_cast(outputHeight * outputWidth / (256 * 256))), - static_cast(1), static_cast(12)); - - concurrency::parallel_for(threadCount, outputHeight, [&](int iterationId) { - sampler->sample(iterationId); - }); -} - -void ScaleRGB1010102OrChannelHWY(const uint8_t *input, - const uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint8_t *output, - const uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - const SparkYuvSampler option) { - - auto src8 = reinterpret_cast(input); - - std::unique_ptr> sampler; - switch (option) { - case box: { - sampler = - std::make_unique>(reinterpret_cast(src8), - srcStride, - inputWidth, - inputHeight, - reinterpret_cast(output), - dstStride, - outputWidth, - outputHeight); - } - break; - case hermite: { - sampler = - std::make_unique>(reinterpret_cast(src8), - srcStride, - inputWidth, - inputHeight, - reinterpret_cast(output), - dstStride, - outputWidth, - outputHeight); - } - break; - case catmullRom: { - sampler = - std::make_unique>(reinterpret_cast(src8), - srcStride, - inputWidth, - inputHeight, - reinterpret_cast(output), - dstStride, - outputWidth, - outputHeight); - } - break; - case bSpline: { - sampler = - std::make_unique>(reinterpret_cast(src8), - srcStride, - inputWidth, - inputHeight, - reinterpret_cast(output), - dstStride, - outputWidth, - outputHeight); - } - break; - case cubic: { - sampler = - std::make_unique>(reinterpret_cast(src8), - srcStride, - inputWidth, - inputHeight, - reinterpret_cast(output), - dstStride, - outputWidth, - outputHeight); - } - break; - case bicubic: { - sampler = - std::make_unique>(reinterpret_cast(src8), - srcStride, - inputWidth, - inputHeight, - reinterpret_cast(output), - dstStride, - outputWidth, - outputHeight); - } - break; - case mitchell: { - sampler = - std::make_unique>(reinterpret_cast(src8), - srcStride, - inputWidth, - inputHeight, - reinterpret_cast(output), - dstStride, - outputWidth, - outputHeight); - } - break; - case lanczos: { - sampler = - std::make_unique>(reinterpret_cast(src8), - srcStride, - inputWidth, - inputHeight, - reinterpret_cast(output), - dstStride, - outputWidth, - outputHeight); - } - break; - case bilinear: { - sampler = std::make_unique(reinterpret_cast(src8), - srcStride, - inputWidth, - inputHeight, - reinterpret_cast(output), - dstStride, - outputWidth, - outputHeight); - } - break; - default: { - sampler = std::make_unique(reinterpret_cast(src8), - srcStride, - inputWidth, - inputHeight, - reinterpret_cast(output), - dstStride, - outputWidth, - outputHeight); - } - break; - } - - const int threadCount = std::clamp(std::min(static_cast(std::thread::hardware_concurrency()), - static_cast(outputHeight * outputWidth / (256 * 256))), - static_cast(1), static_cast(12)); - - concurrency::parallel_for(threadCount, outputHeight, [&](int iterationId) { - sampler->sample(iterationId); - }); -} - -template -void ScaleRGB16FOrChannelHWY(const uint16_t *input, - const uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint16_t *output, const uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - const SparkYuvSampler option) { - - auto src8 = reinterpret_cast(input); - - std::unique_ptr> sampler; - switch (option) { - case box: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case hermite: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case catmullRom: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case bSpline: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case cubic: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case bicubic: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case mitchell: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case lanczos: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - case bilinear: { - sampler = std::make_unique>(src8, - srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - default: { - sampler = std::make_unique>(src8, srcStride, - inputWidth, - inputHeight, - output, - dstStride, - outputWidth, - outputHeight); - } - break; - } - - const int threadCount = std::clamp(std::min(static_cast(std::thread::hardware_concurrency()), - static_cast(outputHeight * outputWidth / (256 * 256))), - static_cast(1), static_cast(12)); - - concurrency::parallel_for(threadCount, outputHeight, [&](int iterationId) { - sampler->sample(iterationId); - }); -} - -#define SCALE_CHANNEL_16_TYPE(channelName, channelsCount) \ - void Scale##channelName##HWY(const uint16_t *input, const uint32_t srcStride,\ - uint32_t inputWidth, uint32_t inputHeight,\ - uint16_t *output, const uint32_t dstStride,\ - uint32_t outputWidth, uint32_t outputHeight, \ - const int depth, const SparkYuvSampler option) {\ - ScaleRGB16OrChannelHWY(input, srcStride, inputWidth, inputHeight, output, dstStride, outputWidth, outputHeight, depth, option);\ - } - -SCALE_CHANNEL_16_TYPE(Channel16, 1) -SCALE_CHANNEL_16_TYPE(RGB16, 3) -SCALE_CHANNEL_16_TYPE(RGBA16, 4) - -#undef SCALE_CHANNEL_16_TYPE - -#define SCALE_CHANNEL_TYPE(channelName, channelsCount) \ - void Scale##channelName##HWY(const uint8_t *input, const uint32_t srcStride,\ - uint32_t inputWidth, uint32_t inputHeight,\ - uint8_t *output,\ - const uint32_t dstStride,\ - uint32_t outputWidth, uint32_t outputHeight,\ - const SparkYuvSampler option) {\ - ScaleRGB8OrChannelHWY(input, srcStride, inputWidth, inputHeight, output, dstStride, outputWidth, outputHeight, option);\ - } - -SCALE_CHANNEL_TYPE(Channel, 1) -SCALE_CHANNEL_TYPE(RGB, 3) -SCALE_CHANNEL_TYPE(RGBA, 4) - -#undef SCALE_CHANNEL_TYPE - -void ScaleRGBA1010102HWY(const uint8_t *input, const uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint8_t *output, - const uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - const SparkYuvSampler option) { - ScaleRGB1010102OrChannelHWY(input, srcStride, - inputWidth, inputHeight, - output, dstStride, outputWidth, outputHeight, option); - -} - -#define SCALE_CHANNEL_F16_TYPE(channelName, channelsCount) \ - void Scale##channelName##HWY(const uint16_t *input, const uint32_t srcStride,\ - uint32_t inputWidth, uint32_t inputHeight,\ - uint16_t *output,\ - const uint32_t dstStride,\ - uint32_t outputWidth, uint32_t outputHeight,\ - const SparkYuvSampler option) {\ - ScaleRGB16FOrChannelHWY(input, srcStride, inputWidth, inputHeight, output, dstStride, outputWidth, outputHeight, option);\ - } - -SCALE_CHANNEL_F16_TYPE(ChannelF16, 1) -SCALE_CHANNEL_F16_TYPE(RGBF16, 3) -SCALE_CHANNEL_F16_TYPE(RGBAF16, 4) - -#undef SCALE_CHANNEL_F16_TYPE -} -HWY_AFTER_NAMESPACE(); - -#if HWY_ONCE -namespace sparkyuv { -#define SCALE_CHANNEL_TYPE_DECLARE_HWY(channelName) HWY_EXPORT(Scale##channelName##HWY); - -SCALE_CHANNEL_TYPE_DECLARE_HWY(Channel) -SCALE_CHANNEL_TYPE_DECLARE_HWY(RGB) -SCALE_CHANNEL_TYPE_DECLARE_HWY(RGBA) - -SCALE_CHANNEL_TYPE_DECLARE_HWY(Channel16) -SCALE_CHANNEL_TYPE_DECLARE_HWY(RGB16) -SCALE_CHANNEL_TYPE_DECLARE_HWY(RGBA16) - -SCALE_CHANNEL_TYPE_DECLARE_HWY(ChannelF16) -SCALE_CHANNEL_TYPE_DECLARE_HWY(RGBF16) -SCALE_CHANNEL_TYPE_DECLARE_HWY(RGBAF16) - -#undef SCALE_CHANNEL_TYPE_DECLARE_HWY - -#define SCALE_CHANNEL_DECLARE_E(channelName, channelsCount) \ - HWY_DLLEXPORT void Scale##channelName(const uint8_t *input, const uint32_t srcStride,\ - uint32_t inputWidth, uint32_t inputHeight,\ - uint8_t *output,\ - const uint32_t dstStride,\ - uint32_t outputWidth, uint32_t outputHeight,\ - const SparkYuvSampler option) {\ - HWY_DYNAMIC_DISPATCH(Scale##channelName##HWY)(input, srcStride, inputWidth, inputHeight, \ - output, dstStride, outputWidth, outputHeight, option);\ - } - -SCALE_CHANNEL_DECLARE_E(Channel, 1) -SCALE_CHANNEL_DECLARE_E(RGB, 3) -SCALE_CHANNEL_DECLARE_E(RGBA, 4) - -#undef SCALE_CHANNEL_DECLARE_E - -#define SCALE_CHANNEL_F16_DECLARE_E(channelName, channelsCount) \ - HWY_DLLEXPORT void Scale##channelName(const uint16_t *input, const uint32_t srcStride,\ - uint32_t inputWidth, uint32_t inputHeight,\ - uint16_t *output, const uint32_t dstStride,\ - uint32_t outputWidth, uint32_t outputHeight,\ - const SparkYuvSampler option) {\ - HWY_DYNAMIC_DISPATCH(Scale##channelName##HWY)(input, srcStride, inputWidth, inputHeight, \ - output, dstStride, outputWidth, outputHeight, option);\ - } - -SCALE_CHANNEL_F16_DECLARE_E(ChannelF16, 1) -SCALE_CHANNEL_F16_DECLARE_E(RGBF16, 3) -SCALE_CHANNEL_F16_DECLARE_E(RGBAF16, 4) - -#undef SCALE_CHANNEL_F16_DECLARE_E - -HWY_EXPORT(ScaleRGBA1010102HWY); - -HWY_DLLEXPORT void ScaleRGBA1010102(const uint8_t *input, const uint32_t srcStride, - uint32_t inputWidth, uint32_t inputHeight, - uint8_t *output, const uint32_t dstStride, - uint32_t outputWidth, uint32_t outputHeight, - const SparkYuvSampler option) { - HWY_DYNAMIC_DISPATCH(ScaleRGBA1010102HWY)(input, srcStride, - inputWidth, inputHeight, - output, dstStride, outputWidth, outputHeight, option); - -} - -#define SCALE_CHANNEL_16_TYPE_E(channelName) \ - void Scale##channelName(const uint16_t *input, const uint32_t srcStride,\ - uint32_t inputWidth, uint32_t inputHeight,\ - uint16_t *output, const uint32_t dstStride,\ - uint32_t outputWidth, uint32_t outputHeight, \ - const int depth, const SparkYuvSampler option) {\ - HWY_DYNAMIC_DISPATCH(Scale##channelName##HWY)(input, srcStride, inputWidth, inputHeight, output, dstStride, \ - outputWidth, outputHeight, depth, option);\ - } - -SCALE_CHANNEL_16_TYPE_E(Channel16) -SCALE_CHANNEL_16_TYPE_E(RGB16) -SCALE_CHANNEL_16_TYPE_E(RGBA16) - -#undef SCALE_CHANNEL_16_TYPE - -} -#endif \ No newline at end of file diff --git a/src/concurrency.hpp b/src/concurrency.hpp index e27c0e1..5a023fa 100644 --- a/src/concurrency.hpp +++ b/src/concurrency.hpp @@ -152,7 +152,7 @@ void parallel_for_segment(const int numThreads, const uint32_t numIterations, Fu #if THREADS_SUPPORTED std::vector threads; - int segmentHeight = numIterations / static_cast(numThreads); + auto segmentHeight = static_cast(numIterations) / static_cast(numThreads); auto parallelWorker = [&](int start, int end) { std::invoke(func, start, end, std::forward(args)...); @@ -170,8 +170,8 @@ void parallel_for_segment(const int numThreads, const uint32_t numIterations, Fu } } - int start = 0; - int end = segmentHeight; + uint32_t start = 0; + uint32_t end = segmentHeight; if (numThreads == 1) { end = numIterations; } diff --git a/src/math/gaussian.h b/src/math/gaussian.h deleted file mode 100644 index 984faae..0000000 --- a/src/math/gaussian.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (C) 2024 Radzivon Bartoshyk -// -// This file belongs to sparkyuv project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef YUV_SRC_MATH_GAUSSIAN_H_ -#define YUV_SRC_MATH_GAUSSIAN_H_ - -#include -#include -#include - -#ifdef _MSC_VER -#define _USE_MATH_DEFINES -#include -#endif -#include - -#ifndef M_PI_F -#define M_PI_F 3.14159265358979323846 -#endif - -namespace { -using namespace std; -vector Get1DGaussianKernel(int width, float sigma) { - vector kernel(ceil(width)); - int mean = width / 2; - float sum = 0.f; - const float scale = 1.f / (::sqrtf(2.f * M_PI_F) * sigma); - for (int x = 0; x < width; x++) { - kernel[x] = ::expf(-0.5f * ::powf(static_cast(x - mean) / sigma, 2.0f)) * scale; - sum += kernel[x]; - } - if (sum != 0.f) { - for (int x = 0; x < width; x++) - kernel[x] /= sum; - } - return std::move(kernel); -} -} - -#endif //YUV_SRC_MATH_GAUSSIAN_H_ diff --git a/src/sampler/BilinearRowSampler-inl.hpp b/src/sampler/BilinearRowSampler-inl.hpp deleted file mode 100644 index 7ad91f3..0000000 --- a/src/sampler/BilinearRowSampler-inl.hpp +++ /dev/null @@ -1,481 +0,0 @@ -// Copyright (C) 2024 Radzivon Bartoshyk -// -// This file belongs to sparkyuv project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#if defined(SPARKYUV_BILINEAR_ROW_SAMPLER) == defined(HWY_TARGET_TOGGLE) -#ifdef SPARKYUV_BILINEAR_ROW_SAMPLER -#undef SPARKYUV_BILINEAR_ROW_SAMPLER -#else -#define SPARKYUV_BILINEAR_ROW_SAMPLER -#endif - -#include -#include "ScaleRowSampler.hpp" -#include "../yuv-inl.h" -#include "sampler.h" -#include -#include -#include - -#if HWY_TARGET != HWY_SVE && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128 -#define BILINEAR_ENABLE_HWY 1 -#else -#define BILINEAR_ENABLE_HWY 0 -#endif - -#if BILINEAR_ENABLE_HWY -#include "sampler-inl.h" -#endif - -HWY_BEFORE_NAMESPACE(); -namespace sparkyuv::HWY_NAMESPACE { -using namespace sparkyuv; -using namespace hwy; -using namespace hwy::HWY_NAMESPACE; - -template -class BilinearRowSampler4Chan8Bit : public ScaleRowSampler { - public: - BilinearRowSampler4Chan8Bit(const uint8_t *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - uint8_t *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight) : - ScaleRowSampler(mSource, - srcStride, - inputWidth, - inputHeight, - mDestination, - dstStride, - outputWidth, - outputHeight) { - - } - - ~BilinearRowSampler4Chan8Bit() override = default; - - void sample(const int row) override { -#if BILINEAR_ENABLE_HWY - const FixedTag dfx4; - const FixedTag dix4; - const FixedTag du8x4; - using VU8x4 = Vec; - - using VI4 = Vec; - using VF4 = Vec; - const uint32_t shift[4] = {0, 1, 2, 3}; - const VI4 shiftV = LoadU(dix4, shift); - const FixedTag dux4; - const VF4 xScaleV = Set(dfx4, this->xScale); - const VF4 yScaleV = Set(dfx4, this->yScale); - - const VI4 maxWidth = Set(dix4, this->inputWidth - 1); - const VI4 maxHeight = Set(dix4, this->inputHeight - 1); - - const VI4 addOne = Set(dix4, 1); - - const VF4 vfZeros = Zero(dfx4); - const VI4 srcStrideV = Set(dix4, this->srcStride); - const VF4 maxColorsV = Set(dfx4, maxColors); -#endif - auto dst8 = reinterpret_cast(reinterpret_cast(this->mDestination) + row * this->dstStride); - auto dst = reinterpret_cast(dst8); - - const uint8_t *src8 = this->mSource; - const int components = Components; - - uint32_t x = 0; - -#if BILINEAR_ENABLE_HWY -#if !NOACCELERATED_SAMPLER - for (; x + 8 < this->outputWidth && components == 4; ++x) { - VI4 currentX = Set(dix4, x); - VI4 currentXV = Add(currentX, shiftV); - VF4 currentXVF = Mul(ConvertTo(dfx4, currentXV), xScaleV); - VF4 currentYVF = Mul(ConvertTo(dfx4, Set(dix4, row)), yScaleV); - - VI4 xi1 = ConvertTo(dix4, Floor(currentXVF)); - VI4 yi1 = Min(ConvertTo(dix4, Floor(currentYVF)), maxHeight); - - VI4 xi2 = Min(Add(xi1, addOne), maxWidth); - VI4 yi2 = Min(Add(yi1, addOne), maxHeight); - - VF4 dx = Max(Sub(currentXVF, ConvertTo(dfx4, xi1)), vfZeros); - VF4 dy = Max(Sub(currentYVF, ConvertTo(dfx4, yi1)), vfZeros); - - VI4 row1Add = Mul(yi1, srcStrideV); - VI4 row2Add = Mul(yi2, srcStrideV); - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int i = 0; i < 4; i++) { - auto row1 = reinterpret_cast(src8 + ExtractLane(row1Add, i)); - auto row2 = reinterpret_cast(src8 + ExtractLane(row2Add, i)); - - VU8x4 lane = LoadU(du8x4, reinterpret_cast(&row1[ExtractLane(xi1, i) * components])); - VF4 c1 = PromoteTo(dfx4, lane); - lane = LoadU(du8x4, reinterpret_cast(&row1[ExtractLane(xi2, i) * components])); - VF4 c2 = PromoteTo(dfx4, lane); - lane = LoadU(du8x4, reinterpret_cast(&row2[ExtractLane(xi1, i) * components])); - VF4 c3 = PromoteTo(dfx4, lane); - lane = LoadU(du8x4, reinterpret_cast(&row2[ExtractLane(xi2, i) * components])); - VF4 c4 = PromoteTo(dfx4, lane); - VF4 value = Blerp(dfx4, c1, c2, c3, c4, Set(dfx4, ExtractLane(dx, i)), - Set(dfx4, ExtractLane(dy, i))); - VF4 sum = Clamp(Round(value), vfZeros, maxColorsV); - VU8x4 pixel = DemoteTo(du8x4, ConvertTo(dux4, sum)); - auto u8Store = &dst[ExtractLane(currentXV, i) * components]; - StoreU(pixel, du8x4, u8Store); - } - - x += components - 1; - } -#endif -#endif - - for (; x < this->outputWidth; ++x) { - const float srcX = (float) x * this->xScale; - const float srcY = (float) row * this->yScale; - - const int x1 = static_cast(std::floor(srcX)); - const int y1 = static_cast(std::floor(srcY)); - - int x2 = std::min(x1 + 1, this->inputWidth - 1); - int y2 = std::min(y1 + 1, this->inputHeight - 1); - - float dx = std::max((float) srcX - (float) x1, 0.0f); - float dy = std::max((float) srcY - (float) y1, 0.0f); - - auto row1 = reinterpret_cast(src8 + y1 * this->srcStride); - auto row2 = reinterpret_cast(src8 + y2 * this->srcStride); - - for (int c = 0; c < components; ++c) { - auto c1 = static_cast(row1[x1 * components + c]); - auto c2 = static_cast(row1[x2 * components + c]); - auto c3 = static_cast(row2[x1 * components + c]); - auto c4 = static_cast(row2[x2 * components + c]); - - float result = blerp(c1, c2, c3, c4, dx, dy); - float f = result; - f = std::clamp(::roundf(f), 0.0f, maxColors); - dst[x * components + c] = static_cast(f); - } - } - } - - private: - const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f; -}; - -template -class BilinearRowSamplerF16Bit : public ScaleRowSampler { - public: - BilinearRowSamplerF16Bit(const uint16_t *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - uint16_t *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight) : - ScaleRowSampler(mSource, - srcStride, - inputWidth, - inputHeight, - mDestination, - dstStride, - outputWidth, - outputHeight) { - - } - - ~BilinearRowSamplerF16Bit() override = default; - - void sample(const int y) override { -#if BILINEAR_ENABLE_HWY - const FixedTag dfx4; - const FixedTag dix4; - const FixedTag df16x4; - using VI4 = Vec; - using VF4 = Vec; - using VF16x4 = Vec; - - const int shift[4] = {0, 1, 2, 3}; - const VI4 shiftV = LoadU(dix4, shift); - const VF4 xScaleV = Set(dfx4, this->xScale); - const VF4 yScaleV = Set(dfx4, this->yScale); - const VI4 addOne = Set(dix4, 1); - const VF4 fOneV = Set(dfx4, 1.0f); - const VI4 maxWidth = Set(dix4, this->inputWidth - 1); - const VI4 maxHeight = Set(dix4, this->inputHeight - 1); - const VF4 vfZeros = Zero(dfx4); - const VI4 srcStrideV = Set(dix4, this->srcStride); -#endif - - const auto src8 = reinterpret_cast(this->mSource); - auto dst16 = reinterpret_cast(reinterpret_cast(this->mDestination) + y * this->dstStride); - - const int components = Components; - - uint32_t x = 0; - -#if BILINEAR_ENABLE_HWY -#if !NOACCELERATED_SAMPLER - for (; x + 8 < this->outputWidth && components == 4; ++x) { - VI4 currentX = Set(dix4, x); - VI4 currentXV = Add(currentX, shiftV); - VF4 currentXVF = Mul(ConvertTo(dfx4, currentXV), xScaleV); - VF4 currentYVF = Mul(ConvertTo(dfx4, Set(dix4, y)), yScaleV); - - VI4 xi1 = ConvertTo(dix4, Floor(currentXVF)); - VI4 yi1 = Min(ConvertTo(dix4, Floor(currentYVF)), maxHeight); - - VI4 xi2 = Min(Add(xi1, addOne), maxWidth); - VI4 yi2 = Min(Add(yi1, addOne), maxHeight); - - VI4 row1Add = Mul(yi1, srcStrideV); - VI4 row2Add = Mul(yi2, srcStrideV); - - VF4 dx = Max(Sub(currentXVF, ConvertTo(dfx4, xi1)), vfZeros); - VF4 dy = Max(Sub(currentYVF, ConvertTo(dfx4, yi1)), vfZeros); - - #if defined(__clang__) - #pragma clang loop unroll(full) - #endif - for (int i = 0; i < 4; i++) { - auto row1 = reinterpret_cast(src8 + ExtractLane(row1Add, i)); - auto row2 = reinterpret_cast(src8 + ExtractLane(row2Add, i)); - VF16x4 lane = LoadU(df16x4, &row1[ExtractLane(xi1, i) * components]); - VF4 c1 = PromoteTo(dfx4, lane); - lane = LoadU(df16x4, &row1[ExtractLane(xi2, i) * components]); - VF4 c2 = PromoteTo(dfx4, lane); - lane = LoadU(df16x4, &row2[ExtractLane(xi1, i) * components]); - VF4 c3 = PromoteTo(dfx4, lane); - lane = LoadU(df16x4, &row2[ExtractLane(xi2, i) * components]); - VF4 c4 = PromoteTo(dfx4, lane); - VF4 value = Blerp(dfx4, c1, c2, c3, c4, Set(dfx4, ExtractLane(dx, i)), - Set(dfx4, ExtractLane(dy, i))); - VF16x4 pixel = DemoteTo(df16x4, Max(value, vfZeros)); - auto u8Store = reinterpret_cast(&dst16[ExtractLane(currentXV, i) * components]); - StoreU(pixel, df16x4, u8Store); - } - - x += components - 1; - } -#endif -#endif - - for (; x < this->outputWidth; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - - int x1 = static_cast(srcX); - int y1 = static_cast(srcY); - - int x2 = std::min(x1 + 1, this->inputWidth - 1); - int y2 = std::min(y1 + 1, this->inputHeight - 1); - - float dx = static_cast(srcX) - static_cast(x1); - float dy = static_cast(srcY) - static_cast(y1); - - auto row1 = reinterpret_cast(src8 + y1 * this->srcStride); - auto row2 = reinterpret_cast(src8 + y2 * this->srcStride); - - const int px = x * components; - - for (int c = 0; c < components; ++c) { - float c1 = hwy::F32FromF16(hwy::float16_t::FromBits(row1[x1 * components + c])); - float c2 = hwy::F32FromF16(hwy::float16_t::FromBits(row1[x2 * components + c])); - float c3 = hwy::F32FromF16(hwy::float16_t::FromBits(row2[x1 * components + c])); - float c4 = hwy::F32FromF16(hwy::float16_t::FromBits(row2[x2 * components + c])); - float result = blerp(c1, c2, c3, c4, dx, dy); - dst16[px + c] = hwy::F16FromF32(result).bits; - } - } - } - - private: - const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f; -}; - -template -class BilinearRowSamplerAnyBit : public ScaleRowSampler { - public: - BilinearRowSamplerAnyBit(const T *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - T *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight) : - ScaleRowSampler(mSource, - srcStride, - inputWidth, - inputHeight, - mDestination, - dstStride, - outputWidth, - outputHeight) { - - } - - ~BilinearRowSamplerAnyBit() = default; - - void sample(const int row) { - auto dst8 = reinterpret_cast(reinterpret_cast(this->mDestination) + row * this->dstStride); - auto dst = reinterpret_cast(dst8); - - auto *src8 = reinterpret_cast(this->mSource); - - const int components = Components; - - for (int x = 0; x < this->outputWidth; ++x) { - const float srcX = (float) x * this->xScale; - const float srcY = (float) row * this->yScale; - - const int x1 = static_cast(::floorf(srcX)); - const int y1 = static_cast(::floorf(srcY)); - - int x2 = std::min(x1 + 1, this->inputWidth - 1); - int y2 = std::min(y1 + 1, this->inputHeight - 1); - - float dx = std::max((float) srcX - (float) x1, 0.0f); - float dy = std::max((float) srcY - (float) y1, 0.0f); - - auto row1 = reinterpret_cast(src8 + y1 * this->srcStride); - auto row2 = reinterpret_cast(src8 + y2 * this->srcStride); - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - auto c1 = static_cast(row1[x1 * components + c]); - auto c2 = static_cast(row1[x2 * components + c]); - auto c3 = static_cast(row2[x1 * components + c]); - auto c4 = static_cast(row2[x2 * components + c]); - - float result = blerp(c1, c2, c3, c4, dx, dy); - float f = result; - f = std::clamp(::roundf(f), 0.0f, maxColors); - dst[0] = static_cast(f); - dst += 1; - } - } - } - - private: - const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f; -}; - -class BilinearRowSampler10Bit : public ScaleRowSampler { - public: - BilinearRowSampler10Bit(const uint32_t *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - uint32_t *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight) : - ScaleRowSampler(mSource, srcStride, inputWidth, inputHeight, - mDestination, dstStride, outputWidth, outputHeight) { - - } - - ~BilinearRowSampler10Bit() override = default; - - void sample(const int y) override { - const auto src8 = reinterpret_cast(this->mSource); - auto dst16 = reinterpret_cast(reinterpret_cast(this->mDestination) + y * this->dstStride); - for (int x = 0; x < this->outputWidth; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - - int x1 = static_cast(srcX); - int y1 = static_cast(srcY); - - int x2 = std::min(x1 + 1, this->inputWidth - 1); - int y2 = std::min(y1 + 1, this->inputHeight - 1); - - float dx = static_cast(srcX) - static_cast(x1); - float dy = static_cast(srcY) - static_cast(y1); - - auto row1 = reinterpret_cast(src8 + y1 * this->srcStride); - auto row2 = reinterpret_cast(src8 + y2 * this->srcStride); - - auto c1 = static_cast(row1[x1]); - auto c2 = static_cast(row1[x2]); - auto c3 = static_cast(row2[x1]); - auto c4 = static_cast(row2[x2]); - - float r1, g1, b1, a1; - float r2, g2, b2, a2; - float r3, g3, b3, a3; - float r4, g4, b4, a4; - - parseToFloat(c1, r1, g1, b1, a1); - parseToFloat(c2, r2, g2, b2, a2); - parseToFloat(c3, r3, g3, b3, a3); - parseToFloat(c4, r4, g4, b4, a4); - - float rInter = blerp(r1, r2, r3, r4, dx, dy); - float gInter = blerp(g1, g2, g3, g4, dx, dy); - float bInter = blerp(b1, b2, b3, b4, dx, dy); - float aInter = blerp(a1, a2, a3, a4, dx, dy); - - auto R10 = static_cast(std::clamp(::roundf(rInter * maxColors), 0.0f, (float) maxColors)); - auto G10 = static_cast(std::clamp(::roundf(gInter * maxColors), 0.0f, (float) maxColors)); - auto B10 = static_cast(std::clamp(::roundf(bInter * maxColors), 0.0f, (float) maxColors)); - auto A10 = static_cast(std::clamp(::roundf(aInter * 3.f), 0.0f, 3.0f)); - - dst16[0] = (A10 << 30) | (B10 << 20) | (G10 << 10) | R10; - dst16 += 1; - } - } - - private: - const float maxColors = ::powf(2.0f, (float) 10.f) - 1.0f; - - static inline void parseToFloat(const uint32_t rgba1010102, float &r, float &g, float &b, float &a) { - const uint32_t scalarMask = (1u << 10u) - 1u; - constexpr float colorsScale = 1.f / 1023.f; - constexpr float alphaScale = 1.f / 3.f; - uint32_t r1 = (rgba1010102) & scalarMask; - uint32_t g1 = (rgba1010102 >> 10) & scalarMask; - uint32_t b1 = (rgba1010102 >> 20) & scalarMask; - uint32_t a1 = (rgba1010102 >> 30) * 3; - float rFloat = static_cast(r1) * colorsScale; - float gFloat = static_cast(g1) * colorsScale; - float bFloat = static_cast(b1) * colorsScale; - float aFloat = static_cast(a1) * alphaScale; - - r = rFloat; - g = gFloat; - b = bFloat; - a = aFloat; - } -}; - -} // sparkyuv -HWY_AFTER_NAMESPACE(); - -#undef BILINEAR_ENABLE_HWY - -#endif //SPARKYUV_BILINEAR_ROW_SAMPLER diff --git a/src/sampler/BoxRowSampler-inl.h b/src/sampler/BoxRowSampler-inl.h deleted file mode 100644 index 17d7a2e..0000000 --- a/src/sampler/BoxRowSampler-inl.h +++ /dev/null @@ -1,252 +0,0 @@ -// Copyright (C) 2024 Radzivon Bartoshyk -// -// This file belongs to sparkyuv project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#if defined(SPARKYUV_BOX_ROW_SAMPLER) == defined(HWY_TARGET_TOGGLE) -#ifdef SPARKYUV_BOX_ROW_SAMPLER -#undef SPARKYUV_BOX_ROW_SAMPLER -#else -#define SPARKYUV_BOX_ROW_SAMPLER -#endif - -#include "hwy/highway.h" -#include "ScaleRowSampler.hpp" -#include "sampler-inl.h" -#include "sampler.h" -#include "../sparkyuv-internal.h" -#include -#include -#include - -HWY_BEFORE_NAMESPACE(); -namespace sparkyuv::HWY_NAMESPACE { - -using namespace hwy; -using namespace hwy::HWY_NAMESPACE; - -template -class BoxSampler : public ScaleRowSampler { - public: - BoxSampler(const T *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - T *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight) : - ScaleRowSampler(mSource, - srcStride, - inputWidth, - inputHeight, - mDestination, - dstStride, - outputWidth, - outputHeight) { - - } - - ~BoxSampler() = default; - - void sample(const int row) override { - auto dst = reinterpret_cast(reinterpret_cast(this->mDestination) + row * this->dstStride); - - auto src8 = reinterpret_cast(this->mSource); - - const int components = Components; -#if SPARKYUV_ALLOW_FLOAT16 - const FixedTag df16; - const Half dhf16; - const auto v16Scale = Set(dhf16, hwy::F16FromF32(1.f / 4.f)); -#endif - - uint32_t x = 0; - if (PixelType == sparkyuv::BOX_UINT16 && components == 4) { - for (; x + 2 < this->outputWidth; ++x) { - auto srcX = static_cast(x * this->xScale); - auto srcY = static_cast(row * this->yScale); - - const int x1 = static_cast(::floorf(srcX)); - const int y1 = static_cast(::floorf(srcY)); - - const int y2 = std::min(y1 + 1, this->inputHeight - 1); - - const FixedTag d; - const Half dh; - const RepartitionToWide d32; - const auto row1 = LoadU(d, &reinterpret_cast(src8 + y1 * this->srcStride)[x1*4]); - const auto row2 = LoadU(d, &reinterpret_cast(src8 + y2 * this->srcStride)[x1*4]); - const auto row1Upper = PromoteUpperTo(d32, row1); - const auto row2Upper = PromoteUpperTo(d32, row2); - const auto row1Lower = PromoteLowerTo(d32, row1); - const auto row2Lower = PromoteLowerTo(d32, row2); - const auto newWidePX = ShiftRight<2>(Add(Add(Add(row1Lower, row1Upper), row2Lower), row2Upper)); - const auto newPX = DemoteTo(dh, newWidePX); - StoreU(newPX, dh, reinterpret_cast(dst)); - - dst += 4; - } - } else if (PixelType == sparkyuv::BOX_UINT8) { - for (; x + 2 < this->outputWidth; ++x) { - auto srcX = static_cast(x * this->xScale); - auto srcY = static_cast(row * this->yScale); - - const int x1 = static_cast(::floorf(srcX)); - const int y1 = static_cast(::floorf(srcY)); - - const int y2 = std::min(y1 + 1, this->inputHeight - 1); - - const FixedTag d; - const Half dh; - const Rebind d16; - const RepartitionToWide d32; - const auto row1 = LoadU(d, &reinterpret_cast(src8 + y1 * this->srcStride)[x1*4]); - const auto row2 = LoadU(d, &reinterpret_cast(src8 + y2 * this->srcStride)[x1*4]); - const auto sums = AddWide(d16, row1, row2); - const auto newWidePX = ShiftRightNarrow<2>(d32, SumsOf2(sums)); - const auto newPX = DemoteTo(dh, newWidePX); - StoreU(newPX, dh, reinterpret_cast(dst)); - dst += 4; - } - } else if (PixelType == sparkyuv::BOX_FLOAT16) { - for (; x + 2 < this->outputWidth; ++x) { - auto srcX = static_cast(x * this->xScale); - auto srcY = static_cast(row * this->yScale); - - const int x1 = static_cast(::floorf(srcX)); - const int y1 = static_cast(::floorf(srcY)); - - const int y2 = std::min(y1 + 1, this->inputHeight - 1); - -#if SPARKYUV_ALLOW_FLOAT16 - const auto row1 = LoadU(df16, &reinterpret_cast(src8 + y1 * this->srcStride)[x1*4]); - const auto row2 = LoadU(df16, &reinterpret_cast(src8 + y2 * this->srcStride)[x1*4]); - const auto newWidePX = Mul(DemoteTo(df16, SumsOf2(Add(row1, row2))), v16Scale); - const auto newPX = newWidePX; - StoreU(newPX, dhf16, reinterpret_cast(dst)); -#else - const FixedTag d; - const Half dh; - const RepartitionToWide d32; - const Rebind f32; - const Rebind f16; - const auto vScale = Set(f32, 1.f / 4.f); - const auto row1 = LoadU(d, &reinterpret_cast(src8 + y1 * this->srcStride)[x1*4]); - const auto row2 = LoadU(d, &reinterpret_cast(src8 + y2 * this->srcStride)[x1*4]); - const auto row1Upper = PromoteTo(f32, DemoteTo(f16, ConvertTo(f32, PromoteUpperTo(d32, row1)))); - const auto row2Upper = PromoteTo(f32, DemoteTo(f16, ConvertTo(f32, PromoteUpperTo(d32, row2)))); - const auto row1Lower = PromoteTo(f32, DemoteTo(f16, ConvertTo(f32, PromoteLowerTo(d32, row1)))); - const auto row2Lower = PromoteTo(f32, DemoteTo(f16, ConvertTo(f32, PromoteLowerTo(d32, row2)))); - const auto newWidePX = DemoteTo(f16, Mul(Add(Add(Add(row1Lower, row1Upper), row2Lower), row2Upper), vScale)); - const auto newPX = BitCast(dh, newWidePX); - StoreU(newPX, dh, reinterpret_cast(dst)); -#endif - - dst += 4; - } - } - - for (; x < this->outputWidth; ++x) { - const float srcX = (float) x * this->xScale; - const float srcY = (float) row * this->yScale; - - const int x1 = static_cast(::floorf(srcX)); - const int y1 = static_cast(::floorf(srcY)); - - int x2 = std::min(x1 + 1, this->inputWidth - 1); - int y2 = std::min(y1 + 1, this->inputHeight - 1); - - auto row1 = reinterpret_cast(src8 + y1 * this->srcStride); - auto row2 = reinterpret_cast(src8 + y2 * this->srcStride); - - if (PixelType != sparkyuv::BOX_RGBA1010102 && PixelType != sparkyuv::BOX_FLOAT16) { -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - auto c1 = static_cast(row1[x1 * components + c]); - auto c2 = static_cast(row1[x2 * components + c]); - auto c3 = static_cast(row2[x1 * components + c]); - auto c4 = static_cast(row2[x2 * components + c]); - - uint32_t result = (c1 + c2 + c3 + c4) >> 2; - dst[0] = static_cast(result); - dst += 1; - } - } else if (PixelType == sparkyuv::BOX_FLOAT16) { -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - constexpr float scale = 1.f / 4.f; - auto c1 = hwy::F32FromF16(hwy::float16_t::FromBits(row1[x1 * components + c])); - auto c2 = hwy::F32FromF16(hwy::float16_t::FromBits(row1[x2 * components + c])); - auto c3 = hwy::F32FromF16(hwy::float16_t::FromBits(row2[x1 * components + c])); - auto c4 = hwy::F32FromF16(hwy::float16_t::FromBits(row2[x2 * components + c])); - - float result = (c1 + c2 + c3 + c4) * scale; - dst[0] = static_cast(hwy::F16FromF32(result).bits); - dst += 1; - } - } else if (PixelType == sparkyuv::BOX_RGBA1010102) { - uint32_t p1 = reinterpret_cast(row1)[x1]; - uint32_t p2 = reinterpret_cast(row1)[x2]; - uint32_t p3 = reinterpret_cast(row2)[x1]; - uint32_t p4 = reinterpret_cast(row2)[x2]; - - uint32_t r1, g1, b1, a1; - uint32_t r2, g2, b2, a2; - uint32_t r3, g3, b3, a3; - uint32_t r4, g4, b4, a4; - - sparse1010102(p1, r1, g1, b1, a1); - sparse1010102(p2, r2, g2, b2, a2); - sparse1010102(p3, r3, g3, b3, a3); - sparse1010102(p4, r4, g4, b4, a4); - - uint32_t r = (r1 + r2 + r3 + r4) >> 2; - uint32_t g = (g1 + g2 + g3 + g4) >> 2; - uint32_t b = (b1 + b2 + b3 + b4) >> 2; - uint32_t a = (a1 + a2 + a3 + a4) >> 2; - - reinterpret_cast(dst)[0] = (a << 30) | (b << 20) | (g << 10) | r; - - if (std::is_same::value) { - dst += 4; - } else if (std::is_same::value) { - dst += 1; - } - } - } - } - - inline void sparse1010102(const uint32_t rgba1010102, uint32_t &r, uint32_t &g, uint32_t &b, uint32_t &a) { - constexpr uint32_t scalarMask = (1u << 10u) - 1u; - uint32_t r1 = (rgba1010102) & scalarMask; - uint32_t g1 = (rgba1010102 >> 10) & scalarMask; - uint32_t b1 = (rgba1010102 >> 20) & scalarMask; - uint32_t a1 = (rgba1010102 >> 30) * 3; - - r = r1; - g = g1; - b = b1; - a = a1; - } -}; - -} -HWY_AFTER_NAMESPACE(); - -#endif \ No newline at end of file diff --git a/src/sampler/NearestRowSampler-inl.hpp b/src/sampler/NearestRowSampler-inl.hpp deleted file mode 100644 index 8ae214b..0000000 --- a/src/sampler/NearestRowSampler-inl.hpp +++ /dev/null @@ -1,175 +0,0 @@ -// Copyright (C) 2024 Radzivon Bartoshyk -// -// This file belongs to sparkyuv project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#if defined(SPARKYUV_NEAREST_ROW_SAMPLER) == defined(HWY_TARGET_TOGGLE) -#ifdef SPARKYUV_NEAREST_ROW_SAMPLER -#undef SPARKYUV_NEAREST_ROW_SAMPLER -#else -#define SPARKYUV_NEAREST_ROW_SAMPLER -#endif - -#include "hwy/highway.h" -#include "ScaleRowSampler.hpp" -#include "sampler.h" -#include -#include -#include - -HWY_BEFORE_NAMESPACE(); -namespace sparkyuv::HWY_NAMESPACE { - -using namespace sparkyuv; - -template -class NearestRowSampler : public ScaleRowSampler { - public: - NearestRowSampler(const uint8_t *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - uint8_t *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight) : - ScaleRowSampler(mSource, - srcStride, - inputWidth, - inputHeight, - mDestination, - dstStride, - outputWidth, - outputHeight) { - - } - - void sample(const int row) override { - auto dst = reinterpret_cast(this->mDestination + row * this->dstStride); - if (components == 4) { - for (uint32_t x = 0; x < this->outputWidth; ++x) { - auto srcX = static_cast(x * this->xScale); - auto srcY = static_cast(row * this->yScale); - - const int x1 = std::clamp(static_cast(::floorf(srcX)), 0, this->inputWidth - 1); - const int y1 = std::clamp(static_cast(::floorf(srcY)), 0, this->inputHeight - 1); - auto srcRow = reinterpret_cast(this->mSource + y1 * this->srcStride); - uint32_t px = reinterpret_cast(srcRow)[x1]; - reinterpret_cast(dst)[x] = px; - } - } else { - for (uint32_t x = 0; x < this->outputWidth; ++x) { - auto srcX = static_cast(x * this->xScale); - auto srcY = static_cast(row * this->yScale); - - const int x1 = std::clamp(static_cast(::floorf(srcX)), 0, this->inputWidth - 1); - const int y1 = std::clamp(static_cast(::floorf(srcY)), 0, this->inputHeight - 1); - auto srcRow = reinterpret_cast(this->mSource + y1 * this->srcStride); - auto srcPtr = &srcRow[x1 * components]; - std::copy(srcPtr, srcPtr + sizeof(uint8_t) * components, &dst[x * components]); - } - } - } - - ~NearestRowSampler() override = default; - - private: - const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f; -}; - -template -class NearestRowSampler16Bit : public ScaleRowSampler { - public: - NearestRowSampler16Bit(const uint16_t *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - uint16_t *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight) : - ScaleRowSampler(mSource, - srcStride, - inputWidth, - inputHeight, - mDestination, - dstStride, - outputWidth, - outputHeight) { - - } - - void sample(const int row) override { - const int components = Components; - auto dst = reinterpret_cast(reinterpret_cast(this->mDestination) + row * this->dstStride); - for (int x = 0; x < this->outputWidth; ++x) { - const float srcX = (float) x * this->xScale; - const float srcY = (float) row * this->yScale; - - const int x1 = std::clamp(static_cast(::floorf(srcX)), 0, this->inputWidth - 1); - const int y1 = std::clamp(static_cast(::floorf(srcY)), 0, this->inputHeight - 1); - auto srcRow = - reinterpret_cast(reinterpret_cast(this->mSource) + y1 * this->srcStride); - auto srcPtr = &srcRow[x1 * components]; - std::copy(srcPtr, srcPtr + sizeof(uint8_t) * components, &dst[x * components]); - } - } - - ~NearestRowSampler16Bit() override = default; - -}; - -class NearestRowSampler10Bit : public ScaleRowSampler { - public: - NearestRowSampler10Bit(const uint32_t *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - uint32_t *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight) : - ScaleRowSampler(mSource, - srcStride, - inputWidth, - inputHeight, - mDestination, - dstStride, - outputWidth, - outputHeight) { - - } - - void sample(const int row) override { - auto dst = reinterpret_cast(reinterpret_cast(this->mDestination) + row * this->dstStride); - for (int x = 0; x < this->outputWidth; ++x) { - const float srcX = (float) x * xScale; - const float srcY = (float) row * yScale; - - const int x1 = std::clamp(static_cast(::floorf(srcX)), 0, inputWidth - 1); - const int y1 = std::clamp(static_cast(::floorf(srcY)), 0, inputHeight - 1); - auto srcRow = reinterpret_cast(reinterpret_cast(mSource) + y1 * srcStride); - dst[x] = srcRow[x1]; - } - } - - ~NearestRowSampler10Bit() override = default; - - private: -}; - -} -HWY_AFTER_NAMESPACE(); - -#endif diff --git a/src/sampler/ScaleRowSampler.hpp b/src/sampler/ScaleRowSampler.hpp deleted file mode 100644 index 46cc295..0000000 --- a/src/sampler/ScaleRowSampler.hpp +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (C) 2024 Radzivon Bartoshyk -// -// This file belongs to sparkyuv project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -namespace sparkyuv { - -typedef float (*ScaleWeightSampler)(float); - -template -class ScaleRowSampler { - public: - ScaleRowSampler(const T *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - T *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight) : mSource(mSource), - srcStride(srcStride), - inputWidth(inputWidth), - inputHeight(inputHeight), - mDestination(mDestination), - dstStride(dstStride), - outputWidth(outputWidth), - outputHeight(outputHeight) { - xScale = static_cast(inputWidth) / static_cast(outputWidth); - yScale = static_cast(inputHeight) / static_cast(outputHeight); - } - - virtual void sample(int row) = 0; - - virtual ~ScaleRowSampler() = default; - - public: - const T *mSource; - const int srcStride; - const int inputWidth; - const int inputHeight; - T *mDestination; - const int dstStride; - const int outputWidth; - const int outputHeight; - - float xScale; - float yScale; -}; -} \ No newline at end of file diff --git a/src/sampler/Window4RowSampler-inl.hpp b/src/sampler/Window4RowSampler-inl.hpp deleted file mode 100644 index 7307bfb..0000000 --- a/src/sampler/Window4RowSampler-inl.hpp +++ /dev/null @@ -1,686 +0,0 @@ -// Copyright (C) 2024 Radzivon Bartoshyk -// -// This file belongs to sparkyuv project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#if defined(SPARKYUV_WINDOW4_ROW_SAMPLER) == defined(HWY_TARGET_TOGGLE) -#ifdef SPARKYUV_WINDOW4_ROW_SAMPLER -#undef SPARKYUV_WINDOW4_ROW_SAMPLER -#else -#define SPARKYUV_WINDOW4_ROW_SAMPLER -#endif - -#include -#include -#include -#include "ScaleRowSampler.hpp" -#include "src/sampler/sampler.h" -#include - -#if HWY_TARGET != HWY_SVE && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128 -#define WEIGHTED_WINDOW4_HWY 1 -#else -#define WEIGHTED_WINDOW4_HWY 0 -#endif - -#if WEIGHTED_WINDOW4_HWY -#include "src/sampler/sampler-inl.h" -#endif - -HWY_BEFORE_NAMESPACE(); -namespace sparkyuv::HWY_NAMESPACE { - -using namespace hwy; -using namespace hwy::HWY_NAMESPACE; -using namespace sparkyuv; - -enum WeightedRow4Operation { - WEIGHTED_ROW4_HERMITE, - WEIGHTED_ROW4_CATMULL_ROM, - WEIGHTED_ROW4_BSPLINE, - WEIGHTED_ROW4_CUBIC, - WEIGHTED_ROW4_BICUBIC, - WEIGHTED_ROW4_MITCHELL -}; - -template -class WeightedWindow4RowSampler : public ScaleRowSampler { - public: - WeightedWindow4RowSampler(const uint8_t *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - uint8_t *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight) : - ScaleRowSampler(mSource, srcStride, - inputWidth, inputHeight, - mDestination, dstStride, - outputWidth, outputHeight) { - switch (op) { - case WEIGHTED_ROW4_HERMITE: { - sampler = CubicHermite; -#if WEIGHTED_WINDOW4_HWY - samplerHWY = CubicHermiteV; -#endif - } - break; - case WEIGHTED_ROW4_CATMULL_ROM: { - sampler = CatmullRom; -#if WEIGHTED_WINDOW4_HWY - samplerHWY = CatmullRomV; -#endif - } - break; - case WEIGHTED_ROW4_BSPLINE: { - sampler = BSpline; -#if WEIGHTED_WINDOW4_HWY - samplerHWY = CubicBSplineV; -#endif - } - break; - case WEIGHTED_ROW4_CUBIC: { - sampler = SimpleCubic; -#if WEIGHTED_WINDOW4_HWY - samplerHWY = SimpleCubicV; -#endif - } - break; - case WEIGHTED_ROW4_BICUBIC: { - sampler = BiCubicSpline; -#if WEIGHTED_WINDOW4_HWY - samplerHWY = BiCubicSplineV; -#endif - } - break; - case WEIGHTED_ROW4_MITCHELL: { - sampler = MitchellNetravalli; -#if WEIGHTED_WINDOW4_HWY - samplerHWY = MitchellNetravaliV; -#endif - } - break; - } - } - - void sample(const int y) override { -#if WEIGHTED_WINDOW4_HWY - const FixedTag dfx4; - const FixedTag dix4; - const FixedTag dux4; - const FixedTag du8x4; - using VI4 = Vec; - using VF4 = Vec; - using VU8x4 = Vec; - const VF4 vfZeros = Zero(dfx4); - const VF4 maxColorsV = Set(dfx4, maxColors); -#endif - - auto dst = reinterpret_cast(reinterpret_cast(this->mDestination) + y * this->dstStride); - - const int components = Components; - - uint32_t x = 0; - -#if WEIGHTED_WINDOW4_HWY -#if !NOACCELERATED_SAMPLER - for (; x + 8 < this->outputWidth && components == 4; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - - // only kernel with size 2 is supported - constexpr int kernelSize = 2; - - float kx1 = ::floorf(srcX); - float ky1 = ::floorf(srcY); - - VF4 color = Set(dfx4, 0); - - const int a = kernelSize; - const int mMaxWidth = this->inputWidth - 1; - - const int appendixLow[4] = {-1, 0, 1, 2}; - - VF4 srcXV = Set(dfx4, srcX); - VI4 kx1V = Set(dix4, kx1); - const VI4 appendixLowV = LoadU(dix4, appendixLow); - - for (int j = -a + 1; j <= a; j++) { - int yj = (int) ky1 + j; - float dy = float(srcY) - (float(ky1) + (float) j); - float yWeight = sampler(dy); - auto row = reinterpret_cast(this->mSource - + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride); - VF4 yWeightV = Set(dfx4, yWeight); - VI4 xi = Add(kx1V, appendixLowV); - VF4 dx = Sub(srcXV, ConvertTo(dfx4, xi)); - VF4 weights = Mul(samplerHWY(dfx4, dx), yWeightV); - for (int i = 0; i < components; ++i) { - int sizeXPos = std::clamp(ExtractLane(xi, i), 0, mMaxWidth) * components; - VU8x4 u81 = LoadU(du8x4, reinterpret_cast(&row[sizeXPos])); - VF4 fr1 = ConvertTo(dfx4, PromoteTo(dix4, u81)); - fr1 = Mul(fr1, Set(dfx4, ExtractLane(weights, i))); - color = Add(color, fr1); - } - } - - color = ClampRound(dfx4, color, vfZeros, maxColorsV); - VU8x4 u8Color = DemoteTo(du8x4, ConvertTo(dux4, color)); - StoreU(u8Color, du8x4, reinterpret_cast(&dst[x * components])); - } -#endif -#endif - - for (; x < this->outputWidth; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - - float kx1 = ::floorf(srcX); - float ky1 = ::floorf(srcY); - - int a = 2; - - float rgb[components]; - std::fill(rgb, rgb + components, 0.0f); - - for (int j = -a + 1; j <= a; j++) { - int yj = (int) ky1 + j; - float dy = float(srcY) - (float(ky1) + (float) j); - float yWeight = sampler(dy); - - auto row = reinterpret_cast(this->mSource - + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride); - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int i = -a + 1; i <= a; i++) { - int xi = (int) kx1 + i; - float dx = float(srcX) - (float(kx1) + (float) i); - float weight = sampler(dx) * yWeight; - - const int px = std::clamp(xi, 0, this->inputWidth - 1) * components; - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - auto clrf = static_cast(row[px + c]); - float clr = clrf * weight; - rgb[c] += clr; - } - } - } - - const int px = x * components; - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - dst[px + c] = static_cast(std::clamp(::roundf(rgb[c]), 0.f, maxColors)); - } - } - } - - ~WeightedWindow4RowSampler() override = default; - - private: - const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f; - ScaleWeightSampler sampler; - -#if WEIGHTED_WINDOW4_HWY - typedef Vec> (*ScaleWeightSamplerHWY)(FixedTag, Vec>); - ScaleWeightSamplerHWY samplerHWY; -#endif -}; - -template -class WeightedWindow4RowSampler16Bit : public ScaleRowSampler { - public: - WeightedWindow4RowSampler16Bit(const uint16_t *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - uint16_t *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight, - const int depth) : - ScaleRowSampler(mSource, srcStride, - inputWidth, inputHeight, - mDestination, dstStride, - outputWidth, outputHeight), - maxColors(::powf(2.0f, static_cast(depth)) - 1.0f) { - switch (op) { - case WEIGHTED_ROW4_HERMITE: { - sampler = CubicHermite; - } - break; - case WEIGHTED_ROW4_CATMULL_ROM: { - sampler = CatmullRom; - } - break; - case WEIGHTED_ROW4_BSPLINE: { - sampler = BSpline; - } - break; - case WEIGHTED_ROW4_CUBIC: { - sampler = SimpleCubic; - } - break; - case WEIGHTED_ROW4_BICUBIC: { - sampler = BiCubicSpline; - } - break; - case WEIGHTED_ROW4_MITCHELL: { - sampler = MitchellNetravalli; - } - break; - } - } - - void sample(const int y) override { - - auto dst = reinterpret_cast(reinterpret_cast(this->mDestination) + y * this->dstStride); - - const int components = Components; - - for (int x = 0; x < this->outputWidth; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - - float kx1 = ::floorf(srcX); - float ky1 = ::floorf(srcY); - - int a = 2; - - float rgb[components]; - std::fill(rgb, rgb + components, 0.0f); - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int j = -a + 1; j <= a; j++) { - int yj = (int) ky1 + j; - float dy = float(srcY) - (float(ky1) + (float) j); - float yWeight = sampler(dy); - - auto row = reinterpret_cast(reinterpret_cast(this->mSource) - + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride); - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int i = -a + 1; i <= a; i++) { - int xi = (int) kx1 + i; - float dx = float(srcX) - (float(kx1) + (float) i); - float weight = sampler(dx) * yWeight; - - const int px = std::clamp(xi, 0, this->inputWidth - 1) * components; - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - auto clrf = static_cast(row[px + c]); - float clr = clrf * weight; - rgb[c] += clr; - } - } - } - - const int px = x * components; - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - dst[px + c] = static_cast(std::clamp(::roundf(rgb[c]), 0.f, maxColors)); - } - } - } - - ~WeightedWindow4RowSampler16Bit() override = default; - - private: - const float maxColors; - ScaleWeightSampler sampler; -}; - -template -class WeightedWindow4RowSampler10Bit : public ScaleRowSampler { - public: - WeightedWindow4RowSampler10Bit(const uint32_t *mSource, const int srcStride, - const int inputWidth, const int inputHeight, - uint32_t *mDestination, const int dstStride, - const int outputWidth, const int outputHeight) : - ScaleRowSampler(mSource, srcStride, - inputWidth, inputHeight, - mDestination, dstStride, - outputWidth, outputHeight) { - switch (op) { - case WEIGHTED_ROW4_HERMITE: { - sampler = CubicHermite; - } - break; - case WEIGHTED_ROW4_CATMULL_ROM: { - sampler = CatmullRom; - } - break; - case WEIGHTED_ROW4_BSPLINE: { - sampler = BSpline; - } - break; - case WEIGHTED_ROW4_CUBIC: { - sampler = SimpleCubic; - } - break; - case WEIGHTED_ROW4_BICUBIC: { - sampler = BiCubicSpline; - } - break; - case WEIGHTED_ROW4_MITCHELL: { - sampler = MitchellNetravalli; - } - break; - } - } - - void sample(const int y) override { - auto dst = reinterpret_cast(reinterpret_cast(this->mDestination) + y * this->dstStride); - - for (int x = 0; x < this->outputWidth; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - - float kx1 = ::floorf(srcX); - float ky1 = ::floorf(srcY); - - const int a = 2; - - float rgb[4] = {0, 0, 0, 0}; - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int j = -a + 1; j <= a; j++) { - int yj = (int) ky1 + j; - float dy = float(srcY) - (float(ky1) + (float) j); - float yWeight = sampler(dy); - - auto row = reinterpret_cast(reinterpret_cast(this->mSource) + - std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride); - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int i = -a + 1; i <= a; i++) { - int xi = (int) kx1 + i; - float dx = float(srcX) - (float(kx1) + (float) i); - float weight = sampler(dx) * yWeight; - - const int px = std::clamp(xi, 0, this->inputWidth - 1); - - uint32_t color = row[px]; - - float r = 0, g = 0, b = 0, aAlpha = 0; - parseToFloat(color, r, g, b, aAlpha); - - rgb[0] += r * weight; - rgb[1] += g * weight; - rgb[2] += b * weight; - rgb[3] += aAlpha * weight; - } - } - - auto R10 = static_cast(std::clamp(::roundf(rgb[0] * maxColors), 0.0f, (float) maxColors)); - auto G10 = static_cast(std::clamp(::roundf(rgb[1] * maxColors), 0.0f, (float) maxColors)); - auto B10 = static_cast(std::clamp(::roundf(rgb[2] * maxColors), 0.0f, (float) maxColors)); - auto A10 = static_cast(std::clamp(::roundf(rgb[3] * 3.f), 0.0f, 3.0f)); - - dst[x] = (A10 << 30) | (B10 << 20) | (G10 << 10) | R10; - } - } - - ~WeightedWindow4RowSampler10Bit() override = default; - - private: - - const float maxColors = ::powf(2.0f, (float) 10.f) - 1.0f; - ScaleWeightSampler sampler; - - inline void parseToFloat(const uint32_t rgba1010102, float &r, float &g, float &b, float &a) { - const uint32_t scalarMask = (1u << 10u) - 1u; - uint32_t b1 = (rgba1010102) & scalarMask; - uint32_t g1 = (rgba1010102 >> 10) & scalarMask; - uint32_t r1 = (rgba1010102 >> 20) & scalarMask; - uint32_t a1 = (rgba1010102 >> 30) * 3; - constexpr float colorScale = 1.f / 1023.f; - constexpr float alphaScale = 1.f / 3.f; - float rFloat = static_cast(r1) * colorScale; - float gFloat = static_cast(g1) * colorScale; - float bFloat = static_cast(b1) * colorScale; - float aFloat = static_cast(a1) * alphaScale; - - r = rFloat; - g = gFloat; - b = bFloat; - a = aFloat; - } -}; - -template -class WeightedWindow4RowSamplerF16Bit : public ScaleRowSampler { - public: - WeightedWindow4RowSamplerF16Bit(const uint16_t *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - uint16_t *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight) : - ScaleRowSampler(mSource, srcStride, - inputWidth, inputHeight, - mDestination, dstStride, - outputWidth, outputHeight) { - switch (op) { - case WEIGHTED_ROW4_HERMITE: { - sampler = CubicHermite; -#if WEIGHTED_WINDOW4_HWY - samplerHWY = CubicHermiteV; -#endif - } - break; - case WEIGHTED_ROW4_CATMULL_ROM: { - sampler = CatmullRom; -#if WEIGHTED_WINDOW4_HWY - samplerHWY = CatmullRomV; -#endif - } - break; - case WEIGHTED_ROW4_BSPLINE: { - sampler = BSpline; -#if WEIGHTED_WINDOW4_HWY - samplerHWY = CubicBSplineV; -#endif - } - break; - case WEIGHTED_ROW4_CUBIC: { - sampler = SimpleCubic; -#if WEIGHTED_WINDOW4_HWY - samplerHWY = SimpleCubicV; -#endif - } - break; - case WEIGHTED_ROW4_BICUBIC: { - sampler = BiCubicSpline; -#if WEIGHTED_WINDOW4_HWY - samplerHWY = BiCubicSplineV; -#endif - } - break; - case WEIGHTED_ROW4_MITCHELL: { - sampler = MitchellNetravalli; -#if WEIGHTED_WINDOW4_HWY - samplerHWY = MitchellNetravaliV; -#endif - } - break; - } - } - - void sample(const int y) override { -#if WEIGHTED_WINDOW4_HWY - const FixedTag dfx4; - const FixedTag dix4; - const FixedTag df16x4; - using VI4 = Vec; - using VF4 = Vec; - using VF16x4 = Vec; - const int mMaxWidth = this->inputWidth - 1; -#endif - - const auto src8 = reinterpret_cast(this->mSource); - auto dst16 = reinterpret_cast(reinterpret_cast(this->mDestination) + y * this->dstStride); - - const int components = Components; - - uint32_t x = 0; - -#if WEIGHTED_WINDOW4_HWY -#if !NOACCELERATED_SAMPLER - for (; x + 8 < this->outputWidth && components == 4; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - const int a = 2; - float rgb[components]; - fill(rgb, rgb + components, 0.0f); - - float kx1 = ::floorf(srcX); - float ky1 = ::floorf(srcY); - - VF4 color = Set(dfx4, 0); - - const int appendixLow[4] = {-1, 0, 1, 2}; - - VF4 srcXV = Set(dfx4, srcX); - VI4 kx1V = Set(dix4, kx1); - const VI4 appendixLowV = LoadU(dix4, appendixLow); - - #if defined(__clang__) - #pragma clang loop unroll(full) - #endif - for (int j = -a + 1; j <= a; j++) { - int yj = (int) ky1 + j; - float dy = float(srcY) - (float(ky1) + (float) j); - float yWeight = sampler(dy); - auto row = - reinterpret_cast(src8 + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride); - VF4 yWeightV = Set(dfx4, yWeight); - VI4 xi = Add(kx1V, appendixLowV); - VF4 dx = Sub(srcXV, ConvertTo(dfx4, xi)); - VF4 weights = Mul(samplerHWY(dfx4, dx), yWeightV); - - #if defined(__clang__) - #pragma clang loop unroll(full) - #endif - for (int i = 0; i < 4; ++i) { - int sizeXPos = std::clamp(ExtractLane(xi, i), 0, mMaxWidth) * components; - VF16x4 r1 = LoadU(df16x4, reinterpret_cast(&row[sizeXPos])); - VF4 fr1 = PromoteTo(dfx4, r1); - fr1 = Mul(fr1, Set(dfx4, ExtractLane(weights, i))); - color = Add(color, fr1); - } - } - - VF16x4 f16Color = DemoteTo(df16x4, color); - StoreU(f16Color, df16x4, reinterpret_cast(&dst16[x * components])); - } -#endif -#endif - - for (; x < this->outputWidth; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - - const int a = 2; - float rgb[components]; - std::fill(rgb, rgb + components, 0.0f); - - float kx1 = ::floorf(srcX); - float ky1 = ::floorf(srcY); - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int j = -a + 1; j <= a; j++) { - int yj = (int) ky1 + j; - float dy = float(srcY) - (float(ky1) + (float) j); - float yWeight = sampler(dy); - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int i = -a + 1; i <= a; i++) { - int xi = (int) kx1 + i; - float dx = float(srcX) - (float(kx1) + (float) i); - float weight = sampler(dx) * yWeight; - - auto *row = reinterpret_cast(reinterpret_cast(src8) + - std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride); - - const int px = std::clamp(xi, 0, this->inputWidth - 1) * components; - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - float clrf = hwy::F32FromF16(hwy::float16_t::FromBits(row[px + c])); - float clr = (float) clrf * weight; - rgb[c] += clr; - } - } - } - - int px = x * components; - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - float newColor = rgb[c]; - dst16[px + c] = hwy::F16FromF32(newColor).bits; - } - } - } - - ~WeightedWindow4RowSamplerF16Bit() override = default; - - private: - ScaleWeightSampler sampler; -#if WEIGHTED_WINDOW4_HWY - typedef Vec> (*ScaleWeightSamplerHWY)(FixedTag, Vec>); - ScaleWeightSamplerHWY samplerHWY; -#endif -}; - -} -HWY_AFTER_NAMESPACE(); - -#undef WEIGHTED_WINDOW4_HWY - -#endif diff --git a/src/sampler/Window6RowSampler-inl.hpp b/src/sampler/Window6RowSampler-inl.hpp deleted file mode 100644 index eff5a6f..0000000 --- a/src/sampler/Window6RowSampler-inl.hpp +++ /dev/null @@ -1,613 +0,0 @@ -// Copyright (C) 2024 Radzivon Bartoshyk -// -// This file belongs to sparkyuv project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#if defined(SPARKYUV_WINDOW6_ROW_SAMPLER) == defined(HWY_TARGET_TOGGLE) -#ifdef SPARKYUV_WINDOW6_ROW_SAMPLER -#undef SPARKYUV_WINDOW6_ROW_SAMPLER -#else -#define SPARKYUV_WINDOW6_ROW_SAMPLER -#endif - -#include -#include "ScaleRowSampler.hpp" -#include -#include -#include "sampler.h" -#include "src/math/math-inl.h" -#include - -#if HWY_TARGET != HWY_SVE && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128 -#define WEIGHTED_WINDOW6_HWY 1 -#else -#define WEIGHTED_WINDOW6_HWY 0 -#endif - -#if WEIGHTED_WINDOW6_HWY -#include "sampler-inl.h" -#endif - -HWY_BEFORE_NAMESPACE(); -namespace sparkyuv::HWY_NAMESPACE { -using namespace hwy; -using namespace hwy::HWY_NAMESPACE; -using namespace sparkyuv; - -enum WeightedRow6Operation { - WEIGHTED_ROW6_LANCZOS_SINC -}; - -template -class WeightedWindow6RowSampler10Bit : public ScaleRowSampler { - public: - WeightedWindow6RowSampler10Bit(const uint32_t *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - uint32_t *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight) : - ScaleRowSampler(mSource, - srcStride, - inputWidth, - inputHeight, - mDestination, - dstStride, - outputWidth, - outputHeight) { - switch (op) { - case WEIGHTED_ROW6_LANCZOS_SINC: { - sampler = sparkyuv::Lanczos3Sinc; - } - break; - } - } - - void sample(const int y) override { - auto dst = reinterpret_cast(reinterpret_cast(this->mDestination) + y * this->dstStride); - - for (int x = 0; x < this->outputWidth; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - - const int a = 3; - - float rgb[4] = {0, 0, 0, 0}; - - float kx1 = ::floorf(srcX); - float ky1 = ::floorf(srcY); - - float weightSum(0.0f); - - for (int j = -a + 1; j <= a; j++) { - int yj = (int) ky1 + j; - float dy = float(srcY) - (float(ky1) + (float) j); - float yWeight = sampler(dy); - for (int i = -a + 1; i <= a; i++) { - int xi = (int) kx1 + i; - float dx = float(srcX) - (float(kx1) + (float) i); - float weight = sampler(dx) * yWeight; - weightSum += weight; - - auto row = reinterpret_cast(reinterpret_cast(this->mSource) - + clamp(yj, 0, this->inputHeight - 1) * this->srcStride); - - const int px = std::clamp(xi, 0, this->inputWidth - 1); - - uint32_t color = row[px]; - - float r = 0, g = 0, b = 0, aAlpha = 0; - parseToFloat(color, r, g, b, aAlpha); - - rgb[0] += r * weight; - rgb[1] += g * weight; - rgb[2] += b * weight; - rgb[3] += aAlpha * weight; - } - } - - const int px = x; - - if (weightSum == 0.f) { - dst[px] = 0; - } else { - float revertScale = 1.f / weightSum * maxColors; - auto R10 = static_cast(std::clamp(::roundf(rgb[0] * revertScale), 0.0f, (float) maxColors)); - auto G10 = static_cast(std::clamp(::roundf(rgb[1] * revertScale), 0.0f, (float) maxColors)); - auto B10 = static_cast(std::clamp(::roundf(rgb[2] * revertScale), 0.0f, (float) maxColors)); - auto A10 = static_cast(std::clamp(::roundf(rgb[3] / weightSum * 3.f), 0.0f, 3.0f)); - - dst[x] = (A10 << 30) | (B10 << 20) | (G10 << 10) | R10; - } - } - } - - ~WeightedWindow6RowSampler10Bit() override = default; - - private: - const float maxColors = ::powf(2.0f, (float) 10.f) - 1.0f; - ScaleWeightSampler sampler; - - inline void parseToFloat(const uint32_t rgba1010102, float &r, float &g, float &b, float &a) { - const uint32_t scalarMask = (1u << 10u) - 1u; - uint32_t b1 = (rgba1010102) & scalarMask; - uint32_t g1 = (rgba1010102 >> 10) & scalarMask; - uint32_t r1 = (rgba1010102 >> 20) & scalarMask; - uint32_t a1 = (rgba1010102 >> 30) * 3; - constexpr float colorScale = 1.f / 1023.f; - constexpr float alphaScale = 1.f / 3.f; - float rFloat = static_cast(r1) * colorScale; - float gFloat = static_cast(g1) * colorScale; - float bFloat = static_cast(b1) * colorScale; - float aFloat = static_cast(a1) * alphaScale; - - r = rFloat; - g = gFloat; - b = bFloat; - a = aFloat; - } -}; - -template -class WeightedWindow6RowSampler : public ScaleRowSampler { - public: - WeightedWindow6RowSampler(const uint8_t *mSource, const int srcStride, - const int inputWidth, const int inputHeight, - uint8_t *mDestination, const int dstStride, - const int outputWidth, const int outputHeight) : - ScaleRowSampler(mSource, - srcStride, - inputWidth, inputHeight, - mDestination, dstStride, - outputWidth, outputHeight) { - switch (op) { - case WEIGHTED_ROW6_LANCZOS_SINC: { - sampler = sparkyuv::Lanczos3Sinc; -#if WEIGHTED_WINDOW6_HWY - samplerHWY = Lanczos3Sinc; -#endif - } - break; - } - } - - void sample(const int y) override { - auto dst = reinterpret_cast(reinterpret_cast(this->mDestination) + y * this->dstStride); - - const int components = Components; - -#if WEIGHTED_WINDOW6_HWY - const FixedTag dfx4; - const FixedTag dix4; - const FixedTag dux4; - const FixedTag du8x4; - using VI4 = Vec; - using VF4 = Vec; - using VU8x4 = Vec; - - const VF4 vfZeros = Zero(dfx4); - const VF4 maxColorsV = Set(dfx4, maxColors); -#endif - - uint32_t x = 0; - -#if WEIGHTED_WINDOW6_HWY -#if !NOACCELERATED_SAMPLER - for (; x + 8 < this->outputWidth && components == 4; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - - // only kernel with size 3 is supported - constexpr int kernelSize = 3; - - float kx1 = ::floorf(srcX); - float ky1 = ::floorf(srcY); - - float kWeightSum = 0; - VF4 color = Set(dfx4, 0); - - const int a = kernelSize; - const int mMaxWidth = this->inputWidth - 1; - - const int appendixLow[4] = {-2, -1, 0, 1}; - const int appendixHigh[4] = {2, 3, 0, 0}; - - VF4 srcXV = Set(dfx4, srcX); - VI4 kx1V = Set(dix4, kx1); - const VI4 appendixLowV = LoadU(dix4, appendixLow); - const VI4 appendixHighV = LoadU(dix4, appendixHigh); - - for (int j = -a + 1; j <= a; j++) { - int yj = (int) ky1 + j; - float dy = float(srcY) - (float(ky1) + (float) j); - float yWeight = sampler(dy); - auto row = reinterpret_cast(this->mSource - + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride); - VF4 yWeightV = Set(dfx4, yWeight); - VI4 xi = Add(kx1V, appendixLowV); - VF4 dx = Sub(srcXV, ConvertTo(dfx4, xi)); - VF4 sampleParameter = dx; - VF4 weights = Mul(samplerHWY(dfx4, sampleParameter), yWeightV); - kWeightSum += ExtractLane(SumOfLanes(dfx4, weights), 0); -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int i = 0; i < 4; ++i) { - int sizeXPos = std::clamp(ExtractLane(xi, i), 0, mMaxWidth) * components; - VU8x4 u81 = LoadU(du8x4, reinterpret_cast(&row[sizeXPos])); - VF4 fr1 = ConvertTo(dfx4, PromoteTo(dix4, u81)); - fr1 = Mul(fr1, Set(dfx4, ExtractLane(weights, i))); - color = Add(color, fr1); - } - - xi = Add(kx1V, appendixHighV); - dx = Sub(srcXV, ConvertTo(dfx4, xi)); - sampleParameter = dx; - weights = Mul(samplerHWY(dfx4, sampleParameter), yWeightV); -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int i = 0; i < 2; ++i) { - int sizeXPos = std::clamp(ExtractLane(xi, i), 0, mMaxWidth) * components; - VU8x4 u81 = LoadU(du8x4, reinterpret_cast(&row[sizeXPos])); - VF4 fr1 = ConvertTo(dfx4, PromoteTo(dix4, u81)); - float weight = ExtractLane(weights, i); - kWeightSum += weight; - fr1 = Mul(fr1, Set(dfx4, weight)); - color = Add(color, fr1); - } - } - - if (kWeightSum == 0) { - color = ClampRound(dfx4, color, vfZeros, maxColorsV); - VU8x4 u8Color = DemoteTo(du8x4, ConvertTo(dux4, color)); - StoreU(u8Color, du8x4, reinterpret_cast(&dst[x * components])); - } else { - color = ClampRound(dfx4, Div(color, Set(dfx4, kWeightSum)), vfZeros, - maxColorsV); - VU8x4 u8Color = DemoteTo(du8x4, ConvertTo(dux4, color)); - StoreU(u8Color, du8x4, reinterpret_cast(&dst[x * components])); - } - } -#endif -#endif - - for (; x < this->outputWidth; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - - const int a = 3; - float rgb[components]; - std::fill(rgb, rgb + components, 0.0f); - - float kx1 = ::floorf(srcX); - float ky1 = ::floorf(srcY); - - float weightSum(0.0f); - - for (int j = -a + 1; j <= a; j++) { - int yj = (int) ky1 + j; - float dy = float(srcY) - (float(ky1) + (float) j); - float yWeight = sampler(dy); - for (int i = -a + 1; i <= a; i++) { - int xi = (int) kx1 + i; - float dx = float(srcX) - (float(kx1) + (float) i); - float weight = sampler(dx) * yWeight; - weightSum += weight; - - auto row = reinterpret_cast(this->mSource - + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride); - - const int px = std::clamp(xi, 0, this->inputWidth - 1) * components; -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - auto clrf = static_cast(row[px + c]); - float clr = clrf * weight; - rgb[c] += clr; - } - } - } - - const int px = x * components; - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - dst[px + c] = static_cast(std::clamp(::roundf(rgb[c] * weightSum), 0.0f, maxColors)); - } - } - } - - ~WeightedWindow6RowSampler() override = default; - - private: - typedef Vec> (*ScaleWeightSamplerHWY)(FixedTag, Vec>); - - const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f; - ScaleWeightSampler sampler; - ScaleWeightSamplerHWY samplerHWY; -}; - -template -class WeightedWindow6RowSampler16Bit : public ScaleRowSampler { - public: - WeightedWindow6RowSampler16Bit(const uint16_t *mSource, const int srcStride, - const int inputWidth, const int inputHeight, - uint16_t *mDestination, const int dstStride, - const int outputWidth, const int outputHeight, - const int depth) : - ScaleRowSampler(mSource, - srcStride, - inputWidth, inputHeight, - mDestination, dstStride, - outputWidth, outputHeight), - maxColors(::powf(2.0f, static_cast(depth)) - 1.0f) { - switch (op) { - case WEIGHTED_ROW6_LANCZOS_SINC: { - sampler = sparkyuv::Lanczos3Sinc; - } - break; - } - } - - void sample(const int y) override { - auto dst = reinterpret_cast(reinterpret_cast(this->mDestination) + y * this->dstStride); - - const int components = Components; - - for (int x = 0; x < this->outputWidth; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - - const int a = 3; - float rgb[components]; - std::fill(rgb, rgb + components, 0.0f); - - float kx1 = ::floorf(srcX); - float ky1 = ::floorf(srcY); - - float weightSum(0.0f); - - for (int j = -a + 1; j <= a; j++) { - int yj = (int) ky1 + j; - float dy = float(srcY) - (float(ky1) + (float) j); - float yWeight = sampler(dy); - for (int i = -a + 1; i <= a; i++) { - int xi = (int) kx1 + i; - float dx = float(srcX) - (float(kx1) + (float) i); - float weight = sampler(dx) * yWeight; - weightSum += weight; - - auto row = reinterpret_cast(reinterpret_cast(this->mSource) - + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride); - - const int px = std::clamp(xi, 0, this->inputWidth - 1) * components; -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - auto clrf = static_cast(row[px + c]); - float clr = clrf * weight; - rgb[c] += clr; - } - } - } - - const int px = x * components; - const float invWeightScale = weightSum != 0.f ? 1.f / weightSum : 0.f; - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - dst[px + c] = static_cast(std::clamp(::roundf(rgb[c] * invWeightScale), 0.0f, maxColors)); - } - } - } - - ~WeightedWindow6RowSampler16Bit() override = default; - - private: - const float maxColors; - ScaleWeightSampler sampler; -}; - -template -class WeightedWindow6RowSamplerF16Bit : public ScaleRowSampler { - public: - WeightedWindow6RowSamplerF16Bit(const uint16_t *mSource, - const int srcStride, - const int inputWidth, - const int inputHeight, - uint16_t *mDestination, - const int dstStride, - const int outputWidth, - const int outputHeight) : - ScaleRowSampler(mSource, srcStride, - inputWidth, inputHeight, - mDestination, dstStride, - outputWidth, outputHeight) { - switch (op) { - case WEIGHTED_ROW6_LANCZOS_SINC: { - sampler = sparkyuv::Lanczos3Sinc; -#if WEIGHTED_WINDOW6_HWY - samplerHWY = Lanczos3Sinc; -#endif - } - break; - } - } - - void sample(const int y) override { -#if WEIGHTED_WINDOW6_HWY - const FixedTag dfx4; - const FixedTag dix4; - const FixedTag df16x4; - using VI4 = Vec; - using VF4 = Vec; - using VF16x4 = Vec; - const int mMaxWidth = this->inputWidth - 1; -#endif - - const auto src8 = reinterpret_cast(this->mSource); - auto dst16 = reinterpret_cast(reinterpret_cast(this->mDestination) + y * this->dstStride); - - const int components = Components; - - uint32_t x = 0; - -#if WEIGHTED_WINDOW6_HWY -#if !NOACCELERATED_SAMPLER - for (; x + 8 < this->outputWidth && components == 4; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - - const int a = 3; - float rgb[components]; - std::fill(rgb, rgb + components, 0.0f); - - float kx1 = ::floorf(srcX); - float ky1 = ::floorf(srcY); - - float kWeightSum = 0; - VF4 color = Set(dfx4, 0); - - const int appendixLow[4] = {-2, -1, 0, 1}; - const int appendixHigh[4] = {2, 3, 0, 0}; - - const VF4 aVector = Set(dfx4, a); - VF4 srcXV = Set(dfx4, srcX); - VI4 kx1V = Set(dix4, kx1); - const VI4 appendixLowV = LoadU(dix4, appendixLow); - const VI4 appendixHighV = LoadU(dix4, appendixHigh); - - for (int j = -a + 1; j <= a; j++) { - int yj = (int) ky1 + j; - float dy = float(srcY) - (float(ky1) + (float) j); - float yWeight = sampler(dy); - auto row = - reinterpret_cast(src8 + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride); - VF4 yWeightV = Set(dfx4, yWeight); - VI4 xi = Add(kx1V, appendixLowV); - VF4 dx = Sub(srcXV, ConvertTo(dfx4, xi)); - VF4 sampleParameter = dx; - VF4 weights = Mul(samplerHWY(dfx4, sampleParameter), yWeightV); - kWeightSum += ExtractLane(SumOfLanes(dfx4, weights), 0); - for (int i = 0; i < 4; ++i) { - int sizeXPos = clamp(ExtractLane(xi, i), 0, mMaxWidth) * components; - VF16x4 r1 = LoadU(df16x4, reinterpret_cast(&row[sizeXPos])); - VF4 fr1 = PromoteTo(dfx4, r1); - fr1 = Mul(fr1, Set(dfx4, ExtractLane(weights, i))); - color = Add(color, fr1); - } - - xi = Add(kx1V, appendixHighV); - dx = Sub(srcXV, ConvertTo(dfx4, xi)); - sampleParameter = dx; - weights = Mul(samplerHWY(dfx4, sampleParameter), yWeightV); - for (int i = 0; i < 2; ++i) { - int sizeXPos = clamp(ExtractLane(xi, i), 0, mMaxWidth) * components; - VF16x4 r1 = LoadU(df16x4, - reinterpret_cast(&row[sizeXPos])); - VF4 fr1 = PromoteTo(dfx4, r1); - float weight = ExtractLane(weights, i); - kWeightSum += weight; - fr1 = Mul(fr1, Set(dfx4, weight)); - color = Add(color, fr1); - } - } - - if (kWeightSum == 0) { - VF16x4 f16Color = DemoteTo(df16x4, color); - StoreU(f16Color, df16x4, reinterpret_cast(&dst16[x * components])); - } else { - VF16x4 f16Color = DemoteTo(df16x4, Div(color, Set(dfx4, kWeightSum))); - StoreU(f16Color, df16x4, reinterpret_cast(&dst16[x * components])); - } - } -#endif -#endif - - for (; x < this->outputWidth; ++x) { - float srcX = (float) x * this->xScale; - float srcY = (float) y * this->yScale; - - const int a = 3; - float rgb[components]; - std::fill(rgb, rgb + components, 0.0f); - - float kx1 = ::floorf(srcX); - float ky1 = ::floorf(srcY); - - float weightSum(0.0f); - - for (int j = -a + 1; j <= a; j++) { - int yj = (int) ky1 + j; - float dy = float(srcY) - (float(ky1) + (float) j); - float yWeight = sampler(dy); - for (int i = -a + 1; i <= a; i++) { - int xi = (int) kx1 + i; - float dx = float(srcX) - (float(kx1) + (float) i); - float weight = sampler(dx) * yWeight; - weightSum += weight; - - auto row = - reinterpret_cast(src8 + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride); - - const int px = std::clamp(xi, 0, this->inputWidth - 1) * components; - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - float clrf = hwy::F32FromF16(hwy::float16_t::FromBits(row[px + c])); - float clr = (float) clrf * weight; - rgb[c] += clr; - } - } - } - - const int px = x * components; - const float invWeightScale = weightSum != 0.f ? 1.f / weightSum : 0.f; - -#if defined(__clang__) -#pragma clang loop unroll(full) -#endif - for (int c = 0; c < components; ++c) { - dst16[px + c] = hwy::F16FromF32(rgb[c] * invWeightScale).bits; - } - } - } - - ~WeightedWindow6RowSamplerF16Bit() override = default; - - private: - const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f; - ScaleWeightSampler sampler; - -#if WEIGHTED_WINDOW6_HWY - typedef Vec> (*ScaleWeightSamplerHWY)(FixedTag, Vec>); - ScaleWeightSamplerHWY samplerHWY; -#endif -}; -} -HWY_AFTER_NAMESPACE(); - -#undef WEIGHTED_WINDOW6_HWY - -#endif diff --git a/src/sampler/sampler-inl.h b/src/sampler/sampler-inl.h deleted file mode 100644 index c0ab03f..0000000 --- a/src/sampler/sampler-inl.h +++ /dev/null @@ -1,406 +0,0 @@ -// Copyright (C) 2024 Radzivon Bartoshyk -// -// This file belongs to sparkyuv project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#if defined(SPARKYUV_SAMPLER_INL_H) == defined(HWY_TARGET_TOGGLE) -#ifdef SPARKYUV_SAMPLER_INL_H -#undef SPARKYUV_SAMPLER_INL_H -#else -#define SPARKYUV_SAMPLER_INL_H -#endif - -#include "hwy/highway.h" -#include "src/math/math-inl.h" - -HWY_BEFORE_NAMESPACE(); -namespace sparkyuv::HWY_NAMESPACE { -using hwy::HWY_NAMESPACE::Set; -using hwy::HWY_NAMESPACE::FixedTag; -using hwy::HWY_NAMESPACE::Vec; -using hwy::HWY_NAMESPACE::Abs; -using hwy::HWY_NAMESPACE::Mul; -using hwy::HWY_NAMESPACE::Div; -using hwy::HWY_NAMESPACE::Max; -using hwy::HWY_NAMESPACE::Min; -using hwy::HWY_NAMESPACE::Add; -using hwy::HWY_NAMESPACE::Zero; -using hwy::HWY_NAMESPACE::BitCast; -using hwy::HWY_NAMESPACE::ConvertTo; -using hwy::HWY_NAMESPACE::PromoteTo; -using hwy::HWY_NAMESPACE::DemoteTo; -using hwy::HWY_NAMESPACE::Combine; -using hwy::HWY_NAMESPACE::Rebind; -using hwy::HWY_NAMESPACE::Sub; -using hwy::HWY_NAMESPACE::LowerHalf; -using hwy::HWY_NAMESPACE::UpperHalf; -using hwy::HWY_NAMESPACE::LoadInterleaved4; -using hwy::HWY_NAMESPACE::StoreInterleaved4; -using hwy::HWY_NAMESPACE::IfThenZeroElse; -using hwy::float16_t; -using hwy::float32_t; - -using hwy::HWY_NAMESPACE::NegMulAdd; -using hwy::HWY_NAMESPACE::MulAdd; -using hwy::HWY_NAMESPACE::IfThenElse; -using hwy::HWY_NAMESPACE::MulSub; -using hwy::HWY_NAMESPACE::ApproximateReciprocal; - -template> -HWY_MATH_INLINE T -BCSplinePartOne(const D df, T x, const T B, const T C, const T tripled, const T doubled) { - x = Abs(x); - T mult = Set(df, 1.0f / 6.0f); - T r1 = NegMulAdd(Set(df, 9), B, NegMulAdd(Set(df, 6.0), C, Set(df, 12))); - T r2 = MulAdd(Set(df, 6), C, MulSub(Set(df, 12.0f), B, Set(df, 18.0f))); - T r3 = NegMulAdd(Set(df, 2), B, Set(df, 6)); - return Mul(MulAdd(r1, tripled, MulAdd(r2, doubled, r3)), mult); -} - -template> -HWY_MATH_INLINE T -BCSplinePartTwo(const D df, T x, const T B, const T C, const T tripled, const T doubled) { - x = Abs(x); - T mult = Set(df, 1.0f / 6.0f); - T r1 = MulSub(Set(df, -6.0f), C, B); - T r2 = MulAdd(Set(df, 6.0), B, Mul(Set(df, 30), C)); - T r3 = MulSub(Set(df, -12), B, Mul(Set(df, 48), C)); - T r4 = MulAdd(Set(df, 8.0), B, Mul(Set(df, 24.0f), C)); - T rr = MulAdd(r1, tripled, MulAdd(r2, doubled, MulAdd(r3, x, r4))); - return Mul(rr, mult); -} - -template> -HWY_MATH_INLINE V BCSpline(const D df, V x, const V B, const V C) { - x = Abs(x); - const V zeros = Zero(df); - const V ones = Set(df, 1.0); - const V two = Set(df, 2.0); - const V doubled = Mul(x, x); - const V tripled = Mul(doubled, x); - auto setZeroMask = x > two; - auto setP1Mask = x < ones; - auto setP2Mask = x >= ones; - V res = Zero(df); - const V p1 = BCSplinePartOne(df, x, B, C, tripled, doubled); - const V p2 = BCSplinePartTwo(df, x, B, C, tripled, doubled); - res = IfThenElse(setP1Mask, p1, zeros); - res = IfThenElse(setP2Mask, p2, res); - res = IfThenElse(setZeroMask, zeros, res); - return res; -} - -using hwy::HWY_NAMESPACE::InsertLane; -using hwy::HWY_NAMESPACE::ExtractLane; -using hwy::HWY_NAMESPACE::LoadU; - -template> -HWY_MATH_INLINE T MitchellNetravaliV(const D df, T d) { - const T C = Set(df, 1.0 / 3.0); - const T B = Set(df, 1.0 / 3.0); - return BCSpline(df, d, B, C); -} - -template> -HWY_MATH_INLINE T CubicHermiteV(const D df, T d) { - const T C = Set(df, 0.0); - const T B = Set(df, 0.0); - return BCSpline(df, d, B, C); -} - -template> -HWY_MATH_INLINE T CubicBSplineV(const D df, T d) { - const T C = Set(df, 0.0); - const T B = Set(df, 1.0); - return BCSpline(df, d, B, C); -} - -template> -HWY_MATH_INLINE T BiCubicSplineV(const D df, T x) { - const hwy::HWY_NAMESPACE::TFromD a = -0.5; - const T aVec = Set(df, a); - const T ones = Set(df, 1.0); - const T two = Set(df, 2.0); - const T three = Set(df, 3.0); - const T four = Set(df, 4.0); - const T five = Set(df, 5.0); - const T eight = Set(df, 8.0); - const T zeros = Zero(df); - x = Abs(x); - const auto zeroMask = x >= two; - const auto partOneMask = x < ones; - const T doubled = Mul(x, x); - const T triplet = Mul(doubled, x); - - const T partOne = MulAdd(Add(two, aVec), triplet, NegMulAdd(Add(aVec, three), doubled, ones)); - const T fourA = Mul(four, aVec); - const T eightA = Mul(eight, aVec); - const T fiveA = Mul(five, aVec); - const T partTwo = MulAdd(aVec, triplet, - NegMulAdd(fiveA, doubled, - MulSub(eightA, x, fourA))); - - x = IfThenElse(partOneMask, partOne, partTwo); - x = IfThenElse(zeroMask, zeros, x); - - return x; -} - -template> -HWY_MATH_INLINE T SimpleCubicV(const D df, T x) { - x = Abs(x); - const T zeros = Zero(df); - const T ones = Set(df, 1.0); - const T two = Set(df, 2.0); - const T doubled = Mul(x, x); - const T tripled = Mul(doubled, x); - auto setZeroMask = x > two; - auto setP1Mask = x < ones; - auto setP2Mask = x >= ones; - const T mSix = Set(df, 6.0f); - const T sixScale = ApproximateReciprocal(mSix); - T res = Zero(df); - const T p1 = Mul(MulAdd(MulSub(Set(df, 3), x, mSix), Mul(x, x), Set(df, 4.0f)), sixScale); - const T p2 = Mul(MulAdd(MulSub(Sub(mSix, x), x, Set(df, 12.0f)), x, Set(df, 8.0f)), sixScale); - res = IfThenElse(setP1Mask, p1, zeros); - res = IfThenElse(setP2Mask, p2, res); - res = IfThenElse(setZeroMask, zeros, res); - return res; -} - -template> -HWY_MATH_INLINE T sincV(const D d, T x) { - const T ones = Set(d, 1); - const T zeros = Zero(d); - auto maskEqualToZero = x == zeros; - T sine = hwy::HWY_NAMESPACE::Sin(d, x); - x = IfThenElse(maskEqualToZero, ones, x); - T result = Div(sine, x); - result = IfThenElse(maskEqualToZero, ones, result); - return result; -} - -template> -HWY_MATH_INLINE T LanczosWindowHWY(const D df, T x, const T a) { - auto mask = Abs(x) >= a; - T v = Mul(Set(df, M_PI), x); - T r = Mul(sincV(df, v), sincV(df, Div(v, a))); - return IfThenZeroElse(mask, r); -} - -template> -HWY_MATH_INLINE T CatmullRomV(const D df, T d) { - const T C = Set(df, 0.0); - const T B = Set(df, 0.5); - return BCSpline(df, d, B, C); -} - -using hwy::HWY_NAMESPACE::Lerp; - -template> -HWY_MATH_INLINE T Blerp(const D df, T c00, T c10, T c01, T c11, T tx, T ty) { - return Lerp(df, Lerp(df, c00, c10, tx), Lerp(df, c01, c11, tx), ty); -} - -template> -HWY_MATH_INLINE T HannWindow(const D df, const T n, const float length) { - const float size = length * 2; - const T sizeV = Set(df, size); - const T lengthV = Set(df, length); - auto mask = Abs(n) > Set(df, length); - const T piMulSize = Set(df, M_PI / size); - T res = hwy::HWY_NAMESPACE::Cos(df, Mul(piMulSize, n)); - res = Mul(Mul(res, res), ApproximateReciprocal(sizeV)); - res = IfThenZeroElse(mask, res); - return res; -} - -template> -HWY_MATH_INLINE T J1(const D df, T x) { - T p = Set(df, 0.270112271089232341485679099e+4); - T q = Set(df, 0.1e+1); - - const auto dX = Mul(x, x); - - p = MulAdd(p, dX, Set(df, -0.4695753530642995859767162166e+7)); - q = MulAdd(q, dX, Set(df, 0.1606931573481487801970916749e+4)); - - p = MulAdd(p, dX, Set(df, 0.3413234182301700539091292655e+10)); - q = MulAdd(q, dX, Set(df, 0.1501793594998585505921097578e+7)); - - p = MulAdd(p, dX, Set(df, -0.1322983480332126453125473247e+13)); - q = MulAdd(q, dX, Set(df, 0.1013863514358673989967045588e+10)); - - p = MulAdd(p, dX, Set(df, 0.2908795263834775409737601689e+15)); - q = MulAdd(q, dX, Set(df, 0.5243710262167649715406728642e+12)); - - p = MulAdd(p, dX, Set(df, -0.3588817569910106050743641413e+17)); - q = MulAdd(q, dX, Set(df, 0.2081661221307607351240184229e+15)); - - p = MulAdd(p, dX, Set(df, 0.2316433580634002297931815435e+19)); - q = MulAdd(q, dX, Set(df, 0.6092061398917521746105196863e+17)); - - p = MulAdd(p, dX, Set(df, -0.6672106568924916298020941484e+20)); - q = MulAdd(q, dX, Set(df, 0.1185770712190320999837113348e+20)); - - p = MulAdd(p, dX, Set(df, 0.581199354001606143928050809e+21)); - q = MulAdd(q, dX, Set(df, 0.11623987080032122878585294e+22)); - - const auto zeros = Zero(df); - const auto ones = Set(df, 1.0f); - q = IfThenElse(q == zeros, ones, q); - - return Div(p, q); -} - -template> -HWY_MATH_INLINE T Q1(const D df, T x) { - static const float - Pone[] = { - 0.3511751914303552822533318e+3, - 0.7210391804904475039280863e+3, - 0.4259873011654442389886993e+3, - 0.831898957673850827325226e+2, - 0.45681716295512267064405e+1, - 0.3532840052740123642735e-1 - }, - Qone[] = { - 0.74917374171809127714519505e+4, - 0.154141773392650970499848051e+5, - 0.91522317015169922705904727e+4, - 0.18111867005523513506724158e+4, - 0.1038187585462133728776636e+3, - 0.1e+1 - }; - - T p = Set(df, Pone[5]); - T q = Set(df, Qone[5]); - - const auto zeros = Zero(df); - const auto ones = Set(df, 1.0f); - - const auto eights = Set(df, 8.0); - - x = IfThenElse(x == zeros, ones, x); - - const auto recX = Div(eights, x); - - const auto dX = Mul(recX, recX); - - for (int i = 4; i >= 0; i--) { - p = MulAdd(p, dX, Set(df, Pone[i])); - q = MulAdd(q, dX, Set(df, Pone[i])); - } - - q = IfThenElse(q == zeros, ones, q); - - auto res = Div(p, q); - res = IfThenElse(x == zeros, zeros, res); - return res; -} - -template> -HWY_MATH_INLINE T P1(const D df, T x) { - static const float - Pone[] = { - 0.352246649133679798341724373e+5, - 0.62758845247161281269005675e+5, - 0.313539631109159574238669888e+5, - 0.49854832060594338434500455e+4, - 0.2111529182853962382105718e+3, - 0.12571716929145341558495e+1 - }, - Qone[] = { - 0.352246649133679798068390431e+5, - 0.626943469593560511888833731e+5, - 0.312404063819041039923015703e+5, - 0.4930396490181088979386097e+4, - 0.2030775189134759322293574e+3, - 0.1e+1 - }; - - T p = Set(df, Pone[5]); - T q = Set(df, Qone[5]); - - const auto zeros = Zero(df); - const auto ones = Set(df, 1.0f); - - const auto eights = Set(df, 8.0); - - x = IfThenElse(x == zeros, ones, x); - - const auto recX = Div(eights, x); - - const auto dX = Mul(recX, recX); - - for (int i = 4; i >= 0; i--) { - p = MulAdd(p, dX, Set(df, Pone[i])); - q = MulAdd(q, dX, Set(df, Pone[i])); - } - - q = IfThenElse(q == zeros, ones, q); - - auto res = Div(p, q); - res = IfThenElse(x == zeros, zeros, res); - return res; -} - -template> -HWY_MATH_INLINE T BesselOrderOne(const D df, T x) { - auto p = x; - x = Abs(x); - const auto minZValue = Set(df, 1e-8); - auto zerosMask = x < minZValue; - auto res = Mul(J1(df, x), p); - const auto zeros = Zero(df); - res = IfThenElse(zerosMask, zeros, res); - return res; -} - -template> -HWY_MATH_INLINE T jinc(const D d, T x) { - const T ones = Set(d, 1); - const T zeros = Zero(d); - auto maskEqualToZero = x == zeros; - const auto minZValue = Set(d, 1e-8); - auto zerosMask = x < minZValue; - x = IfThenElse(maskEqualToZero, ones, x); - const T pi = Set(d, M_PI); - T result = Div(BesselOrderOne(d, Mul(pi, x)), x); - result = IfThenElse(maskEqualToZero, zeros, result); - result = IfThenElse(zerosMask, Set(d, 0.5 * M_PI), result); - return result; -} - -template> -HWY_MATH_INLINE T LanczosJinc(const D df, T x, const T a) { - auto mask = Abs(x) >= a; - T v = Mul(Set(df, M_PI), x); - T r = Mul(jinc(df, v), jinc(df, Div(v, a))); - return IfThenZeroElse(mask, r); -} - -template> -HWY_MATH_INLINE T Lanczos3Jinc(const D df, T x) { - return LanczosJinc(df, x, Set(df, 3.0f)); -} - -template> -HWY_MATH_INLINE T Lanczos3Sinc(const D df, T x) { - return LanczosWindowHWY(df, x, Set(df, 3.0f)); -} -} -HWY_AFTER_NAMESPACE(); - -#endif \ No newline at end of file diff --git a/src/sampler/sampler.h b/src/sampler/sampler.h deleted file mode 100644 index ff5598e..0000000 --- a/src/sampler/sampler.h +++ /dev/null @@ -1,174 +0,0 @@ -// Copyright (C) 2024 Radzivon Bartoshyk -// -// This file belongs to sparkyuv project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SPARKYUV_SAMPLER_H_ONCE -#define SPARKYUV_SAMPLER_H_ONCE - -#ifdef _MSC_VER -#define _USE_MATH_DEFINES -#include -#endif -#include - -#ifndef M_PI -#define M_PI 3.14159265358979323846 -#endif - -namespace sparkyuv { -using namespace std; - -template -static inline D PromoteTo(T t, float maxColors) { - D result = static_cast((float) t / maxColors); - return result; -} - -template -static inline D DemoteTo(T t, float maxColors) { - return (D) clamp(((float) t * (float) maxColors), 0.0f, (float) maxColors); -} - -template -static inline float BCSpline(T x, const T B, const T C) { - if (x < 0.0f) x = -x; - - const T dp = x * x; - const T tp = dp * x; - - if (x < 1.0f) - return ((12 - 9 * B - 6 * C) * tp + (-18 + 12 * B + 6 * C) * dp + (6 - 2 * B)) * - (T(1) / T(6)); - else if (x < 2.0f) - return ((-B - 6 * C) * tp + (6 * B + 30 * C) * dp + (-12 * B - 48 * C) * x + - (8 * B + 24 * C)) * (T(1) / T(6)); - - return (0.0f); -} - -template -static inline T SimpleCubic(T x) { - if (x < 0.0f) x = -x; - - if (x < 1.0f) - return (4.0f + x * x * (3.0f * x - 6.0f)) / 6.0f; - else if (x < 2.0f) - return (8.0f + x * (-12.0f + x * (6.0f - x))) / 6.0f; - - return (0.0f); -} - -template -static inline T BiCubicSpline(T x) { - const T a = -0.5; - const T modulo = abs(x); - if (modulo >= 2) { - return 0; - } - const T floatd = modulo * modulo; - const T triplet = floatd * modulo; - if (modulo <= 1) { - return (a + T(2.0))*triplet - (a + T(3.0)) * floatd + T(1.0); - } - return a * triplet - T(5.0) * a * floatd + T(8.0) * a * modulo - T(4.0) * a; -} - -template -static inline T CubicHermite(T x) { - constexpr T C = T(0.0); - constexpr T B = T(0.0); - return BCSpline(x, B, C); -} - -template -static inline float BSpline(T x) { - constexpr T C = T(0.0); - constexpr T B = T(1.0); - return BCSpline(x, B, C); -} - -template -static inline float MitchellNetravalli(T x) { - constexpr T B = 1.0f / 3.0f; - constexpr T C = 1.0f / 3.0f; - return BCSpline(x, B, C); -} - -template -static inline T sinc(T x) { - if (x == 0.0) { - return T(1.0); - } else { - return sin(x) / x; - } -} - -template -static inline T LanczosWindow(T x, const T a) { - if (abs(x) < a) { - return sinc(T(M_PI) * x) * sinc(T(M_PI) * x / a); - } - return T(0.0); -} - -template -static inline T fastCos(T x) { - constexpr T C0 = 0.99940307; - constexpr T C1 = -0.49558072; - constexpr T C2 = 0.03679168; - constexpr T C3 = -0.00434102; - - while (x < -2 * M_PI) { - x += 2.0 * M_PI; - } - while (x > 2 * M_PI) { - x -= 2.0 * M_PI; - } - - // Calculate cos(x) using Chebyshev polynomial approximation - T x2 = x * x; - T result = C0 + x2 * (C1 + x2 * (C2 + x2 * C3)); - return result; -} - -template -static inline T CatmullRom(T x) { - return BCSpline(x, 0.0f, 0.5f); -} - -template -static inline T HannWindow(const T n, const T length) { - const T size = length * 2; - const T part = M_PI / size; - if (abs(n) > length) { - return 0; - } - T r = cos(n * part); - r = r * r; - return r / size; -} - -template -static inline T blerp(T c00, T c10, T c01, T c11, T tx, T ty) { - return lerp(lerp(c00, c10, tx), lerp(c01, c11, tx), ty); -} - -template -static inline T Lanczos3Sinc(T x) { - return LanczosWindow(x, 3.0f); -} - -} -#endif // SPARKYUV_SAMPLER_H_ONCE \ No newline at end of file diff --git a/tools/bench.h b/tools/bench.h index 89adeb6..a1611b7 100644 --- a/tools/bench.h +++ b/tools/bench.h @@ -17,6 +17,7 @@ #pragma once #include +#include static void bench(int iterations, const char *color, const char *mark, const std::function &func) { double totalTime = 0; diff --git a/tools/bench/YuvBenchmarkBase.cpp b/tools/bench/YuvBenchmarkBase.cpp index 585efd6..6e835c4 100644 --- a/tools/bench/YuvBenchmarkBase.cpp +++ b/tools/bench/YuvBenchmarkBase.cpp @@ -267,4 +267,4 @@ void SparkyuvFastGuassianRGBA(benchmark::State &state) { for (auto _ : state) { sparkyuv::FastGaussianBlurRGBA(reinterpret_cast(rgbaData.data()), rgbaStride, inWidth, inHeight, 15); } -} \ No newline at end of file +} diff --git a/tools/main.cpp b/tools/main.cpp index 88fe41b..0b30904 100644 --- a/tools/main.cpp +++ b/tools/main.cpp @@ -317,10 +317,6 @@ int main() { sparkyuv::TransposeClockwiseRGBA(rgbaData.data(), rgbaStride, transposed.data(), trnsStride, width, height); }); - bench(1, ANSI_COLOR_YELLOW, "Fast Blur", [&]() { - sparkyuv::FastGaussianBlurRGBA(rgbaData.data(), rgbaStride, width, height, 25); - }); - // std::vector f16Store(width * 4 * sizeof(uint16_t) * height); // // sparkyuv::RGBAToRGBAF16(rgbaData.data(),