diff --git a/CMakeLists.txt b/CMakeLists.txt
index eb3d358..3780085 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,7 +24,6 @@ set(SPARKYUV_SOURCES
         src/ChannelLength.cpp
         src/YCbCrP16.cpp
         src/ChannelsReformat.cpp
-        src/Scale.cpp
         src/NV12Flyer.cpp
         src/NV16Flyer.cpp
         src/NV24Flyer.cpp
@@ -44,7 +43,6 @@ set(SPARKYUV_SOURCES
         src/Rotate.cpp
         src/FastGaussian.cpp
         src/FastGaussian.h
-        src/GaussianBlur.cpp
         src/FastGaussianNeon.cpp
         src/FastGaussianNeon.h)
 
@@ -53,8 +51,7 @@ set(HWY_SOURCES
         highway/hwy/nanobenchmark.cc highway/hwy/per_target.cc highway/hwy/timer.cc
         highway/hwy/abort.cc
         src/Eotf.cpp
-        src/Eotf-inl.h
-        src/GaussianBlur-inl.h)
+        src/Eotf-inl.h)
 
 if (BUILD_SHARED)
     add_library(sparkyuv SHARED ${SPARKYUV_SOURCES} ${HWY_SOURCES})
diff --git a/include/sparkyuv-basic.h b/include/sparkyuv-basic.h
index 0671c05..659eecf 100644
--- a/include/sparkyuv-basic.h
+++ b/include/sparkyuv-basic.h
@@ -451,129 +451,6 @@ void FastGaussianNextBlurBGRAF16(uint16_t *data, uint32_t stride, uint32_t width
 void FastGaussianNextBlurBGRF16(uint16_t *data, uint32_t stride, uint32_t width, uint32_t height, int radius);
 #endif
 
-/**
- * Gaussian Blur.
- * Not approximation just a gaussian blur, use when antialias or clear gaussian methods is needed.
- * In-place use allowed
- */
-
-void GaussianBlurRGBA(const uint8_t *src, uint32_t srcStride,
-                      uint8_t *dst, uint32_t dstStride,
-                      uint32_t width, uint32_t height,
-                      int kernelSize, float sigma);
-void GaussianBlurRGB(const uint8_t *src, uint32_t srcStride,
-                     uint8_t *dst, uint32_t dstStride,
-                     uint32_t width, uint32_t height,
-                     int kernelSize, float sigma);
-void GaussianBlurChannel(const uint8_t *src, uint32_t srcStride,
-                         uint8_t *dst, uint32_t dstStride,
-                         uint32_t width, uint32_t height,
-                         int kernelSize, float sigma);
-
-void GaussianBlurRGBA16(const uint16_t *src, uint32_t srcStride,
-                        uint16_t *dst, uint32_t dstStride,
-                        uint32_t width, uint32_t height,
-                        int kernelSize, float sigma);
-void GaussianBlurRGB16(const uint16_t *src, uint32_t srcStride,
-                       uint16_t *dst, uint32_t dstStride,
-                       uint32_t width, uint32_t height,
-                       int kernelSize, float sigma);
-void GaussianBlurChannel16(const uint16_t *src, uint32_t srcStride,
-                           uint16_t *dst, uint32_t dstStride,
-                           uint32_t width, uint32_t height,
-                           int kernelSize, float sigma);
-
-void GaussianBlurRGBAF16(const uint16_t *src, uint32_t srcStride,
-                         uint16_t *dst, uint32_t dstStride,
-                         uint32_t width, uint32_t height,
-                         int kernelSize, float sigma);
-void GaussianBlurRGBF16(const uint16_t *src, uint32_t srcStride,
-                        uint16_t *dst, uint32_t dstStride,
-                        uint32_t width, uint32_t height,
-                        int kernelSize, float sigma);
-void GaussianBlurChannelF16(const uint16_t *src, uint32_t srcStride,
-                            uint16_t *dst, uint32_t dstStride,
-                            uint32_t width, uint32_t height,
-                            int kernelSize, float sigma);
-
-void GaussianBlurRGBAF32(const float *src, uint32_t srcStride,
-                         float *dst, uint32_t dstStride,
-                         uint32_t width, uint32_t height,
-                         int kernelSize, float sigma);
-void GaussianBlurRGBF32(const float *src, uint32_t srcStride,
-                        float *dst, uint32_t dstStride,
-                        uint32_t width, uint32_t height,
-                        int kernelSize, float sigma);
-void GaussianBlurChannelF32(const float *src, uint32_t srcStride,
-                            float *dst, uint32_t dstStride,
-                            uint32_t width, uint32_t height,
-                            int kernelSize, float sigma);
-
-/**
- * Scaling functions
- */
-
-// Mark scale U8
-
-void ScaleRGB(const uint8_t *input, uint32_t srcStride,
-              uint32_t inputWidth, uint32_t inputHeight,
-              uint8_t *output, uint32_t dstStride,
-              uint32_t outputWidth, uint32_t outputHeight,
-              SparkYuvSampler option);
-void ScaleRGBA(const uint8_t *input, uint32_t srcStride,
-               uint32_t inputWidth, uint32_t inputHeight,
-               uint8_t *output, uint32_t dstStride,
-               uint32_t outputWidth, uint32_t outputHeight,
-               SparkYuvSampler option);
-void ScaleChannel(const uint8_t *input, uint32_t srcStride,
-                  uint32_t inputWidth, uint32_t inputHeight,
-                  uint8_t *output, uint32_t dstStride,
-                  uint32_t outputWidth, uint32_t outputHeight,
-                  SparkYuvSampler option);
-// Mark scale F16
-
-
-void ScaleRGBF16(const uint16_t *input, uint32_t srcStride,
-                 uint32_t inputWidth, uint32_t inputHeight,
-                 uint16_t *output, uint32_t dstStride,
-                 uint32_t outputWidth, uint32_t outputHeight,
-                 SparkYuvSampler option);
-void ScaleRGBAF16(const uint16_t *input, uint32_t srcStride,
-                  uint32_t inputWidth, uint32_t inputHeight,
-                  uint16_t *output, uint32_t dstStride,
-                  uint32_t outputWidth, uint32_t outputHeight,
-                  SparkYuvSampler option);
-
-void ScaleChannelF16(const uint16_t *input, uint32_t srcStride,
-                     uint32_t inputWidth, uint32_t inputHeight,
-                     uint16_t *output, uint32_t dstStride,
-                     uint32_t outputWidth, uint32_t outputHeight,
-                     SparkYuvSampler option);
-
-// Mark: Scale RGBA1010102
-
-void ScaleRGBA1010102(const uint8_t *input, uint32_t srcStride,
-                      uint32_t inputWidth, uint32_t inputHeight,
-                      uint8_t *output, uint32_t dstStride,
-                      uint32_t outputWidth, uint32_t outputHeight,
-                      SparkYuvSampler option);
-
-void ScaleRGBA16(const uint16_t *input, uint32_t srcStride,
-                 uint32_t inputWidth, uint32_t inputHeight,
-                 uint16_t *output, uint32_t dstStride,
-                 uint32_t outputWidth, uint32_t outputHeight,
-                 int depth, SparkYuvSampler option);
-void ScaleRGB16(const uint16_t *input, uint32_t srcStride,
-                uint32_t inputWidth, uint32_t inputHeight,
-                uint16_t *output, uint32_t dstStride,
-                uint32_t outputWidth, uint32_t outputHeight,
-                int depth, SparkYuvSampler option);
-void ScaleChannel16(const uint16_t *input, uint32_t srcStride,
-                    uint32_t inputWidth, uint32_t inputHeight,
-                    uint16_t *output, uint32_t dstStride,
-                    uint32_t outputWidth, uint32_t outputHeight,
-                    int depth, SparkYuvSampler option);
-
 /**
  * Convert from U8 to F16
  */
diff --git a/src/FastGaussianNeon.cpp b/src/FastGaussianNeon.cpp
index 6891f9a..777fa48 100644
--- a/src/FastGaussianNeon.cpp
+++ b/src/FastGaussianNeon.cpp
@@ -76,7 +76,6 @@ void VerticalGaussianPassRGBANeon(uint8_t *data,
         src[px + 2] = vget_lane_u8(p8, 2);
 
         int32x4_t bufferValue1 = vld1q_s32(reinterpret_cast<const int *>(&buffer[arrIndex][0]));
-
         int32x4_t bufferValue2 = vld1q_s32(reinterpret_cast<const int *>(&buffer[dArrIndex][0]));
         bufferValue2 = vshlq_n_s32(bufferValue2, 1);
 
diff --git a/src/GaussianBlur-inl.h b/src/GaussianBlur-inl.h
deleted file mode 100644
index 2ab523b..0000000
--- a/src/GaussianBlur-inl.h
+++ /dev/null
@@ -1,697 +0,0 @@
-// Copyright (C) 2024 Radzivon Bartoshyk
-//
-// This file belongs to sparkyuv project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(SPARKYUV_FLIP_INL_H) == defined(HWY_TARGET_TOGGLE)
-#ifdef SPARKYUV_FLIP_INL_H
-#undef SPARKYUV_FLIP_INL_H
-#else
-#define SPARKYUV_FLIP_INL_H
-#endif
-
-#include "hwy/highway.h"
-#include "yuv-inl.h"
-#include "sparkyuv-internal.h"
-#include "math/gaussian.h"
-#include "hwy/aligned_allocator.h"
-#include "concurrency.hpp"
-#include "TypeSupport.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace sparkyuv::HWY_NAMESPACE {
-
-using namespace hwy;
-using namespace hwy::HWY_NAMESPACE;
-
-template<class T, SparkYuvSurfaceChannels Surface,
-    typename std::enable_if<Surface == sparkyuv::SURFACE_CHANNELS_4, int>::type = 0,
-    typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type = 0>
-void
-GaussianBlurHorizontalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride,
-                           T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride,
-                           const uint32_t startY, const uint32_t endY,
-                           const uint32_t width, const uint32_t /* height */,
-                           const float *mKernel, const int kernelSize) {
-  const int halfOfKernel = kernelSize / 2;
-  const bool isEven = kernelSize % 2 == 0;
-  const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel;
-
-  auto mDst = reinterpret_cast<uint8_t *>(mDestination);
-  int maxWidth = static_cast<int>(width) - 1;
-  int sZero = 0;
-
-  const FixedTag<uint8_t, 16> d8x16;
-  const Half<decltype(d8x16)> dh8;
-  const Rebind<uint16_t, decltype(dh8)> d16;
-  const FixedTag<uint8_t, 4> d8x4;
-  const FixedTag<uint32_t, 4> d32;
-  const FixedTag<float, 4> df;
-  using VF = Vec<decltype(df)>;
-
-  for (uint32_t y = startY; y < endY; ++y) {
-    auto dst = reinterpret_cast<T *>(mDst + dstStride * y);
-    for (uint32_t x = 0; x < width; ++x) {
-      int r = -halfOfKernel;
-      VF acc = Zero(df);
-      auto localSource = reinterpret_cast<const T *>(reinterpret_cast<const uint8_t *>(mSource) + y * srcStride);
-      auto kx = static_cast<int>(x);
-
-      for (; r + 4 <= maxKernel && kx + r + 4 < width; r += 4) {
-        int sourcePX = std::clamp(kx + r, sZero, maxWidth) * 4;
-        auto vx = LoadU(d8x16, &localSource[sourcePX]);
-        auto i1 = ConvertTo(df, PromoteLowerTo(d32, LowerHalf(vx)));
-        auto i2 = ConvertTo(df, PromoteLowerTo(d32, UpperHalf(dh8, vx)));
-        auto i3 = ConvertTo(df, PromoteUpperTo(d32, PromoteTo(d16, LowerHalf(vx))));
-        auto i4 = ConvertTo(df, PromoteUpperTo(d32, PromoteTo(d16, UpperHalf(dh8, vx))));
-
-        float weight1 = mKernel[halfOfKernel + r];
-        acc = MulAdd(i1, Set(df, weight1), acc);
-
-        float weight2 = mKernel[halfOfKernel + r + 1];
-        acc = MulAdd(i2, Set(df, weight2), acc);
-
-        float weight3 = mKernel[halfOfKernel + r + 2];
-        acc = MulAdd(i3, Set(df, weight3), acc);
-
-        float weight4 = mKernel[halfOfKernel + r + 3];
-        acc = MulAdd(i4, Set(df, weight4), acc);
-      }
-
-      for (; r <= maxKernel; ++r) {
-        float weight = mKernel[halfOfKernel + r];
-        int sourcePX = std::clamp(kx + r, sZero, maxWidth) * 4;
-        auto vx = ConvertTo(df, PromoteTo(d32, LoadU(d8x4, &localSource[sourcePX])));
-        acc = MulAdd(vx, Set(df, weight), acc);
-      }
-      acc = Round(acc);
-      auto newPX = DemoteTo(d8x4, ConvertTo(d32, acc));
-      StoreU(newPX, d8x4, dst);
-      dst += 4;
-    }
-  }
-}
-
-template<class T, SparkYuvSurfaceChannels Surface,
-    typename std::enable_if<Surface == sparkyuv::SURFACE_CHANNELS_4, int>::type = 0,
-    typename std::enable_if<!std::is_same<T, uint8_t>::value, int>::type = 0,
-    ENABLE_TYPE_IS_F16(T)>
-void
-GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride,
-                         T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride,
-                         const uint32_t startY, const uint32_t endY,
-                         const uint32_t width, const uint32_t height,
-                         const float *mKernel, const int kernelSize) {
-  const int halfOfKernel = kernelSize / 2;
-  const bool isEven = kernelSize % 2 == 0;
-  const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel;
-
-  auto mDst = reinterpret_cast<uint8_t *>(mDestination);
-  int64_t maxHeight = static_cast<int>(height) - 1;
-
-  const FixedTag<uint16_t, 8> d16x8;
-  const FixedTag<uint16_t, 4> d16x4;
-  const FixedTag<hwy::float16_t, 8> df16;
-  const FixedTag<hwy::float16_t, 4> df16x4;
-  const FixedTag<hwy::float32_t, 4> df;
-  using VF = Vec<decltype(df)>;
-
-  for (uint32_t y = startY; y < endY; ++y) {
-    auto dst = reinterpret_cast<T *>(mDst + dstStride * y);
-    for (uint32_t x = 0; x < width; ++x) {
-      int r = -halfOfKernel;
-
-      VF accumulator = Zero(df);
-
-      auto kx = static_cast<int>(x) * 4;
-
-      for (; r <= maxKernel; ++r) {
-        uint32_t shiftX = std::clamp(static_cast<int64_t>(y) + static_cast<int64_t>(r),
-                                     static_cast<int64_t>(0),
-                                     static_cast<int64_t>(maxHeight));
-        auto localSource = reinterpret_cast<const T *>(reinterpret_cast<const uint8_t *>(mSource) + shiftX * srcStride);
-        float weight = mKernel[halfOfKernel + r];
-        VF pixelData;
-#if SPARKYUV_ALLOW_FLOAT16
-        const auto pxf16 = LoadU(df16x4, &localSource[kx]);
-        pixelData = PromoteTo(df, pxf16);
-#else
-        const auto pxf16 = BitCast(df16x4, LoadU(d16x4, reinterpret_cast<const uint16_t*>(&localSource[kx])));
-        pixelData = PromoteTo(df, pxf16);
-#endif
-        accumulator = MulAdd(pixelData, Set(df, weight), accumulator);
-      }
-
-#if SPARKYUV_ALLOW_FLOAT16
-      StoreU(DemoteTo(df16x4, accumulator), df16x4, dst);
-#else
-      auto duStore = BitCast(d16x4, DemoteTo(df16x4, accumulator));
-      StoreU(duStore, d16x4, reinterpret_cast<uint16_t*>(dst));
-#endif
-
-      dst += 4;
-    }
-  }
-}
-
-template<class T, SparkYuvSurfaceChannels Surface,
-    typename std::enable_if<Surface == sparkyuv::SURFACE_CHANNELS_4, int>::type = 0,
-    typename std::enable_if<!std::is_same<T, uint8_t>::value, int>::type = 0,
-    ENABLE_TYPE_IS_F16(T)>
-void
-GaussianBlurHorizontalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride,
-                           T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride,
-                           const uint32_t startY, const uint32_t endY,
-                           const uint32_t width, const uint32_t /* height */,
-                           const float *mKernel, const int kernelSize) {
-  const int halfOfKernel = kernelSize / 2;
-  const bool isEven = kernelSize % 2 == 0;
-  const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel;
-
-  auto mDst = reinterpret_cast<uint8_t *>(mDestination);
-  int maxWidth = static_cast<int>(width) - 1;
-  int sZero = 0;
-
-  const FixedTag<uint16_t, 8> d16x8;
-  const FixedTag<uint16_t, 4> d16x4;
-  const FixedTag<hwy::float16_t, 8> df16;
-  const FixedTag<hwy::float16_t, 4> df16x4;
-  const FixedTag<hwy::float32_t, 4> df;
-  using VF = Vec<decltype(df)>;
-
-  for (uint32_t y = startY; y < endY; ++y) {
-    auto dst = reinterpret_cast<T *>(mDst + dstStride * y);
-    auto localSource = reinterpret_cast<const T *>(reinterpret_cast<const uint8_t *>(mSource) + y * srcStride);
-    for (uint32_t x = 0; x < width; ++x) {
-      int r = -halfOfKernel;
-
-      VF accumulator = Zero(df);
-      auto kx = static_cast<int>(x);
-
-      for (; r + 2 <= maxKernel && kx + x + 2 < width; r += 2) {
-        int sourcePX = std::clamp(kx + r, sZero, maxWidth) * 4;
-        auto movedSrc = localSource + sourcePX;
-        const float weight1 = mKernel[halfOfKernel + r];
-        const float weight2 = mKernel[halfOfKernel + r + 1];
-        VF pixelData1;
-        VF pixelData2;
-
-#if SPARKYUV_ALLOW_FLOAT16
-        const auto pxf16 = LoadU(df16, movedSrc);
-        pixelData1 = PromoteLowerTo(df, pxf16);
-        pixelData2 = PromoteUpperTo(df, pxf16);
-#else
-        const auto pxf16 = BitCast(df16, LoadU(d16x8, reinterpret_cast<const uint16_t*>(movedSrc)));
-        pixelData1 = PromoteLowerTo(df, pxf16);
-        pixelData2 = PromoteUpperTo(df, pxf16);
-#endif
-
-        accumulator = MulAdd(pixelData1, Set(df, weight1), accumulator);
-        accumulator = MulAdd(pixelData2, Set(df, weight2), accumulator);
-      }
-
-      for (; r <= maxKernel; ++r) {
-        float weight = mKernel[halfOfKernel + r];
-        int sourcePX = std::clamp(kx + r, sZero, maxWidth) * 4;
-        auto movedSrc = localSource + sourcePX;
-        VF pixelData;
-#if SPARKYUV_ALLOW_FLOAT16
-        const auto pxf16 = LoadU(df16x4, movedSrc);
-        pixelData = PromoteTo(df, pxf16);
-#else
-        const auto pxf16 = BitCast(df16x4, LoadU(d16x4, reinterpret_cast<const uint16_t*>(movedSrc)));
-        pixelData = PromoteTo(df, pxf16);
-#endif
-        accumulator = MulAdd(pixelData, Set(df, weight), accumulator);
-      }
-
-#if SPARKYUV_ALLOW_FLOAT16
-      StoreU(DemoteTo(df16x4, accumulator), df16x4, dst);
-#else
-      auto duStore = BitCast(d16x4, DemoteTo(df16x4, accumulator));
-      StoreU(duStore, d16x4, reinterpret_cast<uint16_t*>(dst));
-#endif
-
-      dst += 4;
-    }
-  }
-}
-
-template<class T, SparkYuvSurfaceChannels Surface,
-    typename std::enable_if<Surface == sparkyuv::SURFACE_CHANNELS_4, int>::type = 0,
-    typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type = 0,
-    ENABLE_TYPE_IS_F16(T)>
-void
-GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride,
-                         T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride,
-                         const uint32_t startY, const uint32_t endY,
-                         const uint32_t width, const uint32_t height,
-                         const float *mKernel, const int kernelSize) {
-  const int halfOfKernel = kernelSize / 2;
-  const bool isEven = kernelSize % 2 == 0;
-  const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel;
-
-  auto mDst = reinterpret_cast<uint8_t *>(mDestination);
-  int64_t maxHeight = static_cast<int>(height) - 1;
-
-  const FixedTag<uint8_t, 4> d8x4;
-  const FixedTag<uint32_t, 4> d32;
-  const FixedTag<float, 4> df;
-  using VF = Vec<decltype(df)>;
-
-  for (uint32_t y = startY; y < endY; ++y) {
-    auto dst = reinterpret_cast<T *>(mDst + dstStride * y);
-    for (uint32_t x = 0; x < width; ++x) {
-      int r = -halfOfKernel;
-      VF acc = Zero(df);
-
-      for (; r <= maxKernel; ++r) {
-        uint32_t shiftX = std::clamp(static_cast<int64_t>(y) + static_cast<int64_t>(r),
-                                     static_cast<int64_t>(0),
-                                     static_cast<int64_t>(maxHeight));
-        auto localSource = reinterpret_cast<const T *>(reinterpret_cast<const uint8_t *>(mSource) + shiftX * srcStride);
-        float weight = mKernel[halfOfKernel + r];
-        uint32_t sourcePX = x * 4;
-        auto vx = ConvertTo(df, PromoteTo(d32, LoadU(d8x4, &localSource[sourcePX])));
-        acc = MulAdd(vx, Set(df, weight), acc);
-      }
-      acc = Round(acc);
-      auto newPX = DemoteTo(d8x4, ConvertTo(d32, acc));
-      StoreU(newPX, d8x4, dst);
-      dst += 4;
-    }
-  }
-}
-
-template<class T, SparkYuvSurfaceChannels Surface,
-    typename std::enable_if<Surface == sparkyuv::SURFACE_CHANNELS_4, int>::type = 0,
-    typename std::enable_if<!std::is_same<T, uint8_t>::value, int>::type = 0,
-    ENABLE_TYPE_IS_NOT_F16(T)>
-void
-GaussianBlurHorizontalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride,
-                           T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride,
-                           const uint32_t startY, const uint32_t endY,
-                           const uint32_t width, const uint32_t /* height */,
-                           const float *mKernel, const int kernelSize) {
-  const int halfOfKernel = kernelSize / 2;
-  const bool isEven = kernelSize % 2 == 0;
-  const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel;
-
-  auto mDst = reinterpret_cast<uint8_t *>(mDestination);
-  int maxWidth = static_cast<int>(width) - 1;
-  int sZero = 0;
-
-  for (uint32_t y = startY; y < endY; ++y) {
-    auto dst = reinterpret_cast<T *>(mDst + dstStride * y);
-    for (uint32_t x = 0; x < width; ++x) {
-      int r = -halfOfKernel;
-      float accumulator1 = 0.f;
-      float accumulator2 = 0.f;
-      float accumulator3 = 0.f;
-      float accumulator4 = 0.f;
-      auto localSource = reinterpret_cast<const T *>(reinterpret_cast<const uint8_t *>(mSource) + y * srcStride);
-      auto kx = static_cast<int>(x);
-      for (; r <= maxKernel; ++r) {
-        float weight = mKernel[halfOfKernel + r];
-        int sourcePX = std::clamp(kx + r, sZero, maxWidth) * 4;
-        auto movedSrc = localSource + sourcePX;
-        // Stupid workaround to avoid errors where hwy f16 not really properly works
-        accumulator1 += LoadFloat<T>(&movedSrc[0]) * weight;
-        accumulator2 += LoadFloat<T>(&movedSrc[1]) * weight;
-        accumulator3 += LoadFloat<T>(&movedSrc[2]) * weight;
-        accumulator4 += LoadFloat<T>(&movedSrc[3]) * weight;
-      }
-      if (!std::is_same<hwy::float16_t, T>::value) {
-        StoreRoundedFloat(&dst[0], accumulator1);
-        StoreRoundedFloat(&dst[1], accumulator2);
-        StoreRoundedFloat(&dst[2], accumulator3);
-        StoreRoundedFloat(&dst[3], accumulator4);
-      } else {
-        StoreFloat(&dst[0], accumulator1);
-        StoreFloat(&dst[1], accumulator2);
-        StoreFloat(&dst[2], accumulator3);
-        StoreFloat(&dst[3], accumulator4);
-      }
-      dst += 4;
-    }
-  }
-}
-
-template<class T, SparkYuvSurfaceChannels Surface,
-    typename std::enable_if<Surface == sparkyuv::SURFACE_CHANNELS_3, int>::type = 0>
-void
-GaussianBlurHorizontalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride,
-                           T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride,
-                           const uint32_t startY, const uint32_t endY,
-                           const uint32_t width, const uint32_t /* height */,
-                           const float *mKernel, const int kernelSize) {
-  const int halfOfKernel = kernelSize / 2;
-  const bool isEven = kernelSize % 2 == 0;
-  const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel;
-
-  auto mDst = reinterpret_cast<uint8_t *>(mDestination);
-  int maxWidth = static_cast<int>(width) - 1;
-  int sZero = 0;
-
-  for (uint32_t y = startY; y < endY; ++y) {
-    auto dst = reinterpret_cast<T *>(mDst + dstStride * y);
-    for (uint32_t x = 0; x < width; ++x) {
-      int r = -halfOfKernel;
-      float accumulator1 = 0.f;
-      float accumulator2 = 0.f;
-      float accumulator3 = 0.f;
-      auto localSource = reinterpret_cast<const T *>(reinterpret_cast<const uint8_t *>(mSource) + y * srcStride);
-      auto kx = static_cast<int>(x);
-      for (; r <= maxKernel; ++r) {
-        float weight = mKernel[halfOfKernel + r];
-        int sourcePX = std::clamp(kx + r, sZero, maxWidth) * 3;
-        accumulator1 += LoadFloat<T>(&localSource[sourcePX]) * weight;
-        accumulator2 += LoadFloat<T>(&localSource[sourcePX + 1]) * weight;
-        accumulator3 += LoadFloat<T>(&localSource[sourcePX + 2]) * weight;
-      }
-      if (!std::is_same<T, hwy::float16_t>::value) {
-        StoreRoundedFloat(&dst[0], accumulator1);
-        StoreRoundedFloat(&dst[1], accumulator2);
-        StoreRoundedFloat(&dst[2], accumulator3);
-      } else {
-        StoreFloat(&dst[0], accumulator1);
-        StoreFloat(&dst[1], accumulator2);
-        StoreFloat(&dst[2], accumulator3);
-      }
-      dst += 3;
-    }
-  }
-}
-
-template<class T, SparkYuvSurfaceChannels Surface,
-    typename std::enable_if<Surface == sparkyuv::SURFACE_CHANNEL, int>::type = 0>
-void
-GaussianBlurHorizontalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride,
-                           T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride,
-                           const uint32_t startY, const uint32_t endY,
-                           const uint32_t width, const uint32_t /* height */,
-                           const float *mKernel, const int kernelSize) {
-  const int halfOfKernel = kernelSize / 2;
-  const bool isEven = kernelSize % 2 == 0;
-  const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel;
-
-  auto mDst = reinterpret_cast<uint8_t *>(mDestination);
-  int maxWidth = static_cast<int>(width) - 1;
-  int sZero = 0;
-
-  for (uint32_t y = startY; y < endY; ++y) {
-    auto dst = reinterpret_cast<T *>(mDst + dstStride * y);
-    for (uint32_t x = 0; x < width; ++x) {
-      int r = -halfOfKernel;
-      float accumulator = 0.f;
-      auto localSource = reinterpret_cast<const T *>(reinterpret_cast<const uint8_t *>(mSource) + y * srcStride);
-      auto kx = static_cast<int>(x);
-      for (; r <= maxKernel; ++r) {
-        accumulator += LoadFloat<T>(&localSource[std::clamp(kx + r, sZero, maxWidth)]) * mKernel[halfOfKernel + r];
-      }
-      if (!std::is_same<T, hwy::float16_t>::value) {
-        StoreRoundedFloat(&dst[0], accumulator);
-      } else {
-        StoreFloat(&dst[0], accumulator);
-      }
-      dst += 1;
-    }
-  }
-}
-
-template<class T, SparkYuvSurfaceChannels Surface,
-    typename std::enable_if<Surface == sparkyuv::SURFACE_CHANNEL, int>::type = 0>
-void
-GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride,
-                         T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride,
-                         const uint32_t startY, const uint32_t endY,
-                         const uint32_t width, const uint32_t height,
-                         const float *mKernel, const int kernelSize) {
-  const int halfOfKernel = kernelSize / 2;
-  const bool isEven = kernelSize % 2 == 0;
-  const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel;
-
-  auto mDst = reinterpret_cast<uint8_t *>(mDestination);
-  int64_t maxHeight = static_cast<int>(height) - 1;
-
-  for (uint32_t y = startY; y < endY; ++y) {
-    auto dst = reinterpret_cast<T *>(mDst + dstStride * y);
-    for (uint32_t x = 0; x < width; ++x) {
-      int r = -halfOfKernel;
-      float accumulator = 0.f;
-      auto kx = static_cast<int>(x);
-      for (; r <= maxKernel; ++r) {
-        uint32_t shiftX = std::clamp(static_cast<int64_t>(y) + static_cast<int64_t>(r),
-                                     static_cast<int64_t>(0),
-                                     static_cast<int64_t>(maxHeight));
-        auto localSource = reinterpret_cast<const T *>(reinterpret_cast<const uint8_t *>(mSource) + shiftX * srcStride);
-        // Stupid workaround to avoid errors where hwy f16 not really properly works
-        accumulator += LoadFloat<T>(&localSource[kx]) * mKernel[halfOfKernel + r];
-      }
-      if (!std::is_same<T, hwy::float16_t>::value) {
-        StoreRoundedFloat(&dst[0], accumulator);
-      } else {
-        StoreFloat(&dst[0], accumulator);
-      }
-      dst += 1;
-    }
-  }
-}
-
-template<class T, SparkYuvSurfaceChannels Surface,
-    typename std::enable_if<Surface == sparkyuv::SURFACE_CHANNELS_3, int>::type = 0>
-void
-GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride,
-                         T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride,
-                         const uint32_t startY, const uint32_t endY,
-                         const uint32_t width, const uint32_t height,
-                         const float *mKernel, const int kernelSize) {
-  const int halfOfKernel = kernelSize / 2;
-  const bool isEven = kernelSize % 2 == 0;
-  const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel;
-
-  auto mDst = reinterpret_cast<uint8_t *>(mDestination);
-  int64_t maxHeight = static_cast<int>(height) - 1;
-
-  for (uint32_t y = startY; y < endY; ++y) {
-    auto dst = reinterpret_cast<T *>(mDst + dstStride * y);
-    for (uint32_t x = 0; x < width; ++x) {
-      int r = -halfOfKernel;
-      float accumulator = 0.f;
-      float accumulator1 = 0.f;
-      float accumulator2 = 0.f;
-      auto kx = static_cast<int>(x) * 3;
-      for (; r <= maxKernel; ++r) {
-        uint32_t shiftX = std::clamp(static_cast<int64_t>(y) + static_cast<int64_t>(r),
-                                     static_cast<int64_t>(0),
-                                     static_cast<int64_t>(maxHeight));
-        auto localSource = reinterpret_cast<const T *>(reinterpret_cast<const uint8_t *>(mSource) + shiftX * srcStride);
-        float weight = mKernel[halfOfKernel + r];
-        accumulator += LoadFloat<T>(&localSource[kx]) * weight;
-        accumulator1 += LoadFloat<T>(&localSource[kx + 1]) * weight;
-        accumulator2 += LoadFloat<T>(&localSource[kx + 2]) * weight;
-      }
-      if (!std::is_same<T, hwy::float16_t>::value) {
-        StoreRoundedFloat(&dst[0], accumulator);
-        StoreRoundedFloat(&dst[1], accumulator1);
-        StoreRoundedFloat(&dst[2], accumulator2);
-      } else {
-        StoreFloat(&dst[0], accumulator);
-        StoreFloat(&dst[1], accumulator1);
-        StoreFloat(&dst[2], accumulator2);
-      }
-      dst += 3;
-    }
-  }
-}
-
-template<class T, SparkYuvSurfaceChannels Surface,
-    typename std::enable_if<Surface == sparkyuv::SURFACE_CHANNELS_4, int>::type = 0,
-    typename std::enable_if<!std::is_same<T, uint8_t>::value, int>::type = 0,
-    HWY_IF_NOT_F16(T)>
-void
-GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride,
-                         T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride,
-                         const uint32_t startY, const uint32_t endY,
-                         const uint32_t width, const uint32_t height,
-                         const float *mKernel, const int kernelSize) {
-  const int halfOfKernel = kernelSize / 2;
-  const bool isEven = kernelSize % 2 == 0;
-  const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel;
-
-  auto mDst = reinterpret_cast<uint8_t *>(mDestination);
-  int64_t maxHeight = static_cast<int>(height) - 1;
-
-  for (uint32_t y = startY; y < endY; ++y) {
-    auto dst = reinterpret_cast<T *>(mDst + dstStride * y);
-    for (uint32_t x = 0; x < width; ++x) {
-      int r = -halfOfKernel;
-      float accumulator = 0.f;
-      float accumulator1 = 0.f;
-      float accumulator2 = 0.f;
-      float accumulator3 = 0.f;
-      auto kx = static_cast<int>(x) * 4;
-      for (; r <= maxKernel; ++r) {
-        uint32_t shiftX = std::clamp(static_cast<int64_t>(y) + static_cast<int64_t>(r),
-                                     static_cast<int64_t>(0),
-                                     static_cast<int64_t>(maxHeight));
-        auto localSource = reinterpret_cast<const T *>(reinterpret_cast<const uint8_t *>(mSource) + shiftX * srcStride);
-        float weight = mKernel[halfOfKernel + r];
-        // Stupid workaround to avoid errors where hwy f16 not really properly works
-        accumulator += LoadFloat<T>(&localSource[kx]) * weight;
-        accumulator1 += LoadFloat<T>(&localSource[kx + 1]) * weight;
-        accumulator2 += LoadFloat<T>(&localSource[kx + 2]) * weight;
-        accumulator3 += LoadFloat<T>(&localSource[kx + 3]) * weight;
-      }
-      if (!std::is_same<T, hwy::float16_t>::value) {
-        StoreRoundedFloat(&dst[0], accumulator);
-        StoreRoundedFloat(&dst[1], accumulator1);
-        StoreRoundedFloat(&dst[2], accumulator2);
-        StoreRoundedFloat(&dst[3], accumulator3);
-      } else {
-        StoreFloat(&dst[0], accumulator);
-        StoreFloat(&dst[1], accumulator1);
-        StoreFloat(&dst[2], accumulator2);
-        StoreFloat(&dst[3], accumulator3);
-      }
-      dst += 4;
-    }
-  }
-}
-
-template<class T, SparkYuvSurfaceChannels Surface,
-    typename std::enable_if<Surface == sparkyuv::SURFACE_CHANNELS_4, int>::type = 0,
-    typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type = 0>
-void
-GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride,
-                         T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride,
-                         const uint32_t startY, const uint32_t endY,
-                         const uint32_t width, const uint32_t height,
-                         const float *mKernel, const int kernelSize) {
-  const int halfOfKernel = kernelSize / 2;
-  const bool isEven = kernelSize % 2 == 0;
-  const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel;
-
-  auto mDst = reinterpret_cast<uint8_t *>(mDestination);
-  int64_t maxHeight = static_cast<int>(height) - 1;
-
-  const FixedTag<uint8_t, 4> d8x4;
-  const FixedTag<uint32_t, 4> d32;
-  const FixedTag<float, 4> df;
-  using VF = Vec<decltype(df)>;
-
-  for (uint32_t y = startY; y < endY; ++y) {
-    auto dst = reinterpret_cast<T *>(mDst + dstStride * y);
-    for (uint32_t x = 0; x < width; ++x) {
-      int r = -halfOfKernel;
-      VF acc = Zero(df);
-
-      for (; r <= maxKernel; ++r) {
-        uint32_t shiftX = std::clamp(static_cast<int64_t>(y) + static_cast<int64_t>(r),
-                                     static_cast<int64_t>(0),
-                                     static_cast<int64_t>(maxHeight));
-        auto localSource = reinterpret_cast<const T *>(reinterpret_cast<const uint8_t *>(mSource) + shiftX * srcStride);
-        float weight = mKernel[halfOfKernel + r];
-        uint32_t sourcePX = x * 4;
-        auto vx = ConvertTo(df, PromoteTo(d32, LoadU(d8x4, &localSource[sourcePX])));
-        acc = MulAdd(vx, Set(df, weight), acc);
-      }
-      acc = Round(acc);
-      auto newPX = DemoteTo(d8x4, ConvertTo(d32, acc));
-      StoreU(newPX, d8x4, dst);
-      dst += 4;
-    }
-  }
-}
-
-template<class T, SparkYuvSurfaceChannels Surface>
-void
-GaussianBlurImpl(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride,
-                 T *SPARKYUV_RESTRICT mDestination, const uint32_t newStride,
-                 const uint32_t width, const uint32_t height, const int kernelSize,
-                 const float sigma) {
-  const auto kernel = Get1DGaussianKernel(kernelSize, sigma);
-  const auto transient = hwy::AllocateAligned<uint8_t>(newStride * height);
-  const auto threadCount = concurrency::getThreadCounts(width, height);
-  const auto alignedKernel = hwy::AllocateAligned<float>(kernel.size());
-  std::copy(kernel.begin(), kernel.end(), alignedKernel.get());
-  concurrency::parallel_for_segment(threadCount, height, [&](int start, int end) {
-    GaussianBlurHorizontalPass<T, Surface>(mSource,
-                                           srcStride,
-                                           reinterpret_cast<T *>(transient.get()),
-                                           newStride,
-                                           start,
-                                           end,
-                                           width,
-                                           height,
-                                           reinterpret_cast<const float *>(alignedKernel.get()),
-                                           kernel.size());
-  });
-
-  concurrency::parallel_for_segment(threadCount, height, [&](int start, int end) {
-    GaussianBlurVerticalPass<T, Surface>(reinterpret_cast<const T *>(transient.get()),
-                                         newStride,
-                                         mDestination,
-                                         newStride,
-                                         start,
-                                         end,
-                                         width,
-                                         height,
-                                         reinterpret_cast<const float *>(alignedKernel.get()),
-                                         kernel.size());
-  });
-}
-
-#define GAUSSIAN_BLUR_DECLARATION_R(srcPixel, storageType, surfaceType) \
-    void GaussianBlur##srcPixel##HWY(const storageType *SPARKYUV_RESTRICT src, const uint32_t srcStride,\
-                                    storageType *SPARKYUV_RESTRICT dst, const uint32_t dstStride,\
-                                    const uint32_t width, const uint32_t height,  \
-                                    const int kernelSize, const float sigma) {\
-        GaussianBlurImpl<storageType, sparkyuv::SURFACE_##surfaceType>(src, srcStride, dst, dstStride,\
-                                                                       width, height, kernelSize, sigma); \
-    }
-
-GAUSSIAN_BLUR_DECLARATION_R(RGBA, uint8_t, CHANNELS_4)
-GAUSSIAN_BLUR_DECLARATION_R(RGB, uint8_t, CHANNELS_3)
-GAUSSIAN_BLUR_DECLARATION_R(Channel, uint8_t, CHANNEL)
-
-GAUSSIAN_BLUR_DECLARATION_R(RGBA16, uint16_t, CHANNELS_4)
-GAUSSIAN_BLUR_DECLARATION_R(RGB16, uint16_t, CHANNELS_3)
-GAUSSIAN_BLUR_DECLARATION_R(Channel16, uint16_t, CHANNEL)
-
-GAUSSIAN_BLUR_DECLARATION_R(RGBAF32, float, CHANNELS_4)
-GAUSSIAN_BLUR_DECLARATION_R(RGBF32, float, CHANNELS_3)
-GAUSSIAN_BLUR_DECLARATION_R(ChannelF32, float, CHANNEL)
-
-#undef GAUSSIAN_BLUR_DECLARATION_R
-
-#define GAUSSIAN_BLUR_DECLARATION_R_F16(srcPixel, surfaceType) \
-    void GaussianBlur##srcPixel##HWY(const uint16_t *SPARKYUV_RESTRICT src, const uint32_t srcStride,\
-                                    uint16_t *SPARKYUV_RESTRICT dst, const uint32_t dstStride,\
-                                    const uint32_t width, const uint32_t height,  \
-                                    const int kernelSize, const float sigma) {\
-        GaussianBlurImpl<hwy::float16_t, sparkyuv::SURFACE_##surfaceType>(reinterpret_cast<const hwy::float16_t*>(src), \
-            srcStride, reinterpret_cast<hwy::float16_t*>(dst), dstStride, width, height, kernelSize, sigma); \
-    }
-
-GAUSSIAN_BLUR_DECLARATION_R_F16(RGBAF16, CHANNELS_4)
-GAUSSIAN_BLUR_DECLARATION_R_F16(RGBF16, CHANNELS_3)
-GAUSSIAN_BLUR_DECLARATION_R_F16(ChannelF16, CHANNEL)
-
-#undef GAUSSIAN_BLUR_DECLARATION_R_F16
-
-}
-HWY_AFTER_NAMESPACE();
-
-#endif
\ No newline at end of file
diff --git a/src/GaussianBlur.cpp b/src/GaussianBlur.cpp
deleted file mode 100644
index 3a7c46d..0000000
--- a/src/GaussianBlur.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright (C) 2024 Radzivon Bartoshyk
-//
-// This file belongs to sparkyuv project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "sparkyuv.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "src/GaussianBlur.cpp"
-
-#include "hwy/foreach_target.h"
-#include "hwy/highway.h"
-#include "yuv-inl.h"
-#include "GaussianBlur-inl.h"
-
-#if HWY_ONCE
-namespace sparkyuv {
-#define GAUSSIAN_BLUR_DECLARE_EXPORT(srcPixel) HWY_EXPORT(GaussianBlur##srcPixel##HWY);
-GAUSSIAN_BLUR_DECLARE_EXPORT(RGBA)
-GAUSSIAN_BLUR_DECLARE_EXPORT(RGB)
-GAUSSIAN_BLUR_DECLARE_EXPORT(Channel)
-GAUSSIAN_BLUR_DECLARE_EXPORT(RGBA16)
-GAUSSIAN_BLUR_DECLARE_EXPORT(RGB16)
-GAUSSIAN_BLUR_DECLARE_EXPORT(Channel16)
-GAUSSIAN_BLUR_DECLARE_EXPORT(RGBAF16)
-GAUSSIAN_BLUR_DECLARE_EXPORT(RGBF16)
-GAUSSIAN_BLUR_DECLARE_EXPORT(ChannelF16)
-GAUSSIAN_BLUR_DECLARE_EXPORT(RGBAF32)
-GAUSSIAN_BLUR_DECLARE_EXPORT(RGBF32)
-GAUSSIAN_BLUR_DECLARE_EXPORT(ChannelF32)
-#undef GAUSSIAN_BLUR_DECLARE_EXPORT
-
-#define GAUSSIAN_BLUR_DECLARATION_E(srcPixel, storageType) \
-    void GaussianBlur##srcPixel(const storageType *SPARKYUV_RESTRICT src, const uint32_t srcStride,\
-                                storageType *SPARKYUV_RESTRICT dst, const uint32_t dstStride,\
-                                const uint32_t width, const uint32_t height,  \
-                                const int kernelSize, const float sigma) {\
-        HWY_DYNAMIC_DISPATCH(GaussianBlur##srcPixel##HWY)(src, srcStride, dst, dstStride,\
-                                                          width, height, kernelSize, sigma); \
-    }
-
-GAUSSIAN_BLUR_DECLARATION_E(RGBA, uint8_t)
-GAUSSIAN_BLUR_DECLARATION_E(RGB, uint8_t)
-GAUSSIAN_BLUR_DECLARATION_E(Channel, uint8_t)
-
-GAUSSIAN_BLUR_DECLARATION_E(RGBA16, uint16_t)
-GAUSSIAN_BLUR_DECLARATION_E(RGB16, uint16_t)
-GAUSSIAN_BLUR_DECLARATION_E(Channel16, uint16_t)
-
-GAUSSIAN_BLUR_DECLARATION_E(RGBAF32, float)
-GAUSSIAN_BLUR_DECLARATION_E(RGBF32, float)
-GAUSSIAN_BLUR_DECLARATION_E(ChannelF32, float)
-
-#undef GAUSSIAN_BLUR_DECLARATION_E
-
-#define GAUSSIAN_BLUR_DECLARATION_R_F16(srcPixel, surfaceType) \
-    void GaussianBlur##srcPixel(const uint16_t *SPARKYUV_RESTRICT src, const uint32_t srcStride,\
-                                uint16_t *SPARKYUV_RESTRICT dst, const uint32_t dstStride,\
-                                const uint32_t width, const uint32_t height,  \
-                                const int kernelSize, const float sigma) {\
-        HWY_DYNAMIC_DISPATCH(GaussianBlur##srcPixel##HWY)(src, srcStride, dst, dstStride, \
-          width, height, kernelSize, sigma); \
-    }
-
-GAUSSIAN_BLUR_DECLARATION_R_F16(RGBAF16, CHANNELS_4)
-GAUSSIAN_BLUR_DECLARATION_R_F16(RGBF16, CHANNELS_3)
-GAUSSIAN_BLUR_DECLARATION_R_F16(ChannelF16, CHANNEL)
-
-#undef GAUSSIAN_BLUR_DECLARATION_R_F16
-}
-#endif
\ No newline at end of file
diff --git a/src/Scale.cpp b/src/Scale.cpp
deleted file mode 100644
index 2cb8a94..0000000
--- a/src/Scale.cpp
+++ /dev/null
@@ -1,729 +0,0 @@
-// Copyright (C) 2024 Radzivon Bartoshyk
-//
-// This file belongs to sparkyuv project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "src/Scale.cpp"
-
-#include "hwy/foreach_target.h"
-#include "hwy/highway.h"
-#include "yuv-inl.h"
-#include "sampler/NearestRowSampler-inl.hpp"
-#include "sampler/BilinearRowSampler-inl.hpp"
-#include "sampler/Window4RowSampler-inl.hpp"
-#include "sampler/Window6RowSampler-inl.hpp"
-#include "sampler/BoxRowSampler-inl.h"
-#include "concurrency.hpp"
-
-HWY_BEFORE_NAMESPACE();
-namespace sparkyuv::HWY_NAMESPACE {
-
-template<int Components>
-void ScaleRGB16OrChannelHWY(const uint16_t *input, const uint32_t srcStride,
-                            uint32_t inputWidth, uint32_t inputHeight,
-                            uint16_t *output, const uint32_t dstStride,
-                            uint32_t outputWidth, uint32_t outputHeight,
-                            const int depth, const SparkYuvSampler option) {
-
-  auto src = reinterpret_cast<const uint16_t *>(input);
-
-  std::unique_ptr<ScaleRowSampler<uint16_t>> sampler;
-  switch (option) {
-    case box: {
-      sampler = std::make_unique<BoxSampler<uint16_t, sparkyuv::BOX_UINT16, Components>>(src,
-                                                                                         srcStride,
-                                                                                         inputWidth,
-                                                                                         inputHeight,
-                                                                                         output,
-                                                                                         dstStride,
-                                                                                         outputWidth,
-                                                                                         outputHeight);
-    }
-      break;
-    case hermite: {
-      sampler = std::make_unique<WeightedWindow4RowSampler16Bit<WEIGHTED_ROW4_HERMITE, Components>>(src,
-                                                                                                    srcStride,
-                                                                                                    inputWidth,
-                                                                                                    inputHeight,
-                                                                                                    output,
-                                                                                                    dstStride,
-                                                                                                    outputWidth,
-                                                                                                    outputHeight,
-                                                                                                    depth);
-    }
-      break;
-    case catmullRom: {
-      sampler = std::make_unique<WeightedWindow4RowSampler16Bit<WEIGHTED_ROW4_CATMULL_ROM, Components>>(src,
-                                                                                                        srcStride,
-                                                                                                        inputWidth,
-                                                                                                        inputHeight,
-                                                                                                        output,
-                                                                                                        dstStride,
-                                                                                                        outputWidth,
-                                                                                                        outputHeight,
-                                                                                                        depth);
-    }
-      break;
-    case bSpline: {
-      sampler = std::make_unique<WeightedWindow4RowSampler16Bit<WEIGHTED_ROW4_BSPLINE, Components>>(src,
-                                                                                                    srcStride,
-                                                                                                    inputWidth,
-                                                                                                    inputHeight,
-                                                                                                    output,
-                                                                                                    dstStride,
-                                                                                                    outputWidth,
-                                                                                                    outputHeight,
-                                                                                                    depth);
-    }
-      break;
-    case cubic: {
-      sampler = std::make_unique<WeightedWindow4RowSampler16Bit<WEIGHTED_ROW4_CUBIC, Components>>(src,
-                                                                                                  srcStride,
-                                                                                                  inputWidth,
-                                                                                                  inputHeight,
-                                                                                                  output,
-                                                                                                  dstStride,
-                                                                                                  outputWidth,
-                                                                                                  outputHeight, depth);
-    }
-      break;
-    case bicubic: {
-      sampler = std::make_unique<WeightedWindow4RowSampler16Bit<WEIGHTED_ROW4_BICUBIC, Components>>(src,
-                                                                                                    srcStride,
-                                                                                                    inputWidth,
-                                                                                                    inputHeight,
-                                                                                                    output,
-                                                                                                    dstStride,
-                                                                                                    outputWidth,
-                                                                                                    outputHeight,
-                                                                                                    depth);
-    }
-      break;
-    case mitchell: {
-      sampler = std::make_unique<WeightedWindow4RowSampler16Bit<WEIGHTED_ROW4_MITCHELL, Components>>(src,
-                                                                                                     srcStride,
-                                                                                                     inputWidth,
-                                                                                                     inputHeight,
-                                                                                                     output,
-                                                                                                     dstStride,
-                                                                                                     outputWidth,
-                                                                                                     outputHeight,
-                                                                                                     depth);
-    }
-      break;
-    case lanczos: {
-      sampler = std::make_unique<WeightedWindow6RowSampler16Bit<WEIGHTED_ROW6_LANCZOS_SINC, Components>>(src,
-                                                                                                         srcStride,
-                                                                                                         inputWidth,
-                                                                                                         inputHeight,
-                                                                                                         output,
-                                                                                                         dstStride,
-                                                                                                         outputWidth,
-                                                                                                         outputHeight,
-                                                                                                         depth);
-    }
-      break;
-    case bilinear: {
-      sampler = std::make_unique<BilinearRowSamplerAnyBit<uint16_t, Components>>(src,
-                                                                                 srcStride,
-                                                                                 inputWidth,
-                                                                                 inputHeight,
-                                                                                 output,
-                                                                                 dstStride,
-                                                                                 outputWidth,
-                                                                                 outputHeight);
-    }
-      break;
-    default: {
-      sampler = std::make_unique<NearestRowSampler16Bit<Components>>(src, srcStride,
-                                                                     inputWidth,
-                                                                     inputHeight,
-                                                                     output,
-                                                                     dstStride,
-                                                                     outputWidth,
-                                                                     outputHeight);
-    }
-      break;
-  }
-
-  const int threadCount = std::clamp(std::min(static_cast<int>(std::thread::hardware_concurrency()),
-                                              static_cast<int>(outputHeight * outputWidth / (256 * 256))),
-                                     static_cast<int>(1), static_cast<int>(12));
-
-  concurrency::parallel_for(threadCount, outputHeight, [&](int iterationId) {
-    sampler->sample(iterationId);
-  });
-}
-
-template<int Components>
-void ScaleRGB8OrChannelHWY(const uint8_t *input,
-                           const uint32_t srcStride,
-                           uint32_t inputWidth, uint32_t inputHeight,
-                           uint8_t *output,
-                           const uint32_t dstStride,
-                           uint32_t outputWidth, uint32_t outputHeight,
-                           const SparkYuvSampler option) {
-
-  auto src8 = reinterpret_cast<const uint8_t *>(input);
-
-  std::unique_ptr<ScaleRowSampler<uint8_t>> sampler;
-  switch (option) {
-    case box: {
-      sampler = std::make_unique<BoxSampler<uint8_t, sparkyuv::BOX_UINT8, Components>>(src8,
-                                                                                       srcStride,
-                                                                                       inputWidth,
-                                                                                       inputHeight,
-                                                                                       output,
-                                                                                       dstStride,
-                                                                                       outputWidth,
-                                                                                       outputHeight);
-    }
-      break;
-    case hermite: {
-      sampler = std::make_unique<WeightedWindow4RowSampler<WEIGHTED_ROW4_HERMITE, Components>>(src8,
-                                                                                               srcStride,
-                                                                                               inputWidth,
-                                                                                               inputHeight,
-                                                                                               output,
-                                                                                               dstStride,
-                                                                                               outputWidth,
-                                                                                               outputHeight);
-    }
-      break;
-    case catmullRom: {
-      sampler = std::make_unique<WeightedWindow4RowSampler<WEIGHTED_ROW4_CATMULL_ROM, Components>>(src8,
-                                                                                                   srcStride,
-                                                                                                   inputWidth,
-                                                                                                   inputHeight,
-                                                                                                   output,
-                                                                                                   dstStride,
-                                                                                                   outputWidth,
-                                                                                                   outputHeight);
-    }
-      break;
-    case bSpline: {
-      sampler = std::make_unique<WeightedWindow4RowSampler<WEIGHTED_ROW4_BSPLINE, Components>>(src8,
-                                                                                               srcStride,
-                                                                                               inputWidth,
-                                                                                               inputHeight,
-                                                                                               output,
-                                                                                               dstStride,
-                                                                                               outputWidth,
-                                                                                               outputHeight);
-    }
-      break;
-    case cubic: {
-      sampler = std::make_unique<WeightedWindow4RowSampler<WEIGHTED_ROW4_CUBIC, Components>>(src8,
-                                                                                             srcStride,
-                                                                                             inputWidth,
-                                                                                             inputHeight,
-                                                                                             output,
-                                                                                             dstStride,
-                                                                                             outputWidth,
-                                                                                             outputHeight);
-    }
-      break;
-    case bicubic: {
-      sampler = std::make_unique<WeightedWindow4RowSampler<WEIGHTED_ROW4_BICUBIC, Components>>(src8,
-                                                                                               srcStride,
-                                                                                               inputWidth,
-                                                                                               inputHeight,
-                                                                                               output,
-                                                                                               dstStride,
-                                                                                               outputWidth,
-                                                                                               outputHeight);
-    }
-      break;
-    case mitchell: {
-      sampler = std::make_unique<WeightedWindow4RowSampler<WEIGHTED_ROW4_MITCHELL, Components>>(src8,
-                                                                                                srcStride,
-                                                                                                inputWidth,
-                                                                                                inputHeight,
-                                                                                                output,
-                                                                                                dstStride,
-                                                                                                outputWidth,
-                                                                                                outputHeight);
-    }
-      break;
-    case lanczos: {
-      sampler = std::make_unique<WeightedWindow6RowSampler<WEIGHTED_ROW6_LANCZOS_SINC, Components>>(src8,
-                                                                                                    srcStride,
-                                                                                                    inputWidth,
-                                                                                                    inputHeight,
-                                                                                                    output,
-                                                                                                    dstStride,
-                                                                                                    outputWidth,
-                                                                                                    outputHeight);
-    }
-      break;
-    case bilinear: {
-      if (Components == 4) {
-        sampler = std::make_unique<BilinearRowSampler4Chan8Bit<Components>>(src8,
-                                                                            srcStride,
-                                                                            inputWidth,
-                                                                            inputHeight,
-                                                                            output,
-                                                                            dstStride,
-                                                                            outputWidth,
-                                                                            outputHeight);
-      } else {
-        sampler = std::make_unique<BilinearRowSamplerAnyBit<uint8_t, Components>>(src8,
-                                                                                  srcStride,
-                                                                                  inputWidth,
-                                                                                  inputHeight,
-                                                                                  output,
-                                                                                  dstStride,
-                                                                                  outputWidth,
-                                                                                  outputHeight);
-      }
-    }
-      break;
-    default: {
-      sampler = std::make_unique<NearestRowSampler<Components>>(src8, srcStride,
-                                                                inputWidth,
-                                                                inputHeight,
-                                                                output,
-                                                                dstStride,
-                                                                outputWidth,
-                                                                outputHeight);
-    }
-      break;
-  }
-
-  const int threadCount = std::clamp(std::min(static_cast<int>(std::thread::hardware_concurrency()),
-                                              static_cast<int>(outputHeight * outputWidth / (256 * 256))),
-                                     static_cast<int>(1), static_cast<int>(12));
-
-  concurrency::parallel_for(threadCount, outputHeight, [&](int iterationId) {
-    sampler->sample(iterationId);
-  });
-}
-
-void ScaleRGB1010102OrChannelHWY(const uint8_t *input,
-                                 const uint32_t srcStride,
-                                 uint32_t inputWidth, uint32_t inputHeight,
-                                 uint8_t *output,
-                                 const uint32_t dstStride,
-                                 uint32_t outputWidth, uint32_t outputHeight,
-                                 const SparkYuvSampler option) {
-
-  auto src8 = reinterpret_cast<const uint8_t *>(input);
-
-  std::unique_ptr<ScaleRowSampler<uint32_t>> sampler;
-  switch (option) {
-    case box: {
-      sampler =
-          std::make_unique<BoxSampler<uint32_t, sparkyuv::BOX_RGBA1010102, 1>>(reinterpret_cast<const uint32_t *>(src8),
-                                                                               srcStride,
-                                                                               inputWidth,
-                                                                               inputHeight,
-                                                                               reinterpret_cast<uint32_t *>(output),
-                                                                               dstStride,
-                                                                               outputWidth,
-                                                                               outputHeight);
-    }
-      break;
-    case hermite: {
-      sampler =
-          std::make_unique<WeightedWindow4RowSampler10Bit<WEIGHTED_ROW4_HERMITE>>(reinterpret_cast<const uint32_t *>(src8),
-                                                                                  srcStride,
-                                                                                  inputWidth,
-                                                                                  inputHeight,
-                                                                                  reinterpret_cast<uint32_t *>(output),
-                                                                                  dstStride,
-                                                                                  outputWidth,
-                                                                                  outputHeight);
-    }
-      break;
-    case catmullRom: {
-      sampler =
-          std::make_unique<WeightedWindow4RowSampler10Bit<WEIGHTED_ROW4_CATMULL_ROM>>(reinterpret_cast<const uint32_t *>(src8),
-                                                                                      srcStride,
-                                                                                      inputWidth,
-                                                                                      inputHeight,
-                                                                                      reinterpret_cast<uint32_t *>(output),
-                                                                                      dstStride,
-                                                                                      outputWidth,
-                                                                                      outputHeight);
-    }
-      break;
-    case bSpline: {
-      sampler =
-          std::make_unique<WeightedWindow4RowSampler10Bit<WEIGHTED_ROW4_BSPLINE>>(reinterpret_cast<const uint32_t *>(src8),
-                                                                                  srcStride,
-                                                                                  inputWidth,
-                                                                                  inputHeight,
-                                                                                  reinterpret_cast<uint32_t *>(output),
-                                                                                  dstStride,
-                                                                                  outputWidth,
-                                                                                  outputHeight);
-    }
-      break;
-    case cubic: {
-      sampler =
-          std::make_unique<WeightedWindow4RowSampler10Bit<WEIGHTED_ROW4_CUBIC>>(reinterpret_cast<const uint32_t *>(src8),
-                                                                                srcStride,
-                                                                                inputWidth,
-                                                                                inputHeight,
-                                                                                reinterpret_cast<uint32_t *>(output),
-                                                                                dstStride,
-                                                                                outputWidth,
-                                                                                outputHeight);
-    }
-      break;
-    case bicubic: {
-      sampler =
-          std::make_unique<WeightedWindow4RowSampler10Bit<WEIGHTED_ROW4_BICUBIC>>(reinterpret_cast<const uint32_t *>(src8),
-                                                                                  srcStride,
-                                                                                  inputWidth,
-                                                                                  inputHeight,
-                                                                                  reinterpret_cast<uint32_t *>(output),
-                                                                                  dstStride,
-                                                                                  outputWidth,
-                                                                                  outputHeight);
-    }
-      break;
-    case mitchell: {
-      sampler =
-          std::make_unique<WeightedWindow4RowSampler10Bit<WEIGHTED_ROW4_MITCHELL>>(reinterpret_cast<const uint32_t *>(src8),
-                                                                                   srcStride,
-                                                                                   inputWidth,
-                                                                                   inputHeight,
-                                                                                   reinterpret_cast<uint32_t *>(output),
-                                                                                   dstStride,
-                                                                                   outputWidth,
-                                                                                   outputHeight);
-    }
-      break;
-    case lanczos: {
-      sampler =
-          std::make_unique<WeightedWindow6RowSampler10Bit<WEIGHTED_ROW6_LANCZOS_SINC>>(reinterpret_cast<const uint32_t *>(src8),
-                                                                                       srcStride,
-                                                                                       inputWidth,
-                                                                                       inputHeight,
-                                                                                       reinterpret_cast<uint32_t *>(output),
-                                                                                       dstStride,
-                                                                                       outputWidth,
-                                                                                       outputHeight);
-    }
-      break;
-    case bilinear: {
-      sampler = std::make_unique<BilinearRowSampler10Bit>(reinterpret_cast<const uint32_t *>(src8),
-                                                          srcStride,
-                                                          inputWidth,
-                                                          inputHeight,
-                                                          reinterpret_cast<uint32_t *>(output),
-                                                          dstStride,
-                                                          outputWidth,
-                                                          outputHeight);
-    }
-      break;
-    default: {
-      sampler = std::make_unique<NearestRowSampler10Bit>(reinterpret_cast<const uint32_t *>(src8),
-                                                         srcStride,
-                                                         inputWidth,
-                                                         inputHeight,
-                                                         reinterpret_cast<uint32_t *>(output),
-                                                         dstStride,
-                                                         outputWidth,
-                                                         outputHeight);
-    }
-      break;
-  }
-
-  const int threadCount = std::clamp(std::min(static_cast<int>(std::thread::hardware_concurrency()),
-                                              static_cast<int>(outputHeight * outputWidth / (256 * 256))),
-                                     static_cast<int>(1), static_cast<int>(12));
-
-  concurrency::parallel_for(threadCount, outputHeight, [&](int iterationId) {
-    sampler->sample(iterationId);
-  });
-}
-
-template<int Components>
-void ScaleRGB16FOrChannelHWY(const uint16_t *input,
-                             const uint32_t srcStride,
-                             uint32_t inputWidth, uint32_t inputHeight,
-                             uint16_t *output, const uint32_t dstStride,
-                             uint32_t outputWidth, uint32_t outputHeight,
-                             const SparkYuvSampler option) {
-
-  auto src8 = reinterpret_cast<const uint16_t *>(input);
-
-  std::unique_ptr<ScaleRowSampler<uint16_t>> sampler;
-  switch (option) {
-    case box: {
-      sampler = std::make_unique<BoxSampler<uint16_t, sparkyuv::BOX_FLOAT16, Components>>(src8,
-                                                                                          srcStride,
-                                                                                          inputWidth,
-                                                                                          inputHeight,
-                                                                                          output,
-                                                                                          dstStride,
-                                                                                          outputWidth,
-                                                                                          outputHeight);
-    }
-      break;
-    case hermite: {
-      sampler = std::make_unique<WeightedWindow4RowSamplerF16Bit<WEIGHTED_ROW4_HERMITE, Components>>(src8,
-                                                                                                     srcStride,
-                                                                                                     inputWidth,
-                                                                                                     inputHeight,
-                                                                                                     output,
-                                                                                                     dstStride,
-                                                                                                     outputWidth,
-                                                                                                     outputHeight);
-    }
-      break;
-    case catmullRom: {
-      sampler = std::make_unique<WeightedWindow4RowSamplerF16Bit<WEIGHTED_ROW4_CATMULL_ROM, Components>>(src8,
-                                                                                                         srcStride,
-                                                                                                         inputWidth,
-                                                                                                         inputHeight,
-                                                                                                         output,
-                                                                                                         dstStride,
-                                                                                                         outputWidth,
-                                                                                                         outputHeight);
-    }
-      break;
-    case bSpline: {
-      sampler = std::make_unique<WeightedWindow4RowSamplerF16Bit<WEIGHTED_ROW4_BSPLINE, Components>>(src8,
-                                                                                                     srcStride,
-                                                                                                     inputWidth,
-                                                                                                     inputHeight,
-                                                                                                     output,
-                                                                                                     dstStride,
-                                                                                                     outputWidth,
-                                                                                                     outputHeight);
-    }
-      break;
-    case cubic: {
-      sampler = std::make_unique<WeightedWindow4RowSamplerF16Bit<WEIGHTED_ROW4_CUBIC, Components>>(src8,
-                                                                                                   srcStride,
-                                                                                                   inputWidth,
-                                                                                                   inputHeight,
-                                                                                                   output,
-                                                                                                   dstStride,
-                                                                                                   outputWidth,
-                                                                                                   outputHeight);
-    }
-      break;
-    case bicubic: {
-      sampler = std::make_unique<WeightedWindow4RowSamplerF16Bit<WEIGHTED_ROW4_BICUBIC, Components>>(src8,
-                                                                                                     srcStride,
-                                                                                                     inputWidth,
-                                                                                                     inputHeight,
-                                                                                                     output,
-                                                                                                     dstStride,
-                                                                                                     outputWidth,
-                                                                                                     outputHeight);
-    }
-      break;
-    case mitchell: {
-      sampler = std::make_unique<WeightedWindow4RowSamplerF16Bit<WEIGHTED_ROW4_MITCHELL, Components>>(src8,
-                                                                                                      srcStride,
-                                                                                                      inputWidth,
-                                                                                                      inputHeight,
-                                                                                                      output,
-                                                                                                      dstStride,
-                                                                                                      outputWidth,
-                                                                                                      outputHeight);
-    }
-      break;
-    case lanczos: {
-      sampler = std::make_unique<WeightedWindow6RowSamplerF16Bit<WEIGHTED_ROW6_LANCZOS_SINC, Components>>(src8,
-                                                                                                          srcStride,
-                                                                                                          inputWidth,
-                                                                                                          inputHeight,
-                                                                                                          output,
-                                                                                                          dstStride,
-                                                                                                          outputWidth,
-                                                                                                          outputHeight);
-    }
-      break;
-    case bilinear: {
-      sampler = std::make_unique<BilinearRowSamplerF16Bit<Components>>(src8,
-                                                                       srcStride,
-                                                                       inputWidth,
-                                                                       inputHeight,
-                                                                       output,
-                                                                       dstStride,
-                                                                       outputWidth,
-                                                                       outputHeight);
-    }
-      break;
-    default: {
-      sampler = std::make_unique<NearestRowSampler16Bit<Components>>(src8, srcStride,
-                                                                     inputWidth,
-                                                                     inputHeight,
-                                                                     output,
-                                                                     dstStride,
-                                                                     outputWidth,
-                                                                     outputHeight);
-    }
-      break;
-  }
-
-  const int threadCount = std::clamp(std::min(static_cast<int>(std::thread::hardware_concurrency()),
-                                              static_cast<int>(outputHeight * outputWidth / (256 * 256))),
-                                     static_cast<int>(1), static_cast<int>(12));
-
-  concurrency::parallel_for(threadCount, outputHeight, [&](int iterationId) {
-    sampler->sample(iterationId);
-  });
-}
-
-#define SCALE_CHANNEL_16_TYPE(channelName, channelsCount) \
-      void Scale##channelName##HWY(const uint16_t *input, const uint32_t srcStride,\
-                                   uint32_t inputWidth, uint32_t inputHeight,\
-                                   uint16_t *output, const uint32_t dstStride,\
-                                   uint32_t outputWidth, uint32_t outputHeight,    \
-                                   const int depth, const SparkYuvSampler option) {\
-      ScaleRGB16OrChannelHWY<channelsCount>(input, srcStride, inputWidth, inputHeight, output, dstStride, outputWidth, outputHeight, depth, option);\
-      }
-
-SCALE_CHANNEL_16_TYPE(Channel16, 1)
-SCALE_CHANNEL_16_TYPE(RGB16, 3)
-SCALE_CHANNEL_16_TYPE(RGBA16, 4)
-
-#undef SCALE_CHANNEL_16_TYPE
-
-#define SCALE_CHANNEL_TYPE(channelName, channelsCount) \
-      void Scale##channelName##HWY(const uint8_t *input, const uint32_t srcStride,\
-                                   uint32_t inputWidth, uint32_t inputHeight,\
-                                   uint8_t *output,\
-                                   const uint32_t dstStride,\
-                                   uint32_t outputWidth, uint32_t outputHeight,\
-                                   const SparkYuvSampler option) {\
-      ScaleRGB8OrChannelHWY<channelsCount>(input, srcStride, inputWidth, inputHeight, output, dstStride, outputWidth, outputHeight, option);\
-      }
-
-SCALE_CHANNEL_TYPE(Channel, 1)
-SCALE_CHANNEL_TYPE(RGB, 3)
-SCALE_CHANNEL_TYPE(RGBA, 4)
-
-#undef SCALE_CHANNEL_TYPE
-
-void ScaleRGBA1010102HWY(const uint8_t *input, const uint32_t srcStride,
-                         uint32_t inputWidth, uint32_t inputHeight,
-                         uint8_t *output,
-                         const uint32_t dstStride,
-                         uint32_t outputWidth, uint32_t outputHeight,
-                         const SparkYuvSampler option) {
-  ScaleRGB1010102OrChannelHWY(input, srcStride,
-                              inputWidth, inputHeight,
-                              output, dstStride, outputWidth, outputHeight, option);
-
-}
-
-#define SCALE_CHANNEL_F16_TYPE(channelName, channelsCount) \
-      void Scale##channelName##HWY(const uint16_t *input, const uint32_t srcStride,\
-                                   uint32_t inputWidth, uint32_t inputHeight,\
-                                   uint16_t *output,\
-                                   const uint32_t dstStride,\
-                                   uint32_t outputWidth, uint32_t outputHeight,\
-                                   const SparkYuvSampler option) {\
-      ScaleRGB16FOrChannelHWY<channelsCount>(input, srcStride, inputWidth, inputHeight, output, dstStride, outputWidth, outputHeight, option);\
-      }
-
-SCALE_CHANNEL_F16_TYPE(ChannelF16, 1)
-SCALE_CHANNEL_F16_TYPE(RGBF16, 3)
-SCALE_CHANNEL_F16_TYPE(RGBAF16, 4)
-
-#undef SCALE_CHANNEL_F16_TYPE
-}
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace sparkyuv {
-#define SCALE_CHANNEL_TYPE_DECLARE_HWY(channelName) HWY_EXPORT(Scale##channelName##HWY);
-
-SCALE_CHANNEL_TYPE_DECLARE_HWY(Channel)
-SCALE_CHANNEL_TYPE_DECLARE_HWY(RGB)
-SCALE_CHANNEL_TYPE_DECLARE_HWY(RGBA)
-
-SCALE_CHANNEL_TYPE_DECLARE_HWY(Channel16)
-SCALE_CHANNEL_TYPE_DECLARE_HWY(RGB16)
-SCALE_CHANNEL_TYPE_DECLARE_HWY(RGBA16)
-
-SCALE_CHANNEL_TYPE_DECLARE_HWY(ChannelF16)
-SCALE_CHANNEL_TYPE_DECLARE_HWY(RGBF16)
-SCALE_CHANNEL_TYPE_DECLARE_HWY(RGBAF16)
-
-#undef SCALE_CHANNEL_TYPE_DECLARE_HWY
-
-#define SCALE_CHANNEL_DECLARE_E(channelName, channelsCount) \
-      HWY_DLLEXPORT void Scale##channelName(const uint8_t *input, const uint32_t srcStride,\
-                              uint32_t inputWidth, uint32_t inputHeight,\
-                              uint8_t *output,\
-                              const uint32_t dstStride,\
-                              uint32_t outputWidth, uint32_t outputHeight,\
-                              const SparkYuvSampler option) {\
-        HWY_DYNAMIC_DISPATCH(Scale##channelName##HWY)(input, srcStride, inputWidth, inputHeight, \
-                              output, dstStride, outputWidth, outputHeight, option);\
-      }
-
-SCALE_CHANNEL_DECLARE_E(Channel, 1)
-SCALE_CHANNEL_DECLARE_E(RGB, 3)
-SCALE_CHANNEL_DECLARE_E(RGBA, 4)
-
-#undef SCALE_CHANNEL_DECLARE_E
-
-#define SCALE_CHANNEL_F16_DECLARE_E(channelName, channelsCount) \
-      HWY_DLLEXPORT void Scale##channelName(const uint16_t *input, const uint32_t srcStride,\
-                              uint32_t inputWidth, uint32_t inputHeight,\
-                              uint16_t *output, const uint32_t dstStride,\
-                              uint32_t outputWidth, uint32_t outputHeight,\
-                              const SparkYuvSampler option) {\
-        HWY_DYNAMIC_DISPATCH(Scale##channelName##HWY)(input, srcStride, inputWidth, inputHeight, \
-                              output, dstStride, outputWidth, outputHeight, option);\
-      }
-
-SCALE_CHANNEL_F16_DECLARE_E(ChannelF16, 1)
-SCALE_CHANNEL_F16_DECLARE_E(RGBF16, 3)
-SCALE_CHANNEL_F16_DECLARE_E(RGBAF16, 4)
-
-#undef SCALE_CHANNEL_F16_DECLARE_E
-
-HWY_EXPORT(ScaleRGBA1010102HWY);
-
-HWY_DLLEXPORT void ScaleRGBA1010102(const uint8_t *input, const uint32_t srcStride,
-                                    uint32_t inputWidth, uint32_t inputHeight,
-                                    uint8_t *output, const uint32_t dstStride,
-                                    uint32_t outputWidth, uint32_t outputHeight,
-                                    const SparkYuvSampler option) {
-  HWY_DYNAMIC_DISPATCH(ScaleRGBA1010102HWY)(input, srcStride,
-                                            inputWidth, inputHeight,
-                                            output, dstStride, outputWidth, outputHeight, option);
-
-}
-
-#define SCALE_CHANNEL_16_TYPE_E(channelName) \
-      void Scale##channelName(const uint16_t *input, const uint32_t srcStride,\
-                              uint32_t inputWidth, uint32_t inputHeight,\
-                              uint16_t *output, const uint32_t dstStride,\
-                              uint32_t outputWidth, uint32_t outputHeight,    \
-                              const int depth, const SparkYuvSampler option) {\
-        HWY_DYNAMIC_DISPATCH(Scale##channelName##HWY)(input, srcStride, inputWidth, inputHeight, output, dstStride, \
-                outputWidth, outputHeight, depth, option);\
-      }
-
-SCALE_CHANNEL_16_TYPE_E(Channel16)
-SCALE_CHANNEL_16_TYPE_E(RGB16)
-SCALE_CHANNEL_16_TYPE_E(RGBA16)
-
-#undef SCALE_CHANNEL_16_TYPE
-
-}
-#endif
\ No newline at end of file
diff --git a/src/concurrency.hpp b/src/concurrency.hpp
index e27c0e1..5a023fa 100644
--- a/src/concurrency.hpp
+++ b/src/concurrency.hpp
@@ -152,7 +152,7 @@ void parallel_for_segment(const int numThreads, const uint32_t numIterations, Fu
 #if THREADS_SUPPORTED
   std::vector<std::thread> threads;
 
-  int segmentHeight = numIterations / static_cast<uint32_t >(numThreads);
+  auto segmentHeight = static_cast<uint32_t >(numIterations) / static_cast<uint32_t >(numThreads);
 
   auto parallelWorker = [&](int start, int end) {
     std::invoke(func, start, end, std::forward<Args>(args)...);
@@ -170,8 +170,8 @@ void parallel_for_segment(const int numThreads, const uint32_t numIterations, Fu
     }
   }
 
-  int start = 0;
-  int end = segmentHeight;
+  uint32_t start = 0;
+  uint32_t end = segmentHeight;
   if (numThreads == 1) {
     end = numIterations;
   }
diff --git a/src/math/gaussian.h b/src/math/gaussian.h
deleted file mode 100644
index 984faae..0000000
--- a/src/math/gaussian.h
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (C) 2024 Radzivon Bartoshyk
-//
-// This file belongs to sparkyuv project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef YUV_SRC_MATH_GAUSSIAN_H_
-#define YUV_SRC_MATH_GAUSSIAN_H_
-
-#include <vector>
-#include <cmath>
-#include <algorithm>
-
-#ifdef _MSC_VER
-#define _USE_MATH_DEFINES
-#include <cmath>
-#endif
-#include <cmath>
-
-#ifndef M_PI_F
-#define M_PI_F 3.14159265358979323846
-#endif
-
-namespace {
-using namespace std;
-vector<float> Get1DGaussianKernel(int width, float sigma) {
-  vector<float> kernel(ceil(width));
-  int mean = width / 2;
-  float sum = 0.f;
-  const float scale = 1.f / (::sqrtf(2.f * M_PI_F) * sigma);
-  for (int x = 0; x < width; x++) {
-    kernel[x] = ::expf(-0.5f * ::powf(static_cast<float>(x - mean) / sigma, 2.0f)) * scale;
-    sum += kernel[x];
-  }
-  if (sum != 0.f) {
-    for (int x = 0; x < width; x++)
-      kernel[x] /= sum;
-  }
-  return std::move(kernel);
-}
-}
-
-#endif //YUV_SRC_MATH_GAUSSIAN_H_
diff --git a/src/sampler/BilinearRowSampler-inl.hpp b/src/sampler/BilinearRowSampler-inl.hpp
deleted file mode 100644
index 7ad91f3..0000000
--- a/src/sampler/BilinearRowSampler-inl.hpp
+++ /dev/null
@@ -1,481 +0,0 @@
-// Copyright (C) 2024 Radzivon Bartoshyk
-//
-// This file belongs to sparkyuv project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(SPARKYUV_BILINEAR_ROW_SAMPLER) == defined(HWY_TARGET_TOGGLE)
-#ifdef SPARKYUV_BILINEAR_ROW_SAMPLER
-#undef SPARKYUV_BILINEAR_ROW_SAMPLER
-#else
-#define SPARKYUV_BILINEAR_ROW_SAMPLER
-#endif
-
-#include <hwy/highway.h>
-#include "ScaleRowSampler.hpp"
-#include "../yuv-inl.h"
-#include "sampler.h"
-#include <cstdint>
-#include <algorithm>
-#include <cmath>
-
-#if HWY_TARGET != HWY_SVE && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
-#define BILINEAR_ENABLE_HWY 1
-#else
-#define BILINEAR_ENABLE_HWY 0
-#endif
-
-#if BILINEAR_ENABLE_HWY
-#include "sampler-inl.h"
-#endif
-
-HWY_BEFORE_NAMESPACE();
-namespace sparkyuv::HWY_NAMESPACE {
-using namespace sparkyuv;
-using namespace hwy;
-using namespace hwy::HWY_NAMESPACE;
-
-template<int Components>
-class BilinearRowSampler4Chan8Bit : public ScaleRowSampler<uint8_t> {
- public:
-  BilinearRowSampler4Chan8Bit(const uint8_t *mSource,
-                              const int srcStride,
-                              const int inputWidth,
-                              const int inputHeight,
-                              uint8_t *mDestination,
-                              const int dstStride,
-                              const int outputWidth,
-                              const int outputHeight) :
-      ScaleRowSampler<uint8_t>(mSource,
-                               srcStride,
-                               inputWidth,
-                               inputHeight,
-                               mDestination,
-                               dstStride,
-                               outputWidth,
-                               outputHeight) {
-
-  }
-
-  ~BilinearRowSampler4Chan8Bit() override = default;
-
-  void sample(const int row) override {
-#if BILINEAR_ENABLE_HWY
-    const FixedTag<float32_t, 4> dfx4;
-    const FixedTag<uint32_t, 4> dix4;
-    const FixedTag<uint8_t, 4> du8x4;
-    using VU8x4 = Vec<decltype(du8x4)>;
-
-    using VI4 = Vec<decltype(dix4)>;
-    using VF4 = Vec<decltype(dfx4)>;
-    const uint32_t shift[4] = {0, 1, 2, 3};
-    const VI4 shiftV = LoadU(dix4, shift);
-    const FixedTag<uint32_t, 4> dux4;
-    const VF4 xScaleV = Set(dfx4, this->xScale);
-    const VF4 yScaleV = Set(dfx4, this->yScale);
-
-    const VI4 maxWidth = Set(dix4, this->inputWidth - 1);
-    const VI4 maxHeight = Set(dix4, this->inputHeight - 1);
-
-    const VI4 addOne = Set(dix4, 1);
-
-    const VF4 vfZeros = Zero(dfx4);
-    const VI4 srcStrideV = Set(dix4, this->srcStride);
-    const VF4 maxColorsV = Set(dfx4, maxColors);
-#endif
-    auto dst8 = reinterpret_cast<uint8_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + row * this->dstStride);
-    auto dst = reinterpret_cast<uint8_t *>(dst8);
-
-    const uint8_t *src8 = this->mSource;
-    const int components = Components;
-
-    uint32_t x = 0;
-
-#if BILINEAR_ENABLE_HWY
-#if !NOACCELERATED_SAMPLER
-    for (; x + 8 < this->outputWidth && components == 4; ++x) {
-      VI4 currentX = Set(dix4, x);
-      VI4 currentXV = Add(currentX, shiftV);
-      VF4 currentXVF = Mul(ConvertTo(dfx4, currentXV), xScaleV);
-      VF4 currentYVF = Mul(ConvertTo(dfx4, Set(dix4, row)), yScaleV);
-
-      VI4 xi1 = ConvertTo(dix4, Floor(currentXVF));
-      VI4 yi1 = Min(ConvertTo(dix4, Floor(currentYVF)), maxHeight);
-
-      VI4 xi2 = Min(Add(xi1, addOne), maxWidth);
-      VI4 yi2 = Min(Add(yi1, addOne), maxHeight);
-
-      VF4 dx = Max(Sub(currentXVF, ConvertTo(dfx4, xi1)), vfZeros);
-      VF4 dy = Max(Sub(currentYVF, ConvertTo(dfx4, yi1)), vfZeros);
-
-      VI4 row1Add = Mul(yi1, srcStrideV);
-      VI4 row2Add = Mul(yi2, srcStrideV);
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-      for (int i = 0; i < 4; i++) {
-        auto row1 = reinterpret_cast<const uint8_t *>(src8 + ExtractLane(row1Add, i));
-        auto row2 = reinterpret_cast<const uint8_t *>(src8 + ExtractLane(row2Add, i));
-
-        VU8x4 lane = LoadU(du8x4, reinterpret_cast<const uint8_t *>(&row1[ExtractLane(xi1, i) * components]));
-        VF4 c1 = PromoteTo(dfx4, lane);
-        lane = LoadU(du8x4, reinterpret_cast<const uint8_t *>(&row1[ExtractLane(xi2, i) * components]));
-        VF4 c2 = PromoteTo(dfx4, lane);
-        lane = LoadU(du8x4, reinterpret_cast<const uint8_t *>(&row2[ExtractLane(xi1, i) * components]));
-        VF4 c3 = PromoteTo(dfx4, lane);
-        lane = LoadU(du8x4, reinterpret_cast<const uint8_t *>(&row2[ExtractLane(xi2, i) * components]));
-        VF4 c4 = PromoteTo(dfx4, lane);
-        VF4 value = Blerp(dfx4, c1, c2, c3, c4, Set(dfx4, ExtractLane(dx, i)),
-                          Set(dfx4, ExtractLane(dy, i)));
-        VF4 sum = Clamp(Round(value), vfZeros, maxColorsV);
-        VU8x4 pixel = DemoteTo(du8x4, ConvertTo(dux4, sum));
-        auto u8Store = &dst[ExtractLane(currentXV, i) * components];
-        StoreU(pixel, du8x4, u8Store);
-      }
-
-      x += components - 1;
-    }
-#endif
-#endif
-
-    for (; x < this->outputWidth; ++x) {
-      const float srcX = (float) x * this->xScale;
-      const float srcY = (float) row * this->yScale;
-
-      const int x1 = static_cast<int>(std::floor(srcX));
-      const int y1 = static_cast<int>(std::floor(srcY));
-
-      int x2 = std::min(x1 + 1, this->inputWidth - 1);
-      int y2 = std::min(y1 + 1, this->inputHeight - 1);
-
-      float dx = std::max((float) srcX - (float) x1, 0.0f);
-      float dy = std::max((float) srcY - (float) y1, 0.0f);
-
-      auto row1 = reinterpret_cast<const uint8_t *>(src8 + y1 * this->srcStride);
-      auto row2 = reinterpret_cast<const uint8_t *>(src8 + y2 * this->srcStride);
-
-      for (int c = 0; c < components; ++c) {
-        auto c1 = static_cast<float>(row1[x1 * components + c]);
-        auto c2 = static_cast<float>(row1[x2 * components + c]);
-        auto c3 = static_cast<float>(row2[x1 * components + c]);
-        auto c4 = static_cast<float>(row2[x2 * components + c]);
-
-        float result = blerp(c1, c2, c3, c4, dx, dy);
-        float f = result;
-        f = std::clamp(::roundf(f), 0.0f, maxColors);
-        dst[x * components + c] = static_cast<uint8_t>(f);
-      }
-    }
-  }
-
- private:
-  const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f;
-};
-
-template<int Components>
-class BilinearRowSamplerF16Bit : public ScaleRowSampler<uint16_t> {
- public:
-  BilinearRowSamplerF16Bit(const uint16_t *mSource,
-                           const int srcStride,
-                           const int inputWidth,
-                           const int inputHeight,
-                           uint16_t *mDestination,
-                           const int dstStride,
-                           const int outputWidth,
-                           const int outputHeight) :
-      ScaleRowSampler<uint16_t>(mSource,
-                                srcStride,
-                                inputWidth,
-                                inputHeight,
-                                mDestination,
-                                dstStride,
-                                outputWidth,
-                                outputHeight) {
-
-  }
-
-  ~BilinearRowSamplerF16Bit() override = default;
-
-  void sample(const int y) override {
-#if BILINEAR_ENABLE_HWY
-    const FixedTag<float32_t, 4> dfx4;
-    const FixedTag<int32_t, 4> dix4;
-    const FixedTag<hwy::float16_t, 4> df16x4;
-    using VI4 = Vec<decltype(dix4)>;
-    using VF4 = Vec<decltype(dfx4)>;
-    using VF16x4 = Vec<decltype(df16x4)>;
-
-    const int shift[4] = {0, 1, 2, 3};
-    const VI4 shiftV = LoadU(dix4, shift);
-    const VF4 xScaleV = Set(dfx4, this->xScale);
-    const VF4 yScaleV = Set(dfx4, this->yScale);
-    const VI4 addOne = Set(dix4, 1);
-    const VF4 fOneV = Set(dfx4, 1.0f);
-    const VI4 maxWidth = Set(dix4, this->inputWidth - 1);
-    const VI4 maxHeight = Set(dix4, this->inputHeight - 1);
-    const VF4 vfZeros = Zero(dfx4);
-    const VI4 srcStrideV = Set(dix4, this->srcStride);
-#endif
-
-    const auto src8 = reinterpret_cast<const uint8_t *>(this->mSource);
-    auto dst16 = reinterpret_cast<uint16_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + y * this->dstStride);
-
-    const int components = Components;
-
-    uint32_t x = 0;
-
-#if BILINEAR_ENABLE_HWY
-#if !NOACCELERATED_SAMPLER
-    for (; x + 8 < this->outputWidth && components == 4; ++x) {
-      VI4 currentX = Set(dix4, x);
-      VI4 currentXV = Add(currentX, shiftV);
-      VF4 currentXVF = Mul(ConvertTo(dfx4, currentXV), xScaleV);
-      VF4 currentYVF = Mul(ConvertTo(dfx4, Set(dix4, y)), yScaleV);
-
-      VI4 xi1 = ConvertTo(dix4, Floor(currentXVF));
-      VI4 yi1 = Min(ConvertTo(dix4, Floor(currentYVF)), maxHeight);
-
-      VI4 xi2 = Min(Add(xi1, addOne), maxWidth);
-      VI4 yi2 = Min(Add(yi1, addOne), maxHeight);
-
-      VI4 row1Add = Mul(yi1, srcStrideV);
-      VI4 row2Add = Mul(yi2, srcStrideV);
-
-      VF4 dx = Max(Sub(currentXVF, ConvertTo(dfx4, xi1)), vfZeros);
-      VF4 dy = Max(Sub(currentYVF, ConvertTo(dfx4, yi1)), vfZeros);
-
-      #if defined(__clang__)
-      #pragma clang loop unroll(full)
-      #endif
-      for (int i = 0; i < 4; i++) {
-        auto row1 = reinterpret_cast<const hwy::float16_t *>(src8 + ExtractLane(row1Add, i));
-        auto row2 = reinterpret_cast<const hwy::float16_t *>(src8 + ExtractLane(row2Add, i));
-        VF16x4 lane = LoadU(df16x4, &row1[ExtractLane(xi1, i) * components]);
-        VF4 c1 = PromoteTo(dfx4, lane);
-        lane = LoadU(df16x4, &row1[ExtractLane(xi2, i) * components]);
-        VF4 c2 = PromoteTo(dfx4, lane);
-        lane = LoadU(df16x4, &row2[ExtractLane(xi1, i) * components]);
-        VF4 c3 = PromoteTo(dfx4, lane);
-        lane = LoadU(df16x4, &row2[ExtractLane(xi2, i) * components]);
-        VF4 c4 = PromoteTo(dfx4, lane);
-        VF4 value = Blerp(dfx4, c1, c2, c3, c4, Set(dfx4, ExtractLane(dx, i)),
-                          Set(dfx4, ExtractLane(dy, i)));
-        VF16x4 pixel = DemoteTo(df16x4, Max(value, vfZeros));
-        auto u8Store = reinterpret_cast<hwy::float16_t *>(&dst16[ExtractLane(currentXV, i) * components]);
-        StoreU(pixel, df16x4, u8Store);
-      }
-
-      x += components - 1;
-    }
-#endif
-#endif
-
-    for (; x < this->outputWidth; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-
-      int x1 = static_cast<int>(srcX);
-      int y1 = static_cast<int>(srcY);
-
-      int x2 = std::min(x1 + 1, this->inputWidth - 1);
-      int y2 = std::min(y1 + 1, this->inputHeight - 1);
-
-      float dx = static_cast<float>(srcX) - static_cast<float>(x1);
-      float dy = static_cast<float>(srcY) - static_cast<float>(y1);
-
-      auto row1 = reinterpret_cast<const uint16_t *>(src8 + y1 * this->srcStride);
-      auto row2 = reinterpret_cast<const uint16_t *>(src8 + y2 * this->srcStride);
-
-      const int px = x * components;
-
-      for (int c = 0; c < components; ++c) {
-        float c1 = hwy::F32FromF16(hwy::float16_t::FromBits(row1[x1 * components + c]));
-        float c2 = hwy::F32FromF16(hwy::float16_t::FromBits(row1[x2 * components + c]));
-        float c3 = hwy::F32FromF16(hwy::float16_t::FromBits(row2[x1 * components + c]));
-        float c4 = hwy::F32FromF16(hwy::float16_t::FromBits(row2[x2 * components + c]));
-        float result = blerp(c1, c2, c3, c4, dx, dy);
-        dst16[px + c] = hwy::F16FromF32(result).bits;
-      }
-    }
-  }
-
- private:
-  const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f;
-};
-
-template<typename T, int Components>
-class BilinearRowSamplerAnyBit : public ScaleRowSampler<T> {
- public:
-  BilinearRowSamplerAnyBit(const T *mSource,
-                           const int srcStride,
-                           const int inputWidth,
-                           const int inputHeight,
-                           T *mDestination,
-                           const int dstStride,
-                           const int outputWidth,
-                           const int outputHeight) :
-      ScaleRowSampler<T>(mSource,
-                         srcStride,
-                         inputWidth,
-                         inputHeight,
-                         mDestination,
-                         dstStride,
-                         outputWidth,
-                         outputHeight) {
-
-  }
-
-  ~BilinearRowSamplerAnyBit() = default;
-
-  void sample(const int row) {
-    auto dst8 = reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(this->mDestination) + row * this->dstStride);
-    auto dst = reinterpret_cast<T *>(dst8);
-
-    auto *src8 = reinterpret_cast<const uint8_t *>(this->mSource);
-
-    const int components = Components;
-
-    for (int x = 0; x < this->outputWidth; ++x) {
-      const float srcX = (float) x * this->xScale;
-      const float srcY = (float) row * this->yScale;
-
-      const int x1 = static_cast<int>(::floorf(srcX));
-      const int y1 = static_cast<int>(::floorf(srcY));
-
-      int x2 = std::min(x1 + 1, this->inputWidth - 1);
-      int y2 = std::min(y1 + 1, this->inputHeight - 1);
-
-      float dx = std::max((float) srcX - (float) x1, 0.0f);
-      float dy = std::max((float) srcY - (float) y1, 0.0f);
-
-      auto row1 = reinterpret_cast<const T *>(src8 + y1 * this->srcStride);
-      auto row2 = reinterpret_cast<const T *>(src8 + y2 * this->srcStride);
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-      for (int c = 0; c < components; ++c) {
-        auto c1 = static_cast<float>(row1[x1 * components + c]);
-        auto c2 = static_cast<float>(row1[x2 * components + c]);
-        auto c3 = static_cast<float>(row2[x1 * components + c]);
-        auto c4 = static_cast<float>(row2[x2 * components + c]);
-
-        float result = blerp(c1, c2, c3, c4, dx, dy);
-        float f = result;
-        f = std::clamp(::roundf(f), 0.0f, maxColors);
-        dst[0] = static_cast<uint8_t>(f);
-        dst += 1;
-      }
-    }
-  }
-
- private:
-  const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f;
-};
-
-class BilinearRowSampler10Bit : public ScaleRowSampler<uint32_t> {
- public:
-  BilinearRowSampler10Bit(const uint32_t *mSource,
-                          const int srcStride,
-                          const int inputWidth,
-                          const int inputHeight,
-                          uint32_t *mDestination,
-                          const int dstStride,
-                          const int outputWidth,
-                          const int outputHeight) :
-      ScaleRowSampler<uint32_t>(mSource, srcStride, inputWidth, inputHeight,
-                                mDestination, dstStride, outputWidth, outputHeight) {
-
-  }
-
-  ~BilinearRowSampler10Bit() override = default;
-
-  void sample(const int y) override {
-    const auto src8 = reinterpret_cast<const uint8_t *>(this->mSource);
-    auto dst16 = reinterpret_cast<uint32_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + y * this->dstStride);
-    for (int x = 0; x < this->outputWidth; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-
-      int x1 = static_cast<int>(srcX);
-      int y1 = static_cast<int>(srcY);
-
-      int x2 = std::min(x1 + 1, this->inputWidth - 1);
-      int y2 = std::min(y1 + 1, this->inputHeight - 1);
-
-      float dx = static_cast<float>(srcX) - static_cast<float>(x1);
-      float dy = static_cast<float>(srcY) - static_cast<float>(y1);
-
-      auto row1 = reinterpret_cast<const uint32_t *>(src8 + y1 * this->srcStride);
-      auto row2 = reinterpret_cast<const uint32_t *>(src8 + y2 * this->srcStride);
-
-      auto c1 = static_cast<uint32_t>(row1[x1]);
-      auto c2 = static_cast<uint32_t>(row1[x2]);
-      auto c3 = static_cast<uint32_t>(row2[x1]);
-      auto c4 = static_cast<uint32_t>(row2[x2]);
-
-      float r1, g1, b1, a1;
-      float r2, g2, b2, a2;
-      float r3, g3, b3, a3;
-      float r4, g4, b4, a4;
-
-      parseToFloat(c1, r1, g1, b1, a1);
-      parseToFloat(c2, r2, g2, b2, a2);
-      parseToFloat(c3, r3, g3, b3, a3);
-      parseToFloat(c4, r4, g4, b4, a4);
-
-      float rInter = blerp(r1, r2, r3, r4, dx, dy);
-      float gInter = blerp(g1, g2, g3, g4, dx, dy);
-      float bInter = blerp(b1, b2, b3, b4, dx, dy);
-      float aInter = blerp(a1, a2, a3, a4, dx, dy);
-
-      auto R10 = static_cast<uint32_t >(std::clamp(::roundf(rInter * maxColors), 0.0f, (float) maxColors));
-      auto G10 = static_cast<uint32_t >(std::clamp(::roundf(gInter * maxColors), 0.0f, (float) maxColors));
-      auto B10 = static_cast<uint32_t >(std::clamp(::roundf(bInter * maxColors), 0.0f, (float) maxColors));
-      auto A10 = static_cast<uint32_t >(std::clamp(::roundf(aInter * 3.f), 0.0f, 3.0f));
-
-      dst16[0] = (A10 << 30) | (B10 << 20) | (G10 << 10) | R10;
-      dst16 += 1;
-    }
-  }
-
- private:
-  const float maxColors = ::powf(2.0f, (float) 10.f) - 1.0f;
-
-  static inline void parseToFloat(const uint32_t rgba1010102, float &r, float &g, float &b, float &a) {
-    const uint32_t scalarMask = (1u << 10u) - 1u;
-    constexpr float colorsScale = 1.f / 1023.f;
-    constexpr float alphaScale = 1.f / 3.f;
-    uint32_t r1 = (rgba1010102) & scalarMask;
-    uint32_t g1 = (rgba1010102 >> 10) & scalarMask;
-    uint32_t b1 = (rgba1010102 >> 20) & scalarMask;
-    uint32_t a1 = (rgba1010102 >> 30) * 3;
-    float rFloat = static_cast<float>(r1) * colorsScale;
-    float gFloat = static_cast<float>(g1) * colorsScale;
-    float bFloat = static_cast<float>(b1) * colorsScale;
-    float aFloat = static_cast<float>(a1) * alphaScale;
-
-    r = rFloat;
-    g = gFloat;
-    b = bFloat;
-    a = aFloat;
-  }
-};
-
-} // sparkyuv
-HWY_AFTER_NAMESPACE();
-
-#undef BILINEAR_ENABLE_HWY
-
-#endif //SPARKYUV_BILINEAR_ROW_SAMPLER
diff --git a/src/sampler/BoxRowSampler-inl.h b/src/sampler/BoxRowSampler-inl.h
deleted file mode 100644
index 17d7a2e..0000000
--- a/src/sampler/BoxRowSampler-inl.h
+++ /dev/null
@@ -1,252 +0,0 @@
-// Copyright (C) 2024 Radzivon Bartoshyk
-//
-// This file belongs to sparkyuv project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(SPARKYUV_BOX_ROW_SAMPLER) == defined(HWY_TARGET_TOGGLE)
-#ifdef SPARKYUV_BOX_ROW_SAMPLER
-#undef SPARKYUV_BOX_ROW_SAMPLER
-#else
-#define SPARKYUV_BOX_ROW_SAMPLER
-#endif
-
-#include "hwy/highway.h"
-#include "ScaleRowSampler.hpp"
-#include "sampler-inl.h"
-#include "sampler.h"
-#include "../sparkyuv-internal.h"
-#include <cstdint>
-#include <algorithm>
-#include <cmath>
-
-HWY_BEFORE_NAMESPACE();
-namespace sparkyuv::HWY_NAMESPACE {
-
-using namespace hwy;
-using namespace hwy::HWY_NAMESPACE;
-
-template<typename T, sparkyuv::BoxSamplerPixType PixelType, int Components>
-class BoxSampler : public ScaleRowSampler<T> {
- public:
-  BoxSampler(const T *mSource,
-             const int srcStride,
-             const int inputWidth,
-             const int inputHeight,
-             T *mDestination,
-             const int dstStride,
-             const int outputWidth,
-             const int outputHeight) :
-      ScaleRowSampler<T>(mSource,
-                         srcStride,
-                         inputWidth,
-                         inputHeight,
-                         mDestination,
-                         dstStride,
-                         outputWidth,
-                         outputHeight) {
-
-  }
-
-  ~BoxSampler() = default;
-
-  void sample(const int row) override {
-    auto dst = reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(this->mDestination) + row * this->dstStride);
-
-    auto src8 = reinterpret_cast<const uint8_t *>(this->mSource);
-
-    const int components = Components;
-#if SPARKYUV_ALLOW_FLOAT16
-    const FixedTag<hwy::float16_t, 8> df16;
-    const Half<decltype(df16)> dhf16;
-    const auto v16Scale = Set(dhf16, hwy::F16FromF32(1.f / 4.f));
-#endif
-
-    uint32_t x = 0;
-    if (PixelType == sparkyuv::BOX_UINT16 && components == 4) {
-      for (; x + 2 < this->outputWidth; ++x) {
-        auto srcX = static_cast<float>(x * this->xScale);
-        auto srcY = static_cast<float>(row * this->yScale);
-
-        const int x1 = static_cast<int>(::floorf(srcX));
-        const int y1 = static_cast<int>(::floorf(srcY));
-
-        const int y2 = std::min(y1 + 1, this->inputHeight - 1);
-
-        const FixedTag<uint16_t, 8> d;
-        const Half<decltype(d)> dh;
-        const RepartitionToWide<decltype(d)> d32;
-        const auto row1 = LoadU(d, &reinterpret_cast<const uint16_t *>(src8 + y1 * this->srcStride)[x1*4]);
-        const auto row2 = LoadU(d, &reinterpret_cast<const uint16_t *>(src8 + y2 * this->srcStride)[x1*4]);
-        const auto row1Upper = PromoteUpperTo(d32, row1);
-        const auto row2Upper = PromoteUpperTo(d32, row2);
-        const auto row1Lower = PromoteLowerTo(d32, row1);
-        const auto row2Lower = PromoteLowerTo(d32, row2);
-        const auto newWidePX = ShiftRight<2>(Add(Add(Add(row1Lower, row1Upper), row2Lower), row2Upper));
-        const auto newPX = DemoteTo(dh, newWidePX);
-        StoreU(newPX, dh, reinterpret_cast<uint16_t *>(dst));
-
-        dst += 4;
-      }
-    } else if (PixelType == sparkyuv::BOX_UINT8) {
-      for (; x + 2 < this->outputWidth; ++x) {
-        auto srcX = static_cast<float>(x * this->xScale);
-        auto srcY = static_cast<float>(row * this->yScale);
-
-        const int x1 = static_cast<int>(::floorf(srcX));
-        const int y1 = static_cast<int>(::floorf(srcY));
-
-        const int y2 = std::min(y1 + 1, this->inputHeight - 1);
-
-        const FixedTag<uint8_t, 8> d;
-        const Half<decltype(d)> dh;
-        const Rebind<uint16_t, decltype(d)> d16;
-        const RepartitionToWide<decltype(d16)> d32;
-        const auto row1 = LoadU(d, &reinterpret_cast<const uint8_t *>(src8 + y1 * this->srcStride)[x1*4]);
-        const auto row2 = LoadU(d, &reinterpret_cast<const uint8_t *>(src8 + y2 * this->srcStride)[x1*4]);
-        const auto sums = AddWide(d16, row1, row2);
-        const auto newWidePX = ShiftRightNarrow<2>(d32, SumsOf2(sums));
-        const auto newPX = DemoteTo(dh, newWidePX);
-        StoreU(newPX, dh, reinterpret_cast<uint8_t *>(dst));
-        dst += 4;
-      }
-    } else if (PixelType == sparkyuv::BOX_FLOAT16) {
-      for (; x + 2 < this->outputWidth; ++x) {
-        auto srcX = static_cast<float>(x * this->xScale);
-        auto srcY = static_cast<float>(row * this->yScale);
-
-        const int x1 = static_cast<int>(::floorf(srcX));
-        const int y1 = static_cast<int>(::floorf(srcY));
-
-        const int y2 = std::min(y1 + 1, this->inputHeight - 1);
-
-#if SPARKYUV_ALLOW_FLOAT16
-        const auto row1 = LoadU(df16, &reinterpret_cast<const hwy::float16_t *>(src8 + y1 * this->srcStride)[x1*4]);
-        const auto row2 = LoadU(df16, &reinterpret_cast<const hwy::float16_t *>(src8 + y2 * this->srcStride)[x1*4]);
-        const auto newWidePX = Mul(DemoteTo(df16, SumsOf2(Add(row1, row2))), v16Scale);
-        const auto newPX = newWidePX;
-        StoreU(newPX, dhf16, reinterpret_cast<hwy::float16_t *>(dst));
-#else
-        const FixedTag<uint16_t, 8> d;
-        const Half<decltype(d)> dh;
-        const RepartitionToWide<decltype(d)> d32;
-        const Rebind<float32_t, decltype(d32)> f32;
-        const Rebind<float16_t, decltype(f32)> f16;
-        const auto vScale = Set(f32, 1.f / 4.f);
-        const auto row1 = LoadU(d, &reinterpret_cast<const uint16_t *>(src8 + y1 * this->srcStride)[x1*4]);
-        const auto row2 = LoadU(d, &reinterpret_cast<const uint16_t *>(src8 + y2 * this->srcStride)[x1*4]);
-        const auto row1Upper = PromoteTo(f32, DemoteTo(f16, ConvertTo(f32, PromoteUpperTo(d32, row1))));
-        const auto row2Upper = PromoteTo(f32, DemoteTo(f16, ConvertTo(f32, PromoteUpperTo(d32, row2))));
-        const auto row1Lower = PromoteTo(f32, DemoteTo(f16, ConvertTo(f32, PromoteLowerTo(d32, row1))));
-        const auto row2Lower = PromoteTo(f32, DemoteTo(f16, ConvertTo(f32, PromoteLowerTo(d32, row2))));
-        const auto newWidePX = DemoteTo(f16, Mul(Add(Add(Add(row1Lower, row1Upper), row2Lower), row2Upper), vScale));
-        const auto newPX = BitCast(dh, newWidePX);
-        StoreU(newPX, dh, reinterpret_cast<uint16_t *>(dst));
-#endif
-
-        dst += 4;
-      }
-    }
-
-    for (; x < this->outputWidth; ++x) {
-      const float srcX = (float) x * this->xScale;
-      const float srcY = (float) row * this->yScale;
-
-      const int x1 = static_cast<int>(::floorf(srcX));
-      const int y1 = static_cast<int>(::floorf(srcY));
-
-      int x2 = std::min(x1 + 1, this->inputWidth - 1);
-      int y2 = std::min(y1 + 1, this->inputHeight - 1);
-
-      auto row1 = reinterpret_cast<const T *>(src8 + y1 * this->srcStride);
-      auto row2 = reinterpret_cast<const T *>(src8 + y2 * this->srcStride);
-
-      if (PixelType != sparkyuv::BOX_RGBA1010102 && PixelType != sparkyuv::BOX_FLOAT16) {
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-        for (int c = 0; c < components; ++c) {
-          auto c1 = static_cast<uint32_t>(row1[x1 * components + c]);
-          auto c2 = static_cast<uint32_t>(row1[x2 * components + c]);
-          auto c3 = static_cast<uint32_t>(row2[x1 * components + c]);
-          auto c4 = static_cast<uint32_t>(row2[x2 * components + c]);
-
-          uint32_t result = (c1 + c2 + c3 + c4) >> 2;
-          dst[0] = static_cast<T>(result);
-          dst += 1;
-        }
-      } else if (PixelType == sparkyuv::BOX_FLOAT16) {
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-        for (int c = 0; c < components; ++c) {
-          constexpr float scale = 1.f / 4.f;
-          auto c1 = hwy::F32FromF16(hwy::float16_t::FromBits(row1[x1 * components + c]));
-          auto c2 = hwy::F32FromF16(hwy::float16_t::FromBits(row1[x2 * components + c]));
-          auto c3 = hwy::F32FromF16(hwy::float16_t::FromBits(row2[x1 * components + c]));
-          auto c4 = hwy::F32FromF16(hwy::float16_t::FromBits(row2[x2 * components + c]));
-
-          float result = (c1 + c2 + c3 + c4) * scale;
-          dst[0] = static_cast<T>(hwy::F16FromF32(result).bits);
-          dst += 1;
-        }
-      } else if (PixelType == sparkyuv::BOX_RGBA1010102) {
-        uint32_t p1 = reinterpret_cast<const uint32_t *>(row1)[x1];
-        uint32_t p2 = reinterpret_cast<const uint32_t *>(row1)[x2];
-        uint32_t p3 = reinterpret_cast<const uint32_t *>(row2)[x1];
-        uint32_t p4 = reinterpret_cast<const uint32_t *>(row2)[x2];
-
-        uint32_t r1, g1, b1, a1;
-        uint32_t r2, g2, b2, a2;
-        uint32_t r3, g3, b3, a3;
-        uint32_t r4, g4, b4, a4;
-
-        sparse1010102(p1, r1, g1, b1, a1);
-        sparse1010102(p2, r2, g2, b2, a2);
-        sparse1010102(p3, r3, g3, b3, a3);
-        sparse1010102(p4, r4, g4, b4, a4);
-
-        uint32_t r = (r1 + r2 + r3 + r4) >> 2;
-        uint32_t g = (g1 + g2 + g3 + g4) >> 2;
-        uint32_t b = (b1 + b2 + b3 + b4) >> 2;
-        uint32_t a = (a1 + a2 + a3 + a4) >> 2;
-
-        reinterpret_cast<uint32_t *>(dst)[0] = (a << 30) | (b << 20) | (g << 10) | r;
-
-        if (std::is_same<T, uint8_t>::value) {
-          dst += 4;
-        } else if (std::is_same<T, uint32_t>::value) {
-          dst += 1;
-        }
-      }
-    }
-  }
-
-  inline void sparse1010102(const uint32_t rgba1010102, uint32_t &r, uint32_t &g, uint32_t &b, uint32_t &a) {
-    constexpr uint32_t scalarMask = (1u << 10u) - 1u;
-    uint32_t r1 = (rgba1010102) & scalarMask;
-    uint32_t g1 = (rgba1010102 >> 10) & scalarMask;
-    uint32_t b1 = (rgba1010102 >> 20) & scalarMask;
-    uint32_t a1 = (rgba1010102 >> 30) * 3;
-
-    r = r1;
-    g = g1;
-    b = b1;
-    a = a1;
-  }
-};
-
-}
-HWY_AFTER_NAMESPACE();
-
-#endif
\ No newline at end of file
diff --git a/src/sampler/NearestRowSampler-inl.hpp b/src/sampler/NearestRowSampler-inl.hpp
deleted file mode 100644
index 8ae214b..0000000
--- a/src/sampler/NearestRowSampler-inl.hpp
+++ /dev/null
@@ -1,175 +0,0 @@
-// Copyright (C) 2024 Radzivon Bartoshyk
-//
-// This file belongs to sparkyuv project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(SPARKYUV_NEAREST_ROW_SAMPLER) == defined(HWY_TARGET_TOGGLE)
-#ifdef SPARKYUV_NEAREST_ROW_SAMPLER
-#undef SPARKYUV_NEAREST_ROW_SAMPLER
-#else
-#define SPARKYUV_NEAREST_ROW_SAMPLER
-#endif
-
-#include "hwy/highway.h"
-#include "ScaleRowSampler.hpp"
-#include "sampler.h"
-#include <cstdint>
-#include <algorithm>
-#include <cmath>
-
-HWY_BEFORE_NAMESPACE();
-namespace sparkyuv::HWY_NAMESPACE {
-
-using namespace sparkyuv;
-
-template<int components>
-class NearestRowSampler : public ScaleRowSampler<uint8_t> {
- public:
-  NearestRowSampler(const uint8_t *mSource,
-                    const int srcStride,
-                    const int inputWidth,
-                    const int inputHeight,
-                    uint8_t *mDestination,
-                    const int dstStride,
-                    const int outputWidth,
-                    const int outputHeight) :
-      ScaleRowSampler<uint8_t>(mSource,
-                               srcStride,
-                               inputWidth,
-                               inputHeight,
-                               mDestination,
-                               dstStride,
-                               outputWidth,
-                               outputHeight) {
-
-  }
-
-  void sample(const int row) override {
-    auto dst = reinterpret_cast<uint8_t *>(this->mDestination + row * this->dstStride);
-    if (components == 4) {
-      for (uint32_t x = 0; x < this->outputWidth; ++x) {
-        auto srcX = static_cast<float>(x * this->xScale);
-        auto srcY = static_cast<float>(row * this->yScale);
-
-        const int x1 = std::clamp(static_cast<int>(::floorf(srcX)), 0, this->inputWidth - 1);
-        const int y1 = std::clamp(static_cast<int>(::floorf(srcY)), 0, this->inputHeight - 1);
-        auto srcRow = reinterpret_cast<const uint8_t *>(this->mSource + y1 * this->srcStride);
-        uint32_t px = reinterpret_cast<const uint32_t *>(srcRow)[x1];
-        reinterpret_cast<uint32_t *>(dst)[x] = px;
-      }
-    } else {
-      for (uint32_t x = 0; x < this->outputWidth; ++x) {
-        auto srcX = static_cast<float>(x * this->xScale);
-        auto srcY = static_cast<float>(row * this->yScale);
-
-        const int x1 = std::clamp(static_cast<int>(::floorf(srcX)), 0, this->inputWidth - 1);
-        const int y1 = std::clamp(static_cast<int>(::floorf(srcY)), 0, this->inputHeight - 1);
-        auto srcRow = reinterpret_cast<const uint8_t *>(this->mSource + y1 * this->srcStride);
-        auto srcPtr = &srcRow[x1 * components];
-        std::copy(srcPtr, srcPtr + sizeof(uint8_t) * components, &dst[x * components]);
-      }
-    }
-  }
-
-  ~NearestRowSampler() override = default;
-
- private:
-  const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f;
-};
-
-template<int Components>
-class NearestRowSampler16Bit : public ScaleRowSampler<uint16_t> {
- public:
-  NearestRowSampler16Bit(const uint16_t *mSource,
-                         const int srcStride,
-                         const int inputWidth,
-                         const int inputHeight,
-                         uint16_t *mDestination,
-                         const int dstStride,
-                         const int outputWidth,
-                         const int outputHeight) :
-      ScaleRowSampler<uint16_t>(mSource,
-                                srcStride,
-                                inputWidth,
-                                inputHeight,
-                                mDestination,
-                                dstStride,
-                                outputWidth,
-                                outputHeight) {
-
-  }
-
-  void sample(const int row) override {
-    const int components = Components;
-    auto dst = reinterpret_cast<uint16_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + row * this->dstStride);
-    for (int x = 0; x < this->outputWidth; ++x) {
-      const float srcX = (float) x * this->xScale;
-      const float srcY = (float) row * this->yScale;
-
-      const int x1 = std::clamp(static_cast<int>(::floorf(srcX)), 0, this->inputWidth - 1);
-      const int y1 = std::clamp(static_cast<int>(::floorf(srcY)), 0, this->inputHeight - 1);
-      auto srcRow =
-          reinterpret_cast<const uint16_t *>(reinterpret_cast<const uint8_t *>(this->mSource) + y1 * this->srcStride);
-      auto srcPtr = &srcRow[x1 * components];
-      std::copy(srcPtr, srcPtr + sizeof(uint8_t) * components, &dst[x * components]);
-    }
-  }
-
-  ~NearestRowSampler16Bit() override = default;
-
-};
-
-class NearestRowSampler10Bit : public ScaleRowSampler<uint32_t> {
- public:
-  NearestRowSampler10Bit(const uint32_t *mSource,
-                         const int srcStride,
-                         const int inputWidth,
-                         const int inputHeight,
-                         uint32_t *mDestination,
-                         const int dstStride,
-                         const int outputWidth,
-                         const int outputHeight) :
-      ScaleRowSampler<uint32_t>(mSource,
-                                srcStride,
-                                inputWidth,
-                                inputHeight,
-                                mDestination,
-                                dstStride,
-                                outputWidth,
-                                outputHeight) {
-
-  }
-
-  void sample(const int row) override {
-    auto dst = reinterpret_cast<uint32_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + row * this->dstStride);
-    for (int x = 0; x < this->outputWidth; ++x) {
-      const float srcX = (float) x * xScale;
-      const float srcY = (float) row * yScale;
-
-      const int x1 = std::clamp(static_cast<int>(::floorf(srcX)), 0, inputWidth - 1);
-      const int y1 = std::clamp(static_cast<int>(::floorf(srcY)), 0, inputHeight - 1);
-      auto srcRow = reinterpret_cast<const uint32_t *>(reinterpret_cast<const uint8_t *>(mSource) + y1 * srcStride);
-      dst[x] = srcRow[x1];
-    }
-  }
-
-  ~NearestRowSampler10Bit() override = default;
-
- private:
-};
-
-}
-HWY_AFTER_NAMESPACE();
-
-#endif
diff --git a/src/sampler/ScaleRowSampler.hpp b/src/sampler/ScaleRowSampler.hpp
deleted file mode 100644
index 46cc295..0000000
--- a/src/sampler/ScaleRowSampler.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (C) 2024 Radzivon Bartoshyk
-//
-// This file belongs to sparkyuv project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-namespace sparkyuv {
-
-typedef float (*ScaleWeightSampler)(float);
-
-template<typename T>
-class ScaleRowSampler {
- public:
-  ScaleRowSampler(const T *mSource,
-                  const int srcStride,
-                  const int inputWidth,
-                  const int inputHeight,
-                  T *mDestination,
-                  const int dstStride,
-                  const int outputWidth,
-                  const int outputHeight) : mSource(mSource),
-                                            srcStride(srcStride),
-                                            inputWidth(inputWidth),
-                                            inputHeight(inputHeight),
-                                            mDestination(mDestination),
-                                            dstStride(dstStride),
-                                            outputWidth(outputWidth),
-                                            outputHeight(outputHeight) {
-    xScale = static_cast<float>(inputWidth) / static_cast<float>(outputWidth);
-    yScale = static_cast<float>(inputHeight) / static_cast<float>(outputHeight);
-  }
-
-  virtual void sample(int row) = 0;
-
-  virtual ~ScaleRowSampler() = default;
-
- public:
-  const T *mSource;
-  const int srcStride;
-  const int inputWidth;
-  const int inputHeight;
-  T *mDestination;
-  const int dstStride;
-  const int outputWidth;
-  const int outputHeight;
-
-  float xScale;
-  float yScale;
-};
-}
\ No newline at end of file
diff --git a/src/sampler/Window4RowSampler-inl.hpp b/src/sampler/Window4RowSampler-inl.hpp
deleted file mode 100644
index 7307bfb..0000000
--- a/src/sampler/Window4RowSampler-inl.hpp
+++ /dev/null
@@ -1,686 +0,0 @@
-// Copyright (C) 2024 Radzivon Bartoshyk
-//
-// This file belongs to sparkyuv project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(SPARKYUV_WINDOW4_ROW_SAMPLER) == defined(HWY_TARGET_TOGGLE)
-#ifdef SPARKYUV_WINDOW4_ROW_SAMPLER
-#undef SPARKYUV_WINDOW4_ROW_SAMPLER
-#else
-#define SPARKYUV_WINDOW4_ROW_SAMPLER
-#endif
-
-#include <hwy/highway.h>
-#include <cstdint>
-#include <algorithm>
-#include "ScaleRowSampler.hpp"
-#include "src/sampler/sampler.h"
-#include <cmath>
-
-#if HWY_TARGET != HWY_SVE && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
-#define WEIGHTED_WINDOW4_HWY 1
-#else
-#define WEIGHTED_WINDOW4_HWY 0
-#endif
-
-#if WEIGHTED_WINDOW4_HWY
-#include "src/sampler/sampler-inl.h"
-#endif
-
-HWY_BEFORE_NAMESPACE();
-namespace sparkyuv::HWY_NAMESPACE {
-
-using namespace hwy;
-using namespace hwy::HWY_NAMESPACE;
-using namespace sparkyuv;
-
-enum WeightedRow4Operation {
-  WEIGHTED_ROW4_HERMITE,
-  WEIGHTED_ROW4_CATMULL_ROM,
-  WEIGHTED_ROW4_BSPLINE,
-  WEIGHTED_ROW4_CUBIC,
-  WEIGHTED_ROW4_BICUBIC,
-  WEIGHTED_ROW4_MITCHELL
-};
-
-template<WeightedRow4Operation op, int Components>
-class WeightedWindow4RowSampler : public ScaleRowSampler<uint8_t> {
- public:
-  WeightedWindow4RowSampler(const uint8_t *mSource,
-                            const int srcStride,
-                            const int inputWidth,
-                            const int inputHeight,
-                            uint8_t *mDestination,
-                            const int dstStride,
-                            const int outputWidth,
-                            const int outputHeight) :
-      ScaleRowSampler<uint8_t>(mSource, srcStride,
-                               inputWidth, inputHeight,
-                               mDestination, dstStride,
-                               outputWidth, outputHeight) {
-    switch (op) {
-      case WEIGHTED_ROW4_HERMITE: {
-        sampler = CubicHermite;
-#if WEIGHTED_WINDOW4_HWY
-        samplerHWY = CubicHermiteV;
-#endif
-      }
-        break;
-      case WEIGHTED_ROW4_CATMULL_ROM: {
-        sampler = CatmullRom;
-#if WEIGHTED_WINDOW4_HWY
-        samplerHWY = CatmullRomV;
-#endif
-      }
-        break;
-      case WEIGHTED_ROW4_BSPLINE: {
-        sampler = BSpline;
-#if WEIGHTED_WINDOW4_HWY
-        samplerHWY = CubicBSplineV;
-#endif
-      }
-        break;
-      case WEIGHTED_ROW4_CUBIC: {
-        sampler = SimpleCubic;
-#if WEIGHTED_WINDOW4_HWY
-        samplerHWY = SimpleCubicV;
-#endif
-      }
-        break;
-      case WEIGHTED_ROW4_BICUBIC: {
-        sampler = BiCubicSpline;
-#if WEIGHTED_WINDOW4_HWY
-        samplerHWY = BiCubicSplineV;
-#endif
-      }
-        break;
-      case WEIGHTED_ROW4_MITCHELL: {
-        sampler = MitchellNetravalli;
-#if WEIGHTED_WINDOW4_HWY
-        samplerHWY = MitchellNetravaliV;
-#endif
-      }
-        break;
-    }
-  }
-
-  void sample(const int y) override {
-#if WEIGHTED_WINDOW4_HWY
-    const FixedTag<float32_t, 4> dfx4;
-    const FixedTag<int32_t, 4> dix4;
-    const FixedTag<uint32_t, 4> dux4;
-    const FixedTag<uint8_t, 4> du8x4;
-    using VI4 = Vec<decltype(dix4)>;
-    using VF4 = Vec<decltype(dfx4)>;
-    using VU8x4 = Vec<decltype(du8x4)>;
-    const VF4 vfZeros = Zero(dfx4);
-    const VF4 maxColorsV = Set(dfx4, maxColors);
-#endif
-
-    auto dst = reinterpret_cast<uint8_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + y * this->dstStride);
-
-    const int components = Components;
-
-    uint32_t x = 0;
-
-#if WEIGHTED_WINDOW4_HWY
-#if !NOACCELERATED_SAMPLER
-    for (; x + 8 < this->outputWidth && components == 4; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-
-      // only kernel with size 2 is supported
-      constexpr int kernelSize = 2;
-
-      float kx1 = ::floorf(srcX);
-      float ky1 = ::floorf(srcY);
-
-      VF4 color = Set(dfx4, 0);
-
-      const int a = kernelSize;
-      const int mMaxWidth = this->inputWidth - 1;
-
-      const int appendixLow[4] = {-1, 0, 1, 2};
-
-      VF4 srcXV = Set(dfx4, srcX);
-      VI4 kx1V = Set(dix4, kx1);
-      const VI4 appendixLowV = LoadU(dix4, appendixLow);
-
-      for (int j = -a + 1; j <= a; j++) {
-        int yj = (int) ky1 + j;
-        float dy = float(srcY) - (float(ky1) + (float) j);
-        float yWeight = sampler(dy);
-        auto row = reinterpret_cast<const uint8_t *>(this->mSource
-            + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride);
-        VF4 yWeightV = Set(dfx4, yWeight);
-        VI4 xi = Add(kx1V, appendixLowV);
-        VF4 dx = Sub(srcXV, ConvertTo(dfx4, xi));
-        VF4 weights = Mul(samplerHWY(dfx4, dx), yWeightV);
-        for (int i = 0; i < components; ++i) {
-          int sizeXPos = std::clamp(ExtractLane(xi, i), 0, mMaxWidth) * components;
-          VU8x4 u81 = LoadU(du8x4, reinterpret_cast<const uint8_t *>(&row[sizeXPos]));
-          VF4 fr1 = ConvertTo(dfx4, PromoteTo(dix4, u81));
-          fr1 = Mul(fr1, Set(dfx4, ExtractLane(weights, i)));
-          color = Add(color, fr1);
-        }
-      }
-
-      color = ClampRound(dfx4, color, vfZeros, maxColorsV);
-      VU8x4 u8Color = DemoteTo(du8x4, ConvertTo(dux4, color));
-      StoreU(u8Color, du8x4, reinterpret_cast<uint8_t *>(&dst[x * components]));
-    }
-#endif
-#endif
-
-    for (; x < this->outputWidth; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-
-      float kx1 = ::floorf(srcX);
-      float ky1 = ::floorf(srcY);
-
-      int a = 2;
-
-      float rgb[components];
-      std::fill(rgb, rgb + components, 0.0f);
-
-      for (int j = -a + 1; j <= a; j++) {
-        int yj = (int) ky1 + j;
-        float dy = float(srcY) - (float(ky1) + (float) j);
-        float yWeight = sampler(dy);
-
-        auto row = reinterpret_cast<const uint8_t *>(this->mSource
-            + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride);
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-        for (int i = -a + 1; i <= a; i++) {
-          int xi = (int) kx1 + i;
-          float dx = float(srcX) - (float(kx1) + (float) i);
-          float weight = sampler(dx) * yWeight;
-
-          const int px = std::clamp(xi, 0, this->inputWidth - 1) * components;
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-          for (int c = 0; c < components; ++c) {
-            auto clrf = static_cast<float>(row[px + c]);
-            float clr = clrf * weight;
-            rgb[c] += clr;
-          }
-        }
-      }
-
-      const int px = x * components;
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-      for (int c = 0; c < components; ++c) {
-        dst[px + c] = static_cast<uint8_t>(std::clamp(::roundf(rgb[c]), 0.f, maxColors));
-      }
-    }
-  }
-
-  ~WeightedWindow4RowSampler() override = default;
-
- private:
-  const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f;
-  ScaleWeightSampler sampler;
-
-#if WEIGHTED_WINDOW4_HWY
-  typedef Vec<FixedTag<float32_t, 4>> (*ScaleWeightSamplerHWY)(FixedTag<float32_t, 4>, Vec<FixedTag<float32_t, 4>>);
-  ScaleWeightSamplerHWY samplerHWY;
-#endif
-};
-
-template<WeightedRow4Operation op, int Components>
-class WeightedWindow4RowSampler16Bit : public ScaleRowSampler<uint16_t> {
- public:
-  WeightedWindow4RowSampler16Bit(const uint16_t *mSource,
-                                 const int srcStride,
-                                 const int inputWidth,
-                                 const int inputHeight,
-                                 uint16_t *mDestination,
-                                 const int dstStride,
-                                 const int outputWidth,
-                                 const int outputHeight,
-                                 const int depth) :
-      ScaleRowSampler<uint16_t>(mSource, srcStride,
-                                inputWidth, inputHeight,
-                                mDestination, dstStride,
-                                outputWidth, outputHeight),
-      maxColors(::powf(2.0f, static_cast<float>(depth)) - 1.0f) {
-    switch (op) {
-      case WEIGHTED_ROW4_HERMITE: {
-        sampler = CubicHermite;
-      }
-        break;
-      case WEIGHTED_ROW4_CATMULL_ROM: {
-        sampler = CatmullRom;
-      }
-        break;
-      case WEIGHTED_ROW4_BSPLINE: {
-        sampler = BSpline;
-      }
-        break;
-      case WEIGHTED_ROW4_CUBIC: {
-        sampler = SimpleCubic;
-      }
-        break;
-      case WEIGHTED_ROW4_BICUBIC: {
-        sampler = BiCubicSpline;
-      }
-        break;
-      case WEIGHTED_ROW4_MITCHELL: {
-        sampler = MitchellNetravalli;
-      }
-        break;
-    }
-  }
-
-  void sample(const int y) override {
-
-    auto dst = reinterpret_cast<uint8_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + y * this->dstStride);
-
-    const int components = Components;
-
-    for (int x = 0; x < this->outputWidth; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-
-      float kx1 = ::floorf(srcX);
-      float ky1 = ::floorf(srcY);
-
-      int a = 2;
-
-      float rgb[components];
-      std::fill(rgb, rgb + components, 0.0f);
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-      for (int j = -a + 1; j <= a; j++) {
-        int yj = (int) ky1 + j;
-        float dy = float(srcY) - (float(ky1) + (float) j);
-        float yWeight = sampler(dy);
-
-        auto row = reinterpret_cast<const uint16_t *>(reinterpret_cast<const uint8_t *>(this->mSource)
-            + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride);
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-        for (int i = -a + 1; i <= a; i++) {
-          int xi = (int) kx1 + i;
-          float dx = float(srcX) - (float(kx1) + (float) i);
-          float weight = sampler(dx) * yWeight;
-
-          const int px = std::clamp(xi, 0, this->inputWidth - 1) * components;
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-          for (int c = 0; c < components; ++c) {
-            auto clrf = static_cast<float>(row[px + c]);
-            float clr = clrf * weight;
-            rgb[c] += clr;
-          }
-        }
-      }
-
-      const int px = x * components;
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-      for (int c = 0; c < components; ++c) {
-        dst[px + c] = static_cast<uint16_t>(std::clamp(::roundf(rgb[c]), 0.f, maxColors));
-      }
-    }
-  }
-
-  ~WeightedWindow4RowSampler16Bit() override = default;
-
- private:
-  const float maxColors;
-  ScaleWeightSampler sampler;
-};
-
-template<WeightedRow4Operation op>
-class WeightedWindow4RowSampler10Bit : public ScaleRowSampler<uint32_t> {
- public:
-  WeightedWindow4RowSampler10Bit(const uint32_t *mSource, const int srcStride,
-                                 const int inputWidth, const int inputHeight,
-                                 uint32_t *mDestination, const int dstStride,
-                                 const int outputWidth, const int outputHeight) :
-      ScaleRowSampler<uint32_t>(mSource, srcStride,
-                                inputWidth, inputHeight,
-                                mDestination, dstStride,
-                                outputWidth, outputHeight) {
-    switch (op) {
-      case WEIGHTED_ROW4_HERMITE: {
-        sampler = CubicHermite;
-      }
-        break;
-      case WEIGHTED_ROW4_CATMULL_ROM: {
-        sampler = CatmullRom;
-      }
-        break;
-      case WEIGHTED_ROW4_BSPLINE: {
-        sampler = BSpline;
-      }
-        break;
-      case WEIGHTED_ROW4_CUBIC: {
-        sampler = SimpleCubic;
-      }
-        break;
-      case WEIGHTED_ROW4_BICUBIC: {
-        sampler = BiCubicSpline;
-      }
-        break;
-      case WEIGHTED_ROW4_MITCHELL: {
-        sampler = MitchellNetravalli;
-      }
-        break;
-    }
-  }
-
-  void sample(const int y) override {
-    auto dst = reinterpret_cast<uint32_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + y * this->dstStride);
-
-    for (int x = 0; x < this->outputWidth; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-
-      float kx1 = ::floorf(srcX);
-      float ky1 = ::floorf(srcY);
-
-      const int a = 2;
-
-      float rgb[4] = {0, 0, 0, 0};
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-      for (int j = -a + 1; j <= a; j++) {
-        int yj = (int) ky1 + j;
-        float dy = float(srcY) - (float(ky1) + (float) j);
-        float yWeight = sampler(dy);
-
-        auto row = reinterpret_cast<const uint32_t *>(reinterpret_cast<const uint8_t *>(this->mSource) +
-            std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride);
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-        for (int i = -a + 1; i <= a; i++) {
-          int xi = (int) kx1 + i;
-          float dx = float(srcX) - (float(kx1) + (float) i);
-          float weight = sampler(dx) * yWeight;
-
-          const int px = std::clamp(xi, 0, this->inputWidth - 1);
-
-          uint32_t color = row[px];
-
-          float r = 0, g = 0, b = 0, aAlpha = 0;
-          parseToFloat(color, r, g, b, aAlpha);
-
-          rgb[0] += r * weight;
-          rgb[1] += g * weight;
-          rgb[2] += b * weight;
-          rgb[3] += aAlpha * weight;
-        }
-      }
-
-      auto R10 = static_cast<uint32_t >(std::clamp(::roundf(rgb[0] * maxColors), 0.0f, (float) maxColors));
-      auto G10 = static_cast<uint32_t >(std::clamp(::roundf(rgb[1] * maxColors), 0.0f, (float) maxColors));
-      auto B10 = static_cast<uint32_t >(std::clamp(::roundf(rgb[2] * maxColors), 0.0f, (float) maxColors));
-      auto A10 = static_cast<uint32_t >(std::clamp(::roundf(rgb[3] * 3.f), 0.0f, 3.0f));
-
-      dst[x] = (A10 << 30) | (B10 << 20) | (G10 << 10) | R10;
-    }
-  }
-
-  ~WeightedWindow4RowSampler10Bit() override = default;
-
- private:
-
-  const float maxColors = ::powf(2.0f, (float) 10.f) - 1.0f;
-  ScaleWeightSampler sampler;
-
-  inline void parseToFloat(const uint32_t rgba1010102, float &r, float &g, float &b, float &a) {
-    const uint32_t scalarMask = (1u << 10u) - 1u;
-    uint32_t b1 = (rgba1010102) & scalarMask;
-    uint32_t g1 = (rgba1010102 >> 10) & scalarMask;
-    uint32_t r1 = (rgba1010102 >> 20) & scalarMask;
-    uint32_t a1 = (rgba1010102 >> 30) * 3;
-    constexpr float colorScale = 1.f / 1023.f;
-    constexpr float alphaScale = 1.f / 3.f;
-    float rFloat = static_cast<float>(r1) * colorScale;
-    float gFloat = static_cast<float>(g1) * colorScale;
-    float bFloat = static_cast<float>(b1) * colorScale;
-    float aFloat = static_cast<float>(a1) * alphaScale;
-
-    r = rFloat;
-    g = gFloat;
-    b = bFloat;
-    a = aFloat;
-  }
-};
-
-template<WeightedRow4Operation op, int Components>
-class WeightedWindow4RowSamplerF16Bit : public ScaleRowSampler<uint16_t> {
- public:
-  WeightedWindow4RowSamplerF16Bit(const uint16_t *mSource,
-                                  const int srcStride,
-                                  const int inputWidth,
-                                  const int inputHeight,
-                                  uint16_t *mDestination,
-                                  const int dstStride,
-                                  const int outputWidth,
-                                  const int outputHeight) :
-      ScaleRowSampler<uint16_t>(mSource, srcStride,
-                                inputWidth, inputHeight,
-                                mDestination, dstStride,
-                                outputWidth, outputHeight) {
-    switch (op) {
-      case WEIGHTED_ROW4_HERMITE: {
-        sampler = CubicHermite;
-#if WEIGHTED_WINDOW4_HWY
-        samplerHWY = CubicHermiteV;
-#endif
-      }
-        break;
-      case WEIGHTED_ROW4_CATMULL_ROM: {
-        sampler = CatmullRom;
-#if WEIGHTED_WINDOW4_HWY
-        samplerHWY = CatmullRomV;
-#endif
-      }
-        break;
-      case WEIGHTED_ROW4_BSPLINE: {
-        sampler = BSpline;
-#if WEIGHTED_WINDOW4_HWY
-        samplerHWY = CubicBSplineV;
-#endif
-      }
-        break;
-      case WEIGHTED_ROW4_CUBIC: {
-        sampler = SimpleCubic;
-#if WEIGHTED_WINDOW4_HWY
-        samplerHWY = SimpleCubicV;
-#endif
-      }
-        break;
-      case WEIGHTED_ROW4_BICUBIC: {
-        sampler = BiCubicSpline;
-#if WEIGHTED_WINDOW4_HWY
-        samplerHWY = BiCubicSplineV;
-#endif
-      }
-        break;
-      case WEIGHTED_ROW4_MITCHELL: {
-        sampler = MitchellNetravalli;
-#if WEIGHTED_WINDOW4_HWY
-        samplerHWY = MitchellNetravaliV;
-#endif
-      }
-        break;
-    }
-  }
-
-  void sample(const int y) override {
-#if WEIGHTED_WINDOW4_HWY
-    const FixedTag<float32_t, 4> dfx4;
-    const FixedTag<int32_t, 4> dix4;
-    const FixedTag<hwy::float16_t, 4> df16x4;
-    using VI4 = Vec<decltype(dix4)>;
-    using VF4 = Vec<decltype(dfx4)>;
-    using VF16x4 = Vec<decltype(df16x4)>;
-    const int mMaxWidth = this->inputWidth - 1;
-#endif
-
-    const auto src8 = reinterpret_cast<const uint8_t *>(this->mSource);
-    auto dst16 = reinterpret_cast<uint16_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + y * this->dstStride);
-
-    const int components = Components;
-
-    uint32_t x = 0;
-
-#if WEIGHTED_WINDOW4_HWY
-#if !NOACCELERATED_SAMPLER
-    for (; x + 8 < this->outputWidth && components == 4; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-      const int a = 2;
-      float rgb[components];
-      fill(rgb, rgb + components, 0.0f);
-
-      float kx1 = ::floorf(srcX);
-      float ky1 = ::floorf(srcY);
-
-      VF4 color = Set(dfx4, 0);
-
-      const int appendixLow[4] = {-1, 0, 1, 2};
-
-      VF4 srcXV = Set(dfx4, srcX);
-      VI4 kx1V = Set(dix4, kx1);
-      const VI4 appendixLowV = LoadU(dix4, appendixLow);
-
-      #if defined(__clang__)
-      #pragma clang loop unroll(full)
-      #endif
-      for (int j = -a + 1; j <= a; j++) {
-        int yj = (int) ky1 + j;
-        float dy = float(srcY) - (float(ky1) + (float) j);
-        float yWeight = sampler(dy);
-        auto row =
-            reinterpret_cast<const uint16_t *>(src8 + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride);
-        VF4 yWeightV = Set(dfx4, yWeight);
-        VI4 xi = Add(kx1V, appendixLowV);
-        VF4 dx = Sub(srcXV, ConvertTo(dfx4, xi));
-        VF4 weights = Mul(samplerHWY(dfx4, dx), yWeightV);
-
-        #if defined(__clang__)
-        #pragma clang loop unroll(full)
-        #endif
-        for (int i = 0; i < 4; ++i) {
-          int sizeXPos = std::clamp(ExtractLane(xi, i), 0, mMaxWidth) * components;
-          VF16x4 r1 = LoadU(df16x4, reinterpret_cast<const hwy::float16_t *>(&row[sizeXPos]));
-          VF4 fr1 = PromoteTo(dfx4, r1);
-          fr1 = Mul(fr1, Set(dfx4, ExtractLane(weights, i)));
-          color = Add(color, fr1);
-        }
-      }
-
-      VF16x4 f16Color = DemoteTo(df16x4, color);
-      StoreU(f16Color, df16x4, reinterpret_cast<hwy::float16_t *>(&dst16[x * components]));
-    }
-#endif
-#endif
-
-    for (; x < this->outputWidth; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-
-      const int a = 2;
-      float rgb[components];
-      std::fill(rgb, rgb + components, 0.0f);
-
-      float kx1 = ::floorf(srcX);
-      float ky1 = ::floorf(srcY);
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-      for (int j = -a + 1; j <= a; j++) {
-        int yj = (int) ky1 + j;
-        float dy = float(srcY) - (float(ky1) + (float) j);
-        float yWeight = sampler(dy);
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-        for (int i = -a + 1; i <= a; i++) {
-          int xi = (int) kx1 + i;
-          float dx = float(srcX) - (float(kx1) + (float) i);
-          float weight = sampler(dx) * yWeight;
-
-          auto *row = reinterpret_cast<const uint16_t *>(reinterpret_cast<const uint8_t *>(src8) +
-              std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride);
-
-          const int px = std::clamp(xi, 0, this->inputWidth - 1) * components;
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-          for (int c = 0; c < components; ++c) {
-            float clrf = hwy::F32FromF16(hwy::float16_t::FromBits(row[px + c]));
-            float clr = (float) clrf * weight;
-            rgb[c] += clr;
-          }
-        }
-      }
-
-      int px = x * components;
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-      for (int c = 0; c < components; ++c) {
-        float newColor = rgb[c];
-        dst16[px + c] = hwy::F16FromF32(newColor).bits;
-      }
-    }
-  }
-
-  ~WeightedWindow4RowSamplerF16Bit() override = default;
-
- private:
-  ScaleWeightSampler sampler;
-#if WEIGHTED_WINDOW4_HWY
-  typedef Vec<FixedTag<float32_t, 4>> (*ScaleWeightSamplerHWY)(FixedTag<float32_t, 4>, Vec<FixedTag<float32_t, 4>>);
-  ScaleWeightSamplerHWY samplerHWY;
-#endif
-};
-
-}
-HWY_AFTER_NAMESPACE();
-
-#undef WEIGHTED_WINDOW4_HWY
-
-#endif
diff --git a/src/sampler/Window6RowSampler-inl.hpp b/src/sampler/Window6RowSampler-inl.hpp
deleted file mode 100644
index eff5a6f..0000000
--- a/src/sampler/Window6RowSampler-inl.hpp
+++ /dev/null
@@ -1,613 +0,0 @@
-// Copyright (C) 2024 Radzivon Bartoshyk
-//
-// This file belongs to sparkyuv project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#if defined(SPARKYUV_WINDOW6_ROW_SAMPLER) == defined(HWY_TARGET_TOGGLE)
-#ifdef SPARKYUV_WINDOW6_ROW_SAMPLER
-#undef SPARKYUV_WINDOW6_ROW_SAMPLER
-#else
-#define SPARKYUV_WINDOW6_ROW_SAMPLER
-#endif
-
-#include <hwy/highway.h>
-#include "ScaleRowSampler.hpp"
-#include <cstdint>
-#include <algorithm>
-#include "sampler.h"
-#include "src/math/math-inl.h"
-#include <cmath>
-
-#if HWY_TARGET != HWY_SVE && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
-#define WEIGHTED_WINDOW6_HWY 1
-#else
-#define WEIGHTED_WINDOW6_HWY 0
-#endif
-
-#if WEIGHTED_WINDOW6_HWY
-#include "sampler-inl.h"
-#endif
-
-HWY_BEFORE_NAMESPACE();
-namespace sparkyuv::HWY_NAMESPACE {
-using namespace hwy;
-using namespace hwy::HWY_NAMESPACE;
-using namespace sparkyuv;
-
-enum WeightedRow6Operation {
-  WEIGHTED_ROW6_LANCZOS_SINC
-};
-
-template<WeightedRow6Operation op>
-class WeightedWindow6RowSampler10Bit : public ScaleRowSampler<uint32_t> {
- public:
-  WeightedWindow6RowSampler10Bit(const uint32_t *mSource,
-                                 const int srcStride,
-                                 const int inputWidth,
-                                 const int inputHeight,
-                                 uint32_t *mDestination,
-                                 const int dstStride,
-                                 const int outputWidth,
-                                 const int outputHeight) :
-      ScaleRowSampler<uint32_t>(mSource,
-                                srcStride,
-                                inputWidth,
-                                inputHeight,
-                                mDestination,
-                                dstStride,
-                                outputWidth,
-                                outputHeight) {
-    switch (op) {
-      case WEIGHTED_ROW6_LANCZOS_SINC: {
-        sampler = sparkyuv::Lanczos3Sinc;
-      }
-        break;
-    }
-  }
-
-  void sample(const int y) override {
-    auto dst = reinterpret_cast<uint32_t *>(reinterpret_cast<uint8_t * >(this->mDestination) + y * this->dstStride);
-
-    for (int x = 0; x < this->outputWidth; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-
-      const int a = 3;
-
-      float rgb[4] = {0, 0, 0, 0};
-
-      float kx1 = ::floorf(srcX);
-      float ky1 = ::floorf(srcY);
-
-      float weightSum(0.0f);
-
-      for (int j = -a + 1; j <= a; j++) {
-        int yj = (int) ky1 + j;
-        float dy = float(srcY) - (float(ky1) + (float) j);
-        float yWeight = sampler(dy);
-        for (int i = -a + 1; i <= a; i++) {
-          int xi = (int) kx1 + i;
-          float dx = float(srcX) - (float(kx1) + (float) i);
-          float weight = sampler(dx) * yWeight;
-          weightSum += weight;
-
-          auto row = reinterpret_cast<const uint32_t *>(reinterpret_cast<const uint8_t *>(this->mSource)
-              + clamp(yj, 0, this->inputHeight - 1) * this->srcStride);
-
-          const int px = std::clamp(xi, 0, this->inputWidth - 1);
-
-          uint32_t color = row[px];
-
-          float r = 0, g = 0, b = 0, aAlpha = 0;
-          parseToFloat(color, r, g, b, aAlpha);
-
-          rgb[0] += r * weight;
-          rgb[1] += g * weight;
-          rgb[2] += b * weight;
-          rgb[3] += aAlpha * weight;
-        }
-      }
-
-      const int px = x;
-
-      if (weightSum == 0.f) {
-        dst[px] = 0;
-      } else {
-        float revertScale = 1.f / weightSum * maxColors;
-        auto R10 = static_cast<uint32_t >(std::clamp(::roundf(rgb[0] * revertScale), 0.0f, (float) maxColors));
-        auto G10 = static_cast<uint32_t >(std::clamp(::roundf(rgb[1] * revertScale), 0.0f, (float) maxColors));
-        auto B10 = static_cast<uint32_t >(std::clamp(::roundf(rgb[2] * revertScale), 0.0f, (float) maxColors));
-        auto A10 = static_cast<uint32_t >(std::clamp(::roundf(rgb[3] / weightSum * 3.f), 0.0f, 3.0f));
-
-        dst[x] = (A10 << 30) | (B10 << 20) | (G10 << 10) | R10;
-      }
-    }
-  }
-
-  ~WeightedWindow6RowSampler10Bit() override = default;
-
- private:
-  const float maxColors = ::powf(2.0f, (float) 10.f) - 1.0f;
-  ScaleWeightSampler sampler;
-
-  inline void parseToFloat(const uint32_t rgba1010102, float &r, float &g, float &b, float &a) {
-    const uint32_t scalarMask = (1u << 10u) - 1u;
-    uint32_t b1 = (rgba1010102) & scalarMask;
-    uint32_t g1 = (rgba1010102 >> 10) & scalarMask;
-    uint32_t r1 = (rgba1010102 >> 20) & scalarMask;
-    uint32_t a1 = (rgba1010102 >> 30) * 3;
-    constexpr float colorScale = 1.f / 1023.f;
-    constexpr float alphaScale = 1.f / 3.f;
-    float rFloat = static_cast<float>(r1) * colorScale;
-    float gFloat = static_cast<float>(g1) * colorScale;
-    float bFloat = static_cast<float>(b1) * colorScale;
-    float aFloat = static_cast<float>(a1) * alphaScale;
-
-    r = rFloat;
-    g = gFloat;
-    b = bFloat;
-    a = aFloat;
-  }
-};
-
-template<WeightedRow6Operation op, int Components>
-class WeightedWindow6RowSampler : public ScaleRowSampler<uint8_t> {
- public:
-  WeightedWindow6RowSampler(const uint8_t *mSource, const int srcStride,
-                            const int inputWidth, const int inputHeight,
-                            uint8_t *mDestination, const int dstStride,
-                            const int outputWidth, const int outputHeight) :
-      ScaleRowSampler<uint8_t>(mSource,
-                               srcStride,
-                               inputWidth, inputHeight,
-                               mDestination, dstStride,
-                               outputWidth, outputHeight) {
-    switch (op) {
-      case WEIGHTED_ROW6_LANCZOS_SINC: {
-        sampler = sparkyuv::Lanczos3Sinc;
-#if WEIGHTED_WINDOW6_HWY
-        samplerHWY = Lanczos3Sinc;
-#endif
-      }
-        break;
-    }
-  }
-
-  void sample(const int y) override {
-    auto dst = reinterpret_cast<uint8_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + y * this->dstStride);
-
-    const int components = Components;
-
-#if WEIGHTED_WINDOW6_HWY
-    const FixedTag<float32_t, 4> dfx4;
-    const FixedTag<int32_t, 4> dix4;
-    const FixedTag<uint32_t, 4> dux4;
-    const FixedTag<uint8_t, 4> du8x4;
-    using VI4 = Vec<decltype(dix4)>;
-    using VF4 = Vec<decltype(dfx4)>;
-    using VU8x4 = Vec<decltype(du8x4)>;
-
-    const VF4 vfZeros = Zero(dfx4);
-    const VF4 maxColorsV = Set(dfx4, maxColors);
-#endif
-
-    uint32_t x = 0;
-
-#if WEIGHTED_WINDOW6_HWY
-#if !NOACCELERATED_SAMPLER
-    for (; x + 8 < this->outputWidth && components == 4; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-
-      // only kernel with size 3 is supported
-      constexpr int kernelSize = 3;
-
-      float kx1 = ::floorf(srcX);
-      float ky1 = ::floorf(srcY);
-
-      float kWeightSum = 0;
-      VF4 color = Set(dfx4, 0);
-
-      const int a = kernelSize;
-      const int mMaxWidth = this->inputWidth - 1;
-
-      const int appendixLow[4] = {-2, -1, 0, 1};
-      const int appendixHigh[4] = {2, 3, 0, 0};
-
-      VF4 srcXV = Set(dfx4, srcX);
-      VI4 kx1V = Set(dix4, kx1);
-      const VI4 appendixLowV = LoadU(dix4, appendixLow);
-      const VI4 appendixHighV = LoadU(dix4, appendixHigh);
-
-      for (int j = -a + 1; j <= a; j++) {
-        int yj = (int) ky1 + j;
-        float dy = float(srcY) - (float(ky1) + (float) j);
-        float yWeight = sampler(dy);
-        auto row = reinterpret_cast<const uint8_t *>(this->mSource
-            + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride);
-        VF4 yWeightV = Set(dfx4, yWeight);
-        VI4 xi = Add(kx1V, appendixLowV);
-        VF4 dx = Sub(srcXV, ConvertTo(dfx4, xi));
-        VF4 sampleParameter = dx;
-        VF4 weights = Mul(samplerHWY(dfx4, sampleParameter), yWeightV);
-        kWeightSum += ExtractLane(SumOfLanes(dfx4, weights), 0);
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-        for (int i = 0; i < 4; ++i) {
-          int sizeXPos = std::clamp(ExtractLane(xi, i), 0, mMaxWidth) * components;
-          VU8x4 u81 = LoadU(du8x4, reinterpret_cast<const uint8_t *>(&row[sizeXPos]));
-          VF4 fr1 = ConvertTo(dfx4, PromoteTo(dix4, u81));
-          fr1 = Mul(fr1, Set(dfx4, ExtractLane(weights, i)));
-          color = Add(color, fr1);
-        }
-
-        xi = Add(kx1V, appendixHighV);
-        dx = Sub(srcXV, ConvertTo(dfx4, xi));
-        sampleParameter = dx;
-        weights = Mul(samplerHWY(dfx4, sampleParameter), yWeightV);
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-        for (int i = 0; i < 2; ++i) {
-          int sizeXPos = std::clamp(ExtractLane(xi, i), 0, mMaxWidth) * components;
-          VU8x4 u81 = LoadU(du8x4, reinterpret_cast<const uint8_t *>(&row[sizeXPos]));
-          VF4 fr1 = ConvertTo(dfx4, PromoteTo(dix4, u81));
-          float weight = ExtractLane(weights, i);
-          kWeightSum += weight;
-          fr1 = Mul(fr1, Set(dfx4, weight));
-          color = Add(color, fr1);
-        }
-      }
-
-      if (kWeightSum == 0) {
-        color = ClampRound(dfx4, color, vfZeros, maxColorsV);
-        VU8x4 u8Color = DemoteTo(du8x4, ConvertTo(dux4, color));
-        StoreU(u8Color, du8x4, reinterpret_cast<uint8_t *>(&dst[x * components]));
-      } else {
-        color = ClampRound(dfx4, Div(color, Set(dfx4, kWeightSum)), vfZeros,
-                           maxColorsV);
-        VU8x4 u8Color = DemoteTo(du8x4, ConvertTo(dux4, color));
-        StoreU(u8Color, du8x4, reinterpret_cast<uint8_t *>(&dst[x * components]));
-      }
-    }
-#endif
-#endif
-
-    for (; x < this->outputWidth; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-
-      const int a = 3;
-      float rgb[components];
-      std::fill(rgb, rgb + components, 0.0f);
-
-      float kx1 = ::floorf(srcX);
-      float ky1 = ::floorf(srcY);
-
-      float weightSum(0.0f);
-
-      for (int j = -a + 1; j <= a; j++) {
-        int yj = (int) ky1 + j;
-        float dy = float(srcY) - (float(ky1) + (float) j);
-        float yWeight = sampler(dy);
-        for (int i = -a + 1; i <= a; i++) {
-          int xi = (int) kx1 + i;
-          float dx = float(srcX) - (float(kx1) + (float) i);
-          float weight = sampler(dx) * yWeight;
-          weightSum += weight;
-
-          auto row = reinterpret_cast<const uint8_t *>(this->mSource
-              + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride);
-
-          const int px = std::clamp(xi, 0, this->inputWidth - 1) * components;
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-          for (int c = 0; c < components; ++c) {
-            auto clrf = static_cast<float>(row[px + c]);
-            float clr = clrf * weight;
-            rgb[c] += clr;
-          }
-        }
-      }
-
-      const int px = x * components;
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-      for (int c = 0; c < components; ++c) {
-        dst[px + c] = static_cast<uint8_t>(std::clamp(::roundf(rgb[c] * weightSum), 0.0f, maxColors));
-      }
-    }
-  }
-
-  ~WeightedWindow6RowSampler() override = default;
-
- private:
-  typedef Vec<FixedTag<float32_t, 4>> (*ScaleWeightSamplerHWY)(FixedTag<float32_t, 4>, Vec<FixedTag<float32_t, 4>>);
-
-  const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f;
-  ScaleWeightSampler sampler;
-  ScaleWeightSamplerHWY samplerHWY;
-};
-
-template<WeightedRow6Operation op, int Components>
-class WeightedWindow6RowSampler16Bit : public ScaleRowSampler<uint16_t> {
- public:
-  WeightedWindow6RowSampler16Bit(const uint16_t *mSource, const int srcStride,
-                                 const int inputWidth, const int inputHeight,
-                                 uint16_t *mDestination, const int dstStride,
-                                 const int outputWidth, const int outputHeight,
-                                 const int depth) :
-      ScaleRowSampler<uint16_t>(mSource,
-                                srcStride,
-                                inputWidth, inputHeight,
-                                mDestination, dstStride,
-                                outputWidth, outputHeight),
-      maxColors(::powf(2.0f, static_cast<float>(depth)) - 1.0f) {
-    switch (op) {
-      case WEIGHTED_ROW6_LANCZOS_SINC: {
-        sampler = sparkyuv::Lanczos3Sinc;
-      }
-        break;
-    }
-  }
-
-  void sample(const int y) override {
-    auto dst = reinterpret_cast<uint16_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + y * this->dstStride);
-
-    const int components = Components;
-
-    for (int x = 0; x < this->outputWidth; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-
-      const int a = 3;
-      float rgb[components];
-      std::fill(rgb, rgb + components, 0.0f);
-
-      float kx1 = ::floorf(srcX);
-      float ky1 = ::floorf(srcY);
-
-      float weightSum(0.0f);
-
-      for (int j = -a + 1; j <= a; j++) {
-        int yj = (int) ky1 + j;
-        float dy = float(srcY) - (float(ky1) + (float) j);
-        float yWeight = sampler(dy);
-        for (int i = -a + 1; i <= a; i++) {
-          int xi = (int) kx1 + i;
-          float dx = float(srcX) - (float(kx1) + (float) i);
-          float weight = sampler(dx) * yWeight;
-          weightSum += weight;
-
-          auto row = reinterpret_cast<const uint16_t *>(reinterpret_cast<const uint8_t *>(this->mSource)
-              + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride);
-
-          const int px = std::clamp(xi, 0, this->inputWidth - 1) * components;
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-          for (int c = 0; c < components; ++c) {
-            auto clrf = static_cast<float>(row[px + c]);
-            float clr = clrf * weight;
-            rgb[c] += clr;
-          }
-        }
-      }
-
-      const int px = x * components;
-      const float invWeightScale = weightSum != 0.f ? 1.f / weightSum : 0.f;
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-      for (int c = 0; c < components; ++c) {
-        dst[px + c] = static_cast<uint8_t>(std::clamp(::roundf(rgb[c] * invWeightScale), 0.0f, maxColors));
-      }
-    }
-  }
-
-  ~WeightedWindow6RowSampler16Bit() override = default;
-
- private:
-  const float maxColors;
-  ScaleWeightSampler sampler;
-};
-
-template<WeightedRow6Operation op, int Components>
-class WeightedWindow6RowSamplerF16Bit : public ScaleRowSampler<uint16_t> {
- public:
-  WeightedWindow6RowSamplerF16Bit(const uint16_t *mSource,
-                                  const int srcStride,
-                                  const int inputWidth,
-                                  const int inputHeight,
-                                  uint16_t *mDestination,
-                                  const int dstStride,
-                                  const int outputWidth,
-                                  const int outputHeight) :
-      ScaleRowSampler<uint16_t>(mSource, srcStride,
-                                inputWidth, inputHeight,
-                                mDestination, dstStride,
-                                outputWidth, outputHeight) {
-    switch (op) {
-      case WEIGHTED_ROW6_LANCZOS_SINC: {
-        sampler = sparkyuv::Lanczos3Sinc;
-#if WEIGHTED_WINDOW6_HWY
-        samplerHWY = Lanczos3Sinc;
-#endif
-      }
-        break;
-    }
-  }
-
-  void sample(const int y) override {
-#if WEIGHTED_WINDOW6_HWY
-    const FixedTag<hwy::float32_t, 4> dfx4;
-    const FixedTag<int32_t, 4> dix4;
-    const FixedTag<hwy::float16_t, 4> df16x4;
-    using VI4 = Vec<decltype(dix4)>;
-    using VF4 = Vec<decltype(dfx4)>;
-    using VF16x4 = Vec<decltype(df16x4)>;
-    const int mMaxWidth = this->inputWidth - 1;
-#endif
-
-    const auto src8 = reinterpret_cast<const uint8_t *>(this->mSource);
-    auto dst16 = reinterpret_cast<uint16_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + y * this->dstStride);
-
-    const int components = Components;
-
-    uint32_t x = 0;
-
-#if WEIGHTED_WINDOW6_HWY
-#if !NOACCELERATED_SAMPLER
-    for (; x + 8 < this->outputWidth && components == 4; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-
-      const int a = 3;
-      float rgb[components];
-      std::fill(rgb, rgb + components, 0.0f);
-
-      float kx1 = ::floorf(srcX);
-      float ky1 = ::floorf(srcY);
-
-      float kWeightSum = 0;
-      VF4 color = Set(dfx4, 0);
-
-      const int appendixLow[4] = {-2, -1, 0, 1};
-      const int appendixHigh[4] = {2, 3, 0, 0};
-
-      const VF4 aVector = Set(dfx4, a);
-      VF4 srcXV = Set(dfx4, srcX);
-      VI4 kx1V = Set(dix4, kx1);
-      const VI4 appendixLowV = LoadU(dix4, appendixLow);
-      const VI4 appendixHighV = LoadU(dix4, appendixHigh);
-
-      for (int j = -a + 1; j <= a; j++) {
-        int yj = (int) ky1 + j;
-        float dy = float(srcY) - (float(ky1) + (float) j);
-        float yWeight = sampler(dy);
-        auto row =
-            reinterpret_cast<const uint16_t *>(src8 + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride);
-        VF4 yWeightV = Set(dfx4, yWeight);
-        VI4 xi = Add(kx1V, appendixLowV);
-        VF4 dx = Sub(srcXV, ConvertTo(dfx4, xi));
-        VF4 sampleParameter = dx;
-        VF4 weights = Mul(samplerHWY(dfx4, sampleParameter), yWeightV);
-        kWeightSum += ExtractLane(SumOfLanes(dfx4, weights), 0);
-        for (int i = 0; i < 4; ++i) {
-          int sizeXPos = clamp(ExtractLane(xi, i), 0, mMaxWidth) * components;
-          VF16x4 r1 = LoadU(df16x4, reinterpret_cast<const hwy::float16_t *>(&row[sizeXPos]));
-          VF4 fr1 = PromoteTo(dfx4, r1);
-          fr1 = Mul(fr1, Set(dfx4, ExtractLane(weights, i)));
-          color = Add(color, fr1);
-        }
-
-        xi = Add(kx1V, appendixHighV);
-        dx = Sub(srcXV, ConvertTo(dfx4, xi));
-        sampleParameter = dx;
-        weights = Mul(samplerHWY(dfx4, sampleParameter), yWeightV);
-        for (int i = 0; i < 2; ++i) {
-          int sizeXPos = clamp(ExtractLane(xi, i), 0, mMaxWidth) * components;
-          VF16x4 r1 = LoadU(df16x4,
-                            reinterpret_cast<const hwy::float16_t *>(&row[sizeXPos]));
-          VF4 fr1 = PromoteTo(dfx4, r1);
-          float weight = ExtractLane(weights, i);
-          kWeightSum += weight;
-          fr1 = Mul(fr1, Set(dfx4, weight));
-          color = Add(color, fr1);
-        }
-      }
-
-      if (kWeightSum == 0) {
-        VF16x4 f16Color = DemoteTo(df16x4, color);
-        StoreU(f16Color, df16x4, reinterpret_cast<hwy::float16_t *>(&dst16[x * components]));
-      } else {
-        VF16x4 f16Color = DemoteTo(df16x4, Div(color, Set(dfx4, kWeightSum)));
-        StoreU(f16Color, df16x4, reinterpret_cast<hwy::float16_t *>(&dst16[x * components]));
-      }
-    }
-#endif
-#endif
-
-    for (; x < this->outputWidth; ++x) {
-      float srcX = (float) x * this->xScale;
-      float srcY = (float) y * this->yScale;
-
-      const int a = 3;
-      float rgb[components];
-      std::fill(rgb, rgb + components, 0.0f);
-
-      float kx1 = ::floorf(srcX);
-      float ky1 = ::floorf(srcY);
-
-      float weightSum(0.0f);
-
-      for (int j = -a + 1; j <= a; j++) {
-        int yj = (int) ky1 + j;
-        float dy = float(srcY) - (float(ky1) + (float) j);
-        float yWeight = sampler(dy);
-        for (int i = -a + 1; i <= a; i++) {
-          int xi = (int) kx1 + i;
-          float dx = float(srcX) - (float(kx1) + (float) i);
-          float weight = sampler(dx) * yWeight;
-          weightSum += weight;
-
-          auto row =
-              reinterpret_cast<const uint16_t *>(src8 + std::clamp(yj, 0, this->inputHeight - 1) * this->srcStride);
-
-          const int px = std::clamp(xi, 0, this->inputWidth - 1) * components;
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-          for (int c = 0; c < components; ++c) {
-            float clrf = hwy::F32FromF16(hwy::float16_t::FromBits(row[px + c]));
-            float clr = (float) clrf * weight;
-            rgb[c] += clr;
-          }
-        }
-      }
-
-      const int px = x * components;
-      const float invWeightScale = weightSum != 0.f ? 1.f / weightSum : 0.f;
-
-#if defined(__clang__)
-#pragma clang loop unroll(full)
-#endif
-      for (int c = 0; c < components; ++c) {
-        dst16[px + c] = hwy::F16FromF32(rgb[c] * invWeightScale).bits;
-      }
-    }
-  }
-
-  ~WeightedWindow6RowSamplerF16Bit() override = default;
-
- private:
-  const float maxColors = ::powf(2.0f, (float) 8.f) - 1.0f;
-  ScaleWeightSampler sampler;
-
-#if WEIGHTED_WINDOW6_HWY
-  typedef Vec<FixedTag<float32_t, 4>> (*ScaleWeightSamplerHWY)(FixedTag<float32_t, 4>, Vec<FixedTag<float32_t, 4>>);
-  ScaleWeightSamplerHWY samplerHWY;
-#endif
-};
-}
-HWY_AFTER_NAMESPACE();
-
-#undef WEIGHTED_WINDOW6_HWY
-
-#endif
diff --git a/src/sampler/sampler-inl.h b/src/sampler/sampler-inl.h
deleted file mode 100644
index c0ab03f..0000000
--- a/src/sampler/sampler-inl.h
+++ /dev/null
@@ -1,406 +0,0 @@
-// Copyright (C) 2024 Radzivon Bartoshyk
-//
-// This file belongs to sparkyuv project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(SPARKYUV_SAMPLER_INL_H) == defined(HWY_TARGET_TOGGLE)
-#ifdef SPARKYUV_SAMPLER_INL_H
-#undef SPARKYUV_SAMPLER_INL_H
-#else
-#define SPARKYUV_SAMPLER_INL_H
-#endif
-
-#include "hwy/highway.h"
-#include "src/math/math-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace sparkyuv::HWY_NAMESPACE {
-using hwy::HWY_NAMESPACE::Set;
-using hwy::HWY_NAMESPACE::FixedTag;
-using hwy::HWY_NAMESPACE::Vec;
-using hwy::HWY_NAMESPACE::Abs;
-using hwy::HWY_NAMESPACE::Mul;
-using hwy::HWY_NAMESPACE::Div;
-using hwy::HWY_NAMESPACE::Max;
-using hwy::HWY_NAMESPACE::Min;
-using hwy::HWY_NAMESPACE::Add;
-using hwy::HWY_NAMESPACE::Zero;
-using hwy::HWY_NAMESPACE::BitCast;
-using hwy::HWY_NAMESPACE::ConvertTo;
-using hwy::HWY_NAMESPACE::PromoteTo;
-using hwy::HWY_NAMESPACE::DemoteTo;
-using hwy::HWY_NAMESPACE::Combine;
-using hwy::HWY_NAMESPACE::Rebind;
-using hwy::HWY_NAMESPACE::Sub;
-using hwy::HWY_NAMESPACE::LowerHalf;
-using hwy::HWY_NAMESPACE::UpperHalf;
-using hwy::HWY_NAMESPACE::LoadInterleaved4;
-using hwy::HWY_NAMESPACE::StoreInterleaved4;
-using hwy::HWY_NAMESPACE::IfThenZeroElse;
-using hwy::float16_t;
-using hwy::float32_t;
-
-using hwy::HWY_NAMESPACE::NegMulAdd;
-using hwy::HWY_NAMESPACE::MulAdd;
-using hwy::HWY_NAMESPACE::IfThenElse;
-using hwy::HWY_NAMESPACE::MulSub;
-using hwy::HWY_NAMESPACE::ApproximateReciprocal;
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T
-BCSplinePartOne(const D df, T x, const T B, const T C, const T tripled, const T doubled) {
-  x = Abs(x);
-  T mult = Set(df, 1.0f / 6.0f);
-  T r1 = NegMulAdd(Set(df, 9), B, NegMulAdd(Set(df, 6.0), C, Set(df, 12)));
-  T r2 = MulAdd(Set(df, 6), C, MulSub(Set(df, 12.0f), B, Set(df, 18.0f)));
-  T r3 = NegMulAdd(Set(df, 2), B, Set(df, 6));
-  return Mul(MulAdd(r1, tripled, MulAdd(r2, doubled, r3)), mult);
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T
-BCSplinePartTwo(const D df, T x, const T B, const T C, const T tripled, const T doubled) {
-  x = Abs(x);
-  T mult = Set(df, 1.0f / 6.0f);
-  T r1 = MulSub(Set(df, -6.0f), C, B);
-  T r2 = MulAdd(Set(df, 6.0), B, Mul(Set(df, 30), C));
-  T r3 = MulSub(Set(df, -12), B, Mul(Set(df, 48), C));
-  T r4 = MulAdd(Set(df, 8.0), B, Mul(Set(df, 24.0f), C));
-  T rr = MulAdd(r1, tripled, MulAdd(r2, doubled, MulAdd(r3, x, r4)));
-  return Mul(rr, mult);
-}
-
-template<class D, typename V = Vec<D>>
-HWY_MATH_INLINE V BCSpline(const D df, V x, const V B, const V C) {
-  x = Abs(x);
-  const V zeros = Zero(df);
-  const V ones = Set(df, 1.0);
-  const V two = Set(df, 2.0);
-  const V doubled = Mul(x, x);
-  const V tripled = Mul(doubled, x);
-  auto setZeroMask = x > two;
-  auto setP1Mask = x < ones;
-  auto setP2Mask = x >= ones;
-  V res = Zero(df);
-  const V p1 = BCSplinePartOne(df, x, B, C, tripled, doubled);
-  const V p2 = BCSplinePartTwo(df, x, B, C, tripled, doubled);
-  res = IfThenElse(setP1Mask, p1, zeros);
-  res = IfThenElse(setP2Mask, p2, res);
-  res = IfThenElse(setZeroMask, zeros, res);
-  return res;
-}
-
-using hwy::HWY_NAMESPACE::InsertLane;
-using hwy::HWY_NAMESPACE::ExtractLane;
-using hwy::HWY_NAMESPACE::LoadU;
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T MitchellNetravaliV(const D df, T d) {
-  const T C = Set(df, 1.0 / 3.0);
-  const T B = Set(df, 1.0 / 3.0);
-  return BCSpline<D, T>(df, d, B, C);
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T CubicHermiteV(const D df, T d) {
-  const T C = Set(df, 0.0);
-  const T B = Set(df, 0.0);
-  return BCSpline<D, T>(df, d, B, C);
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T CubicBSplineV(const D df, T d) {
-  const T C = Set(df, 0.0);
-  const T B = Set(df, 1.0);
-  return BCSpline<D, T>(df, d, B, C);
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T BiCubicSplineV(const D df, T x) {
-  const hwy::HWY_NAMESPACE::TFromD<D> a = -0.5;
-  const T aVec = Set(df, a);
-  const T ones = Set(df, 1.0);
-  const T two = Set(df, 2.0);
-  const T three = Set(df, 3.0);
-  const T four = Set(df, 4.0);
-  const T five = Set(df, 5.0);
-  const T eight = Set(df, 8.0);
-  const T zeros = Zero(df);
-  x = Abs(x);
-  const auto zeroMask = x >= two;
-  const auto partOneMask = x < ones;
-  const T doubled = Mul(x, x);
-  const T triplet = Mul(doubled, x);
-
-  const T partOne = MulAdd(Add(two, aVec), triplet, NegMulAdd(Add(aVec, three), doubled, ones));
-  const T fourA = Mul(four, aVec);
-  const T eightA = Mul(eight, aVec);
-  const T fiveA = Mul(five, aVec);
-  const T partTwo = MulAdd(aVec, triplet,
-                           NegMulAdd(fiveA, doubled,
-                                     MulSub(eightA, x, fourA)));
-
-  x = IfThenElse(partOneMask, partOne, partTwo);
-  x = IfThenElse(zeroMask, zeros, x);
-
-  return x;
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T SimpleCubicV(const D df, T x) {
-  x = Abs(x);
-  const T zeros = Zero(df);
-  const T ones = Set(df, 1.0);
-  const T two = Set(df, 2.0);
-  const T doubled = Mul(x, x);
-  const T tripled = Mul(doubled, x);
-  auto setZeroMask = x > two;
-  auto setP1Mask = x < ones;
-  auto setP2Mask = x >= ones;
-  const T mSix = Set(df, 6.0f);
-  const T sixScale = ApproximateReciprocal(mSix);
-  T res = Zero(df);
-  const T p1 = Mul(MulAdd(MulSub(Set(df, 3), x, mSix), Mul(x, x), Set(df, 4.0f)), sixScale);
-  const T p2 = Mul(MulAdd(MulSub(Sub(mSix, x), x, Set(df, 12.0f)), x, Set(df, 8.0f)), sixScale);
-  res = IfThenElse(setP1Mask, p1, zeros);
-  res = IfThenElse(setP2Mask, p2, res);
-  res = IfThenElse(setZeroMask, zeros, res);
-  return res;
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T sincV(const D d, T x) {
-  const T ones = Set(d, 1);
-  const T zeros = Zero(d);
-  auto maskEqualToZero = x == zeros;
-  T sine = hwy::HWY_NAMESPACE::Sin(d, x);
-  x = IfThenElse(maskEqualToZero, ones, x);
-  T result = Div(sine, x);
-  result = IfThenElse(maskEqualToZero, ones, result);
-  return result;
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T LanczosWindowHWY(const D df, T x, const T a) {
-  auto mask = Abs(x) >= a;
-  T v = Mul(Set(df, M_PI), x);
-  T r = Mul(sincV(df, v), sincV(df, Div(v, a)));
-  return IfThenZeroElse(mask, r);
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T CatmullRomV(const D df, T d) {
-  const T C = Set(df, 0.0);
-  const T B = Set(df, 0.5);
-  return BCSpline<D, T>(df, d, B, C);
-}
-
-using hwy::HWY_NAMESPACE::Lerp;
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T Blerp(const D df, T c00, T c10, T c01, T c11, T tx, T ty) {
-  return Lerp(df, Lerp(df, c00, c10, tx), Lerp(df, c01, c11, tx), ty);
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T HannWindow(const D df, const T n, const float length) {
-  const float size = length * 2;
-  const T sizeV = Set(df, size);
-  const T lengthV = Set(df, length);
-  auto mask = Abs(n) > Set(df, length);
-  const T piMulSize = Set(df, M_PI / size);
-  T res = hwy::HWY_NAMESPACE::Cos(df, Mul(piMulSize, n));
-  res = Mul(Mul(res, res), ApproximateReciprocal(sizeV));
-  res = IfThenZeroElse(mask, res);
-  return res;
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T J1(const D df, T x) {
-  T p = Set(df, 0.270112271089232341485679099e+4);
-  T q = Set(df, 0.1e+1);
-
-  const auto dX = Mul(x, x);
-
-  p = MulAdd(p, dX, Set(df, -0.4695753530642995859767162166e+7));
-  q = MulAdd(q, dX, Set(df, 0.1606931573481487801970916749e+4));
-
-  p = MulAdd(p, dX, Set(df, 0.3413234182301700539091292655e+10));
-  q = MulAdd(q, dX, Set(df, 0.1501793594998585505921097578e+7));
-
-  p = MulAdd(p, dX, Set(df, -0.1322983480332126453125473247e+13));
-  q = MulAdd(q, dX, Set(df, 0.1013863514358673989967045588e+10));
-
-  p = MulAdd(p, dX, Set(df, 0.2908795263834775409737601689e+15));
-  q = MulAdd(q, dX, Set(df, 0.5243710262167649715406728642e+12));
-
-  p = MulAdd(p, dX, Set(df, -0.3588817569910106050743641413e+17));
-  q = MulAdd(q, dX, Set(df, 0.2081661221307607351240184229e+15));
-
-  p = MulAdd(p, dX, Set(df, 0.2316433580634002297931815435e+19));
-  q = MulAdd(q, dX, Set(df, 0.6092061398917521746105196863e+17));
-
-  p = MulAdd(p, dX, Set(df, -0.6672106568924916298020941484e+20));
-  q = MulAdd(q, dX, Set(df, 0.1185770712190320999837113348e+20));
-
-  p = MulAdd(p, dX, Set(df, 0.581199354001606143928050809e+21));
-  q = MulAdd(q, dX, Set(df, 0.11623987080032122878585294e+22));
-
-  const auto zeros = Zero(df);
-  const auto ones = Set(df, 1.0f);
-  q = IfThenElse(q == zeros, ones, q);
-
-  return Div(p, q);
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T Q1(const D df, T x) {
-  static const float
-      Pone[] = {
-      0.3511751914303552822533318e+3,
-      0.7210391804904475039280863e+3,
-      0.4259873011654442389886993e+3,
-      0.831898957673850827325226e+2,
-      0.45681716295512267064405e+1,
-      0.3532840052740123642735e-1
-  },
-      Qone[] = {
-      0.74917374171809127714519505e+4,
-      0.154141773392650970499848051e+5,
-      0.91522317015169922705904727e+4,
-      0.18111867005523513506724158e+4,
-      0.1038187585462133728776636e+3,
-      0.1e+1
-  };
-
-  T p = Set(df, Pone[5]);
-  T q = Set(df, Qone[5]);
-
-  const auto zeros = Zero(df);
-  const auto ones = Set(df, 1.0f);
-
-  const auto eights = Set(df, 8.0);
-
-  x = IfThenElse(x == zeros, ones, x);
-
-  const auto recX = Div(eights, x);
-
-  const auto dX = Mul(recX, recX);
-
-  for (int i = 4; i >= 0; i--) {
-    p = MulAdd(p, dX, Set(df, Pone[i]));
-    q = MulAdd(q, dX, Set(df, Pone[i]));
-  }
-
-  q = IfThenElse(q == zeros, ones, q);
-
-  auto res = Div(p, q);
-  res = IfThenElse(x == zeros, zeros, res);
-  return res;
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T P1(const D df, T x) {
-  static const float
-      Pone[] = {
-      0.352246649133679798341724373e+5,
-      0.62758845247161281269005675e+5,
-      0.313539631109159574238669888e+5,
-      0.49854832060594338434500455e+4,
-      0.2111529182853962382105718e+3,
-      0.12571716929145341558495e+1
-  },
-      Qone[] = {
-      0.352246649133679798068390431e+5,
-      0.626943469593560511888833731e+5,
-      0.312404063819041039923015703e+5,
-      0.4930396490181088979386097e+4,
-      0.2030775189134759322293574e+3,
-      0.1e+1
-  };
-
-  T p = Set(df, Pone[5]);
-  T q = Set(df, Qone[5]);
-
-  const auto zeros = Zero(df);
-  const auto ones = Set(df, 1.0f);
-
-  const auto eights = Set(df, 8.0);
-
-  x = IfThenElse(x == zeros, ones, x);
-
-  const auto recX = Div(eights, x);
-
-  const auto dX = Mul(recX, recX);
-
-  for (int i = 4; i >= 0; i--) {
-    p = MulAdd(p, dX, Set(df, Pone[i]));
-    q = MulAdd(q, dX, Set(df, Pone[i]));
-  }
-
-  q = IfThenElse(q == zeros, ones, q);
-
-  auto res = Div(p, q);
-  res = IfThenElse(x == zeros, zeros, res);
-  return res;
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T BesselOrderOne(const D df, T x) {
-  auto p = x;
-  x = Abs(x);
-  const auto minZValue = Set(df, 1e-8);
-  auto zerosMask = x < minZValue;
-  auto res = Mul(J1(df, x), p);
-  const auto zeros = Zero(df);
-  res = IfThenElse(zerosMask, zeros, res);
-  return res;
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T jinc(const D d, T x) {
-  const T ones = Set(d, 1);
-  const T zeros = Zero(d);
-  auto maskEqualToZero = x == zeros;
-  const auto minZValue = Set(d, 1e-8);
-  auto zerosMask = x < minZValue;
-  x = IfThenElse(maskEqualToZero, ones, x);
-  const T pi = Set(d, M_PI);
-  T result = Div(BesselOrderOne(d, Mul(pi, x)), x);
-  result = IfThenElse(maskEqualToZero, zeros, result);
-  result = IfThenElse(zerosMask, Set(d, 0.5 * M_PI), result);
-  return result;
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T LanczosJinc(const D df, T x, const T a) {
-  auto mask = Abs(x) >= a;
-  T v = Mul(Set(df, M_PI), x);
-  T r = Mul(jinc(df, v), jinc(df, Div(v, a)));
-  return IfThenZeroElse(mask, r);
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T Lanczos3Jinc(const D df, T x) {
-  return LanczosJinc(df, x, Set(df, 3.0f));
-}
-
-template<class D, typename T = Vec<D>>
-HWY_MATH_INLINE T Lanczos3Sinc(const D df, T x) {
-  return LanczosWindowHWY(df, x, Set(df, 3.0f));
-}
-}
-HWY_AFTER_NAMESPACE();
-
-#endif
\ No newline at end of file
diff --git a/src/sampler/sampler.h b/src/sampler/sampler.h
deleted file mode 100644
index ff5598e..0000000
--- a/src/sampler/sampler.h
+++ /dev/null
@@ -1,174 +0,0 @@
-// Copyright (C) 2024 Radzivon Bartoshyk
-//
-// This file belongs to sparkyuv project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARKYUV_SAMPLER_H_ONCE
-#define SPARKYUV_SAMPLER_H_ONCE
-
-#ifdef _MSC_VER
-#define _USE_MATH_DEFINES
-#include <cmath>
-#endif
-#include <cmath>
-
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-namespace sparkyuv {
-using namespace std;
-
-template<typename D, typename T>
-static inline D PromoteTo(T t, float maxColors) {
-  D result = static_cast<D>((float) t / maxColors);
-  return result;
-}
-
-template<typename D, typename T>
-static inline D DemoteTo(T t, float maxColors) {
-  return (D) clamp(((float) t * (float) maxColors), 0.0f, (float) maxColors);
-}
-
-template<typename T>
-static inline float BCSpline(T x, const T B, const T C) {
-  if (x < 0.0f) x = -x;
-
-  const T dp = x * x;
-  const T tp = dp * x;
-
-  if (x < 1.0f)
-    return ((12 - 9 * B - 6 * C) * tp + (-18 + 12 * B + 6 * C) * dp + (6 - 2 * B)) *
-        (T(1) / T(6));
-  else if (x < 2.0f)
-    return ((-B - 6 * C) * tp + (6 * B + 30 * C) * dp + (-12 * B - 48 * C) * x +
-        (8 * B + 24 * C)) * (T(1) / T(6));
-
-  return (0.0f);
-}
-
-template<typename T>
-static inline T SimpleCubic(T x) {
-  if (x < 0.0f) x = -x;
-
-  if (x < 1.0f)
-    return (4.0f + x * x * (3.0f * x - 6.0f)) / 6.0f;
-  else if (x < 2.0f)
-    return (8.0f + x * (-12.0f + x * (6.0f - x))) / 6.0f;
-
-  return (0.0f);
-}
-
-template<typename T>
-static inline T BiCubicSpline(T x) {
-  const T a = -0.5;
-  const T modulo = abs(x);
-  if (modulo >= 2) {
-    return 0;
-  }
-  const T floatd = modulo * modulo;
-  const T triplet = floatd * modulo;
-  if (modulo <= 1) {
-    return (a + T(2.0))*triplet - (a + T(3.0)) * floatd + T(1.0);
-  }
-  return a * triplet - T(5.0) * a * floatd + T(8.0) * a * modulo - T(4.0) * a;
-}
-
-template<typename T>
-static inline T CubicHermite(T x) {
-  constexpr T C = T(0.0);
-  constexpr T B = T(0.0);
-  return BCSpline(x, B, C);
-}
-
-template<typename T>
-static inline float BSpline(T x) {
-  constexpr T C = T(0.0);
-  constexpr T B = T(1.0);
-  return BCSpline(x, B, C);
-}
-
-template<typename T>
-static inline float MitchellNetravalli(T x) {
-  constexpr T B = 1.0f / 3.0f;
-  constexpr T C = 1.0f / 3.0f;
-  return BCSpline(x, B, C);
-}
-
-template<typename T>
-static inline T sinc(T x) {
-  if (x == 0.0) {
-    return T(1.0);
-  } else {
-    return sin(x) / x;
-  }
-}
-
-template<typename T>
-static inline T LanczosWindow(T x, const T a) {
-  if (abs(x) < a) {
-    return sinc(T(M_PI) * x) * sinc(T(M_PI) * x / a);
-  }
-  return T(0.0);
-}
-
-template<typename T>
-static inline T fastCos(T x) {
-  constexpr T C0 = 0.99940307;
-  constexpr T C1 = -0.49558072;
-  constexpr T C2 = 0.03679168;
-  constexpr T C3 = -0.00434102;
-
-  while (x < -2 * M_PI) {
-    x += 2.0 * M_PI;
-  }
-  while (x > 2 * M_PI) {
-    x -= 2.0 * M_PI;
-  }
-
-  // Calculate cos(x) using Chebyshev polynomial approximation
-  T x2 = x * x;
-  T result = C0 + x2 * (C1 + x2 * (C2 + x2 * C3));
-  return result;
-}
-
-template<typename T>
-static inline T CatmullRom(T x) {
-  return BCSpline(x, 0.0f, 0.5f);
-}
-
-template<typename T>
-static inline T HannWindow(const T n, const T length) {
-  const T size = length * 2;
-  const T part = M_PI / size;
-  if (abs(n) > length) {
-    return 0;
-  }
-  T r = cos(n * part);
-  r = r * r;
-  return r / size;
-}
-
-template<typename T>
-static inline T blerp(T c00, T c10, T c01, T c11, T tx, T ty) {
-  return lerp(lerp(c00, c10, tx), lerp(c01, c11, tx), ty);
-}
-
-template<typename T>
-static inline T Lanczos3Sinc(T x) {
-  return LanczosWindow(x, 3.0f);
-}
-
-}
-#endif  // SPARKYUV_SAMPLER_H_ONCE
\ No newline at end of file
diff --git a/tools/bench.h b/tools/bench.h
index 89adeb6..a1611b7 100644
--- a/tools/bench.h
+++ b/tools/bench.h
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <string>
+#include <functional>
 
 static void bench(int iterations, const char *color, const char *mark, const std::function<void()> &func) {
   double totalTime = 0;
diff --git a/tools/bench/YuvBenchmarkBase.cpp b/tools/bench/YuvBenchmarkBase.cpp
index 585efd6..6e835c4 100644
--- a/tools/bench/YuvBenchmarkBase.cpp
+++ b/tools/bench/YuvBenchmarkBase.cpp
@@ -267,4 +267,4 @@ void SparkyuvFastGuassianRGBA(benchmark::State &state) {
   for (auto _ : state) {
     sparkyuv::FastGaussianBlurRGBA(reinterpret_cast<uint8_t *>(rgbaData.data()), rgbaStride, inWidth, inHeight, 15);
   }
-}
\ No newline at end of file
+}
diff --git a/tools/main.cpp b/tools/main.cpp
index 88fe41b..0b30904 100644
--- a/tools/main.cpp
+++ b/tools/main.cpp
@@ -317,10 +317,6 @@ int main() {
     sparkyuv::TransposeClockwiseRGBA(rgbaData.data(), rgbaStride, transposed.data(), trnsStride, width, height);
   });
 
-  bench(1, ANSI_COLOR_YELLOW, "Fast Blur", [&]() {
-    sparkyuv::FastGaussianBlurRGBA(rgbaData.data(), rgbaStride, width, height, 25);
-  });
-
 //  std::vector<uint8_t> f16Store(width * 4 * sizeof(uint16_t) * height);
 //
 //  sparkyuv::RGBAToRGBAF16(rgbaData.data(),