From 3f3f3a0542e2dd5d823658d33bcf401cfe3899c2 Mon Sep 17 00:00:00 2001
From: Xue Zhang <quic_xuezha@quicinc.com>
Date: Sun, 8 Dec 2024 17:58:16 +0530
Subject: [PATCH 01/11] Prepare for second upstream

Add gaussianBlur, filter2D, sepFilter2D, sobel, sobel3x3u8 and warpPerspective2Plane to the fastcv module
---
 modules/fastcv/README.md                      |   3 +-
 modules/fastcv/include/opencv2/fastcv.hpp     |   5 +-
 .../fastcv/include/opencv2/fastcv/blur.hpp    |  55 +++
 .../fastcv/include/opencv2/fastcv/edges.hpp   |  42 ++
 .../fastcv/include/opencv2/fastcv/warp.hpp    |  32 ++
 modules/fastcv/perf/perf_blur.cpp             | 123 ++++++
 modules/fastcv/perf/perf_edges.cpp            |  68 ++++
 modules/fastcv/perf/perf_warp.cpp             |  62 +++
 modules/fastcv/src/blur.cpp                   | 368 ++++++++++++++++++
 modules/fastcv/src/edges.cpp                  | 121 ++++++
 modules/fastcv/src/warp.cpp                   |  71 ++++
 modules/fastcv/test/test_blur.cpp             | 129 ++++++
 modules/fastcv/test/test_edges.cpp            |  74 ++++
 modules/fastcv/test/test_warp.cpp             |  68 ++++
 14 files changed, 1218 insertions(+), 3 deletions(-)
 create mode 100644 modules/fastcv/include/opencv2/fastcv/blur.hpp
 create mode 100644 modules/fastcv/include/opencv2/fastcv/edges.hpp
 create mode 100644 modules/fastcv/include/opencv2/fastcv/warp.hpp
 create mode 100644 modules/fastcv/perf/perf_blur.cpp
 create mode 100644 modules/fastcv/perf/perf_edges.cpp
 create mode 100644 modules/fastcv/perf/perf_warp.cpp
 create mode 100644 modules/fastcv/src/blur.cpp
 create mode 100644 modules/fastcv/src/edges.cpp
 create mode 100644 modules/fastcv/src/warp.cpp
 create mode 100644 modules/fastcv/test/test_blur.cpp
 create mode 100644 modules/fastcv/test/test_edges.cpp
 create mode 100644 modules/fastcv/test/test_warp.cpp

diff --git a/modules/fastcv/README.md b/modules/fastcv/README.md
index 0c7323c086c..076a4108de0 100644
--- a/modules/fastcv/README.md
+++ b/modules/fastcv/README.md
@@ -3,5 +3,4 @@ FastCV extension for OpenCV
 
 This module provides wrappers for several FastCV functions not covered by the corresponding HAL in OpenCV or have implementation incompatible with OpenCV.
 Please note that:
-1. This module supports ARM architecture only. This means that CMake script aborts configuration under x86 platform even if you don't want to build binaries for your machine and just want to build docs or enable code analysis in your IDE. In that case you should fix CMakeLists.txt file as told inside it.
-2. Test data is stored in misc folder. Before running tests on a device you should copy the content of `misc/` folder to `$YOUR_TESTDATA_PATH/fastcv/` folder on a device.
+1. This module supports the ARM architecture only, so the CMake script will not configure or build it on x86 platforms.
\ No newline at end of file
diff --git a/modules/fastcv/include/opencv2/fastcv.hpp b/modules/fastcv/include/opencv2/fastcv.hpp
index fcf0bf132fb..a2129ffbb86 100644
--- a/modules/fastcv/include/opencv2/fastcv.hpp
+++ b/modules/fastcv/include/opencv2/fastcv.hpp
@@ -10,8 +10,10 @@
 
 #include "opencv2/fastcv/arithm.hpp"
 #include "opencv2/fastcv/bilateralFilter.hpp"
+#include "opencv2/fastcv/blur.hpp"
 #include "opencv2/fastcv/cluster.hpp"
 #include "opencv2/fastcv/draw.hpp"
+#include "opencv2/fastcv/edges.hpp"
 #include "opencv2/fastcv/fast10.hpp"
 #include "opencv2/fastcv/fft.hpp"
 #include "opencv2/fastcv/hough.hpp"
@@ -22,6 +24,7 @@
 #include "opencv2/fastcv/shift.hpp"
 #include "opencv2/fastcv/smooth.hpp"
 #include "opencv2/fastcv/thresh.hpp"
+#include "opencv2/fastcv/warp.hpp"
 
 /**
  * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions
@@ -29,4 +32,4 @@
  * @}
  */
 
-#endif // OPENCV_FASTCV_ARITHM_HPP
+#endif // OPENCV_FASTCV_HPP
diff --git a/modules/fastcv/include/opencv2/fastcv/blur.hpp b/modules/fastcv/include/opencv2/fastcv/blur.hpp
new file mode 100644
index 00000000000..424a11fa53b
--- /dev/null
+++ b/modules/fastcv/include/opencv2/fastcv/blur.hpp
@@ -0,0 +1,55 @@
+#ifndef OPENCV_FASTCV_BLUR_HPP
+#define OPENCV_FASTCV_BLUR_HPP
+
+#include <opencv2/core.hpp>
+
+namespace cv {
+namespace fastcv {
+
+/**
+ * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions
+ */
+
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Gaussian blur with sigma = 0 and a square kernel
+ * @param _src Input single-channel image: CV_8UC1 (kernel 3, 5 or 11), or CV_16SC1 / CV_32SC1 (kernel 5 only)
+ * @param _dst Output image, created with the same size and type as _src
+ * @param kernel_size Filter kernel size: 3, 5 or 11 for CV_8UC1 input, 5 for CV_16SC1 / CV_32SC1 input
+ * @param blur_border If true the border is blurred using zero padding, otherwise the border handling is undefined
+ *
+ * @sa GaussianBlur
+ */
+CV_EXPORTS_W void gaussianBlur(cv::InputArray _src, cv::OutputArray _dst, int kernel_size = 3, bool blur_border = true);
+
+/**
+ * @brief Filter an image with a non-separable kernel
+ * @param _src Input image with type CV_8UC1
+ * @param _dst Output image with type CV_8UC1, CV_16SC1 or CV_32FC1
+ * @param ddepth The depth of the output image: CV_8U, CV_16S or CV_32F
+ * @param _kernel Square, odd-sized filter kernel; CV_8S data for CV_8U / CV_16S output, CV_32F data for CV_32F output
+ *
+ * @sa cv::filter2D
+ */
+CV_EXPORTS_W void filter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernel);
+
+/**
+ * @brief Filter an image with a separable kernel. Overflow is handled differently from OpenCV: this function
+ * right-shifts the intermediate results and the final result.
+ * @param _src Input image with type CV_8UC1 (CV_16SC1 when ddepth is CV_16S)
+ * @param _dst Output image with type CV_8UC1 or CV_16SC1
+ * @param ddepth The depth of the output image: CV_8U or CV_16S
+ * @param _kernelX Filter kernel data in the X direction
+ * @param _kernelY Filter kernel data in the Y direction (for CV_16SC1, kernelX and kernelY must be the same)
+ *
+ * @sa cv::sepFilter2D
+ */
+CV_EXPORTS_W void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY);
+//! @}
+
+} // fastcv::
+} // cv::
+
+#endif // OPENCV_FASTCV_BLUR_HPP
diff --git a/modules/fastcv/include/opencv2/fastcv/edges.hpp b/modules/fastcv/include/opencv2/fastcv/edges.hpp
new file mode 100644
index 00000000000..c8d67e9741e
--- /dev/null
+++ b/modules/fastcv/include/opencv2/fastcv/edges.hpp
@@ -0,0 +1,42 @@
+#ifndef OPENCV_EDGES_HPP
+#define OPENCV_EDGES_HPP
+#include "opencv2/core/mat.hpp"
+
+namespace cv {
+namespace fastcv {
+/**
+ * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions
+ */
+
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Sobel filter that returns dx and dy separately
+ * @param _src          Input image with type CV_8UC1
+ * @param _dx           First-order derivative in the X direction, with type CV_16SC1.
+ * @param _dy           First-order derivative in the Y direction, with type CV_16SC1 (same size as _dx).
+ * @param kernel_size   Sobel kernel size; 3x3, 5x5 and 7x7 are supported
+ * @param borderType    Border type
+ * @param borderValue   Border value for constant border
+*/
+CV_EXPORTS_W void sobel(cv::InputArray _src, cv::OutputArray _dx, cv::OutputArray _dy, int kernel_size, int borderType,
+    int borderValue);
+
+/**
+ * @brief 3x3 Sobel filter without border
+ * @param _src          Input image with type CV_8UC1
+ * @param _dst          8-bit |dx|+|dy| if _dsty is not needed, otherwise the first-order derivative in the X direction
+ * @param _dsty         If needed, receives the first-order derivative in the Y direction
+ * @param ddepth        Depth of the output image(s); with _dsty: CV_8S (normalization required), CV_16S or CV_32F
+ * @param normalization Whether to normalize the result
+*/
+CV_EXPORTS_W void sobel3x3u8(cv::InputArray _src, cv::OutputArray _dst, cv::OutputArray _dsty = noArray(), int ddepth = CV_8U,
+    bool normalization = false);
+
+//! @}
+
+}
+}
+
+#endif
diff --git a/modules/fastcv/include/opencv2/fastcv/warp.hpp b/modules/fastcv/include/opencv2/fastcv/warp.hpp
new file mode 100644
index 00000000000..6704751c4cd
--- /dev/null
+++ b/modules/fastcv/include/opencv2/fastcv/warp.hpp
@@ -0,0 +1,32 @@
+#ifndef OPENCV_WARP_HPP
+#define OPENCV_WARP_HPP
+#include "opencv2/core/mat.hpp"
+#include <opencv2/imgproc.hpp>
+namespace cv {
+namespace fastcv {
+
+/**
+ * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions
+*/
+
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Perspective warp of two images using the same transformation. Bi-linear interpolation is used where applicable
+ * @param _src1     The first input image data, type CV_8UC1
+ * @param _src2     The second input image data, type CV_8UC1
+ * @param _dst1     The first output image data, type CV_8UC1
+ * @param _dst2     The second output image data, type CV_8UC1
+ * @param _M0       The 3x3 perspective transformation matrix (inverse map)
+ * @param dsize     The output image size
+*/
+CV_EXPORTS_W void warpPerspective2Plane(cv::InputArray _src1, cv::InputArray _src2, cv::OutputArray _dst1,
+    cv::OutputArray _dst2, InputArray _M0, Size dsize);
+
+//! @}
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/modules/fastcv/perf/perf_blur.cpp b/modules/fastcv/perf/perf_blur.cpp
new file mode 100644
index 00000000000..bca8f80974a
--- /dev/null
+++ b/modules/fastcv/perf/perf_blur.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "perf_precomp.hpp"
+
+namespace opencv_test {
+
+using GaussianBlurPerfTest = perf::TestBaseWithParam<tuple<Size, int, int, bool>>;
+
+// Times cv::fastcv::gaussianBlur for every combination of image size, depth, kernel size and border mode.
+PERF_TEST_P(GaussianBlurPerfTest, run,
+    ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size
+                       ::testing::Values(CV_8U, CV_16S, CV_32S),                    // image depth
+                       ::testing::Values(3, 5),                                     // kernel size
+                       ::testing::Values(true, false)                               // blur border
+                       )
+           )
+{
+    const cv::Size imageSize = get<0>(GetParam());
+    const int imageDepth = get<1>(GetParam());
+    const int kernelSize = get<2>(GetParam());
+    const bool blurBorder = get<3>(GetParam());
+
+    // Depths other than 8-bit are only run with the 5x5 kernel; every other combination is skipped.
+    const bool supported = (kernelSize == 5) || (imageDepth == CV_8U);
+    if (!supported)
+        throw ::perf::TestBase::PerfSkipTestException();
+
+    cv::Mat input(imageSize, imageDepth), output;
+    cvtest::randUni(cv::theRNG(), input, Scalar::all(0), Scalar::all(255));
+
+    while (next())
+    {
+        startTimer();
+        cv::fastcv::gaussianBlur(input, output, kernelSize, blurBorder);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+using Filter2DPerfTest = perf::TestBaseWithParam<tuple<Size, int, int>>;
+
+// Performance test for cv::fastcv::filter2D: runs every combination of image size, destination depth
+// and kernel size on a random CV_8U source image.
+PERF_TEST_P(Filter2DPerfTest, run,
+    ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size
+                       ::testing::Values(CV_8U, CV_16S, CV_32F),                    // dst image depth
+                       ::testing::Values(3, 5, 7, 9, 11)                            // kernel size
+                       )
+           )
+{
+    const cv::Size imageSize = get<0>(GetParam());
+    const int dstDepth = get<1>(GetParam());
+    const int kernelSize = get<2>(GetParam());
+
+    cv::Mat input(imageSize, CV_8U);
+    cv::Mat coeffs;
+    cv::Mat output;
+
+    // The kernel element type follows the destination depth:
+    // 8-bit signed coefficients for CV_8U / CV_16S output, float coefficients for CV_32F output.
+    if (dstDepth == CV_8U || dstDepth == CV_16S)
+    {
+        coeffs.create(kernelSize, kernelSize, CV_8S);
+    }
+    else if (dstDepth == CV_32F)
+    {
+        coeffs.create(kernelSize, kernelSize, CV_32F);
+    }
+
+    // Random source pixels and kernel coefficients. The source is filled twice (randu, then randUni);
+    // both fills are kept so the sequence drawn from the global RNG does not change.
+    cv::randu(input, 0, 256);
+    cv::randu(coeffs, INT8_MIN, INT8_MAX);
+    RNG& generator = cv::theRNG();
+    cvtest::randUni(generator, input, Scalar::all(0), Scalar::all(255));
+
+    // Only the filter call itself is timed.
+    while (next())
+    {
+        startTimer();
+        cv::fastcv::filter2D(input, output, dstDepth, coeffs);
+        stopTimer();
+    }
+
+    // The output is not compared against a reference: this test measures time only.
+    SANITY_CHECK_NOTHING();
+}
+
+using SepFilter2DPerfTest = perf::TestBaseWithParam<tuple<Size, int, int>>;
+
+// Performance test for cv::fastcv::sepFilter2D; the same random 1 x N kernel is used for both directions.
+PERF_TEST_P(SepFilter2DPerfTest, run,
+    ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size
+                       ::testing::Values(CV_8U, CV_16S),                            // dst image depth
+                       ::testing::Values(3, 5, 7, 9, 11, 13, 15, 17)                // kernel size
+                       )
+           )
+{
+    const cv::Size imageSize = get<0>(GetParam());
+    const int depth = get<1>(GetParam());
+    const int taps = get<2>(GetParam());
+
+    // Source and kernel both use the destination depth.
+    cv::Mat input(imageSize, depth), coeffs(1, taps, depth), output;
+    RNG& generator = cv::theRNG();
+    cvtest::randUni(generator, input, Scalar::all(0), Scalar::all(255));
+    cvtest::randUni(generator, coeffs, Scalar::all(INT8_MIN), Scalar::all(INT8_MAX));
+
+    while (next())
+    {
+        startTimer();
+        cv::fastcv::sepFilter2D(input, output, depth, coeffs, coeffs);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+} // namespace
\ No newline at end of file
diff --git a/modules/fastcv/perf/perf_edges.cpp b/modules/fastcv/perf/perf_edges.cpp
new file mode 100644
index 00000000000..74ffa552124
--- /dev/null
+++ b/modules/fastcv/perf/perf_edges.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "perf_precomp.hpp"
+
+namespace opencv_test {
+
+using SobelPerfTest = perf::TestBaseWithParam<tuple<Size, int, int, int>>;
+
+// Performance test for cv::fastcv::sobel on a random CV_8U image.
+PERF_TEST_P(SobelPerfTest, run,
+    ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size
+                       ::testing::Values(3, 5, 7),                                  // kernel size
+                       ::testing::Values(BORDER_CONSTANT, BORDER_REPLICATE),        // border type
+                       ::testing::Values(0)                                         // border value
+                       )
+           )
+{
+    const Size imageSize = get<0>(GetParam());
+    const int kernelSize = get<1>(GetParam());
+    const int borderType = get<2>(GetParam());
+    const int borderValue = get<3>(GetParam());
+
+    cv::Mat input(imageSize, CV_8U), gradX, gradY;
+    cvtest::randUni(cv::theRNG(), input, Scalar::all(0), Scalar::all(255));
+
+    while (next())
+    {
+        startTimer();
+        cv::fastcv::sobel(input, gradX, gradY, kernelSize, borderType, borderValue);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+using Sobel3x3u8PerfTest = perf::TestBaseWithParam<tuple<Size, int, int>>;
+
+PERF_TEST_P(Sobel3x3u8PerfTest, run,
+    ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size
+                       ::testing::Values(CV_8S, CV_16S, CV_32F),                    // image depth
+                       ::testing::Values(0, 1)                                      // normalization
+                       )
+           )
+{
+    const Size imageSize = get<0>(GetParam());
+    const int dstDepth = get<1>(GetParam());
+    const int normalize = get<2>(GetParam());
+
+    cv::Mat input(imageSize, CV_8U), gradX, gradY;
+    cvtest::randUni(cv::theRNG(), input, Scalar::all(0), Scalar::all(255));
+
+    // CV_8S output is only exercised with normalization enabled; skip the other combination.
+    if (dstDepth == CV_8S && normalize == 0)
+        throw ::perf::TestBase::PerfSkipTestException();
+
+    while (next())
+    {
+        startTimer();
+        cv::fastcv::sobel3x3u8(input, gradX, gradY, dstDepth, normalize);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+} //namespace
\ No newline at end of file
diff --git a/modules/fastcv/perf/perf_warp.cpp b/modules/fastcv/perf/perf_warp.cpp
new file mode 100644
index 00000000000..231056aef56
--- /dev/null
+++ b/modules/fastcv/perf/perf_warp.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "perf_precomp.hpp"
+
+namespace opencv_test {
+
+using WarpPerspective2PlanePerfTest = perf::TestBaseWithParam<Size>;
+
+// Performance test for cv::fastcv::warpPerspective2Plane: warps a grayscale test image into two planes
+// of the requested size with a randomly perturbed perspective transform.
+PERF_TEST_P(WarpPerspective2PlanePerfTest, run,
+    ::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p))
+{
+    const cv::Size dstSize = GetParam();
+
+    // Load the colour test image and convert it to a single 8-bit channel.
+    cv::Mat img = imread(cvtest::findDataFile("cv/shared/baboon.png"));
+    Mat src(img.rows, img.cols, CV_8UC1);
+    cvtColor(img, src, cv::COLOR_BGR2GRAY);
+
+    cv::Mat dst1(dstSize, CV_8UC1), dst2(dstSize, CV_8UC1);
+    cv::Mat mat(3, 3, CV_32FC1);
+
+    // Start from the corner-to-corner mapping between the source and destination rectangles.
+    const float srcRight = src.cols-1.f, srcBottom = src.rows-1.f;
+    const float dstRight = dst1.cols-1.f, dstBottom = dst1.rows-1.f;
+    Point2f s[4] = { Point2f(0,0), Point2f(srcRight,0), Point2f(srcRight,srcBottom), Point2f(0,srcBottom) };
+    Point2f d[4] = { Point2f(0,0), Point2f(dstRight,0), Point2f(dstRight,dstBottom), Point2f(0,dstBottom) };
+
+    // Draw 16 normally distributed offsets (four per corner) and use them to perturb the corners.
+    float buffer[16];
+    Mat tmp( 1, 16, CV_32FC1, buffer );
+    cv::theRNG().fill( tmp, cv::RNG::NORMAL, Scalar::all(0.), Scalar::all(0.1) );
+
+    for (int corner = 0; corner < 4; corner++)
+    {
+        const float* jitter = buffer + corner*4;
+        s[corner].x += jitter[0]*src.cols/2;
+        s[corner].y += jitter[1]*src.rows/2;
+        d[corner].x += jitter[2]*dst1.cols/2;
+        d[corner].y += jitter[3]*dst1.rows/2;
+    }
+
+    cv::getPerspectiveTransform( s, d ).convertTo( mat, mat.depth() );
+    // Invert the perspective matrix
+    invert(mat,mat);
+
+    // The same grayscale image feeds both input planes; only the warp call is timed.
+    while (next())
+    {
+        startTimer();
+        cv::fastcv::warpPerspective2Plane(src, src, dst1, dst2, mat, dstSize);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+} //namespace
\ No newline at end of file
diff --git a/modules/fastcv/src/blur.cpp b/modules/fastcv/src/blur.cpp
new file mode 100644
index 00000000000..f092da96e0a
--- /dev/null
+++ b/modules/fastcv/src/blur.cpp
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "precomp.hpp"
+
+namespace cv {
+namespace fastcv {
+
+// Parallel loop body that applies the FastCV Gaussian blur to one horizontal stripe of the image.
+// The FastCV routine is chosen from the (kernel size, depth) pair given at construction; the caller
+// is expected to pass only supported combinations.
+class FcvGaussianBlurLoop_Invoker : public cv::ParallelLoopBody
+{
+    public:
+    FcvGaussianBlurLoop_Invoker(const uchar* _src_data, size_t _src_step, uchar* _dst_data, size_t _dst_step, int _width,
+        int _height, int _ksize, int _depth, fcvBorderType _fcvBorder, int _fcvBorderValue) :
+        cv::ParallelLoopBody(), src_data(_src_data), src_step(_src_step), dst_data(_dst_data), dst_step(_dst_step), width(_width),
+        height(_height), ksize(_ksize), depth(_depth), fcvBorder(_fcvBorder), fcvBorderValue(_fcvBorderValue)
+    {
+        half_ksize = ksize/2;
+        fcvFuncType = FCV_MAKETYPE(ksize,depth);
+    }
+
+    // Blurs rows [range.start, range.end) and writes them to the destination image.
+    virtual void operator()(const cv::Range& range) const CV_OVERRIDE
+    {
+        // Extend the stripe by up to half a kernel of real image rows on each side so that rows next to a
+        // stripe seam see their true neighbours; only at the real image edges is the border mode applied.
+        const int topLines    = (range.start < half_ksize) ? range.start : half_ksize;
+        const int bottomLines = ((height - range.end) < half_ksize) ? (height - range.end) : half_ksize;
+        const int rangeHeight = (range.end - range.start) + topLines + bottomLines;
+
+        const uchar* src = src_data + (range.start-topLines)*src_step;
+        // Heap-backed scratch buffer: a variable-length stack array is not standard C++ and a large
+        // stripe could overflow the stack of a worker thread.
+        cv::AutoBuffer<uchar> dstBuf(dst_step*rangeHeight);
+        uchar* dst = dstBuf.data();
+
+        if (fcvFuncType == FCV_MAKETYPE(3,CV_8U))
+            fcvFilterGaussian3x3u8_v4(src, width, rangeHeight, src_step, dst, dst_step, fcvBorder, 0);
+        else if (fcvFuncType == FCV_MAKETYPE(5,CV_8U))
+            fcvFilterGaussian5x5u8_v3(src, width, rangeHeight, src_step, dst, dst_step, fcvBorder, 0);
+        else if (fcvFuncType == FCV_MAKETYPE(5,CV_16S))
+            fcvFilterGaussian5x5s16_v3((int16_t*)src, width, rangeHeight, src_step, (int16_t*)dst, dst_step, fcvBorder, 0);
+        else if (fcvFuncType == FCV_MAKETYPE(5,CV_32S))
+            fcvFilterGaussian5x5s32_v3((int32_t*)src, width, rangeHeight, src_step, (int32_t*)dst, dst_step, fcvBorder, 0);
+        else if (fcvFuncType == FCV_MAKETYPE(11,CV_8U))
+            fcvFilterGaussian11x11u8_v2(src, width, rangeHeight, src_step, dst, dst_step, fcvBorder);
+
+        // Copy only the rows owned by this stripe, skipping the halo rows at the top of the scratch buffer.
+        uchar* dptr = dst_data+range.start*dst_step;
+        const uchar* sptr = dst+topLines*dst_step;
+        memcpy(dptr,sptr, (range.end-range.start)*dst_step);
+    }
+
+    private:
+    const uchar*    src_data;
+    const size_t    src_step;
+    uchar*          dst_data;
+    const size_t    dst_step;
+    const int       width;
+    const int       height;
+    const int       ksize;
+    const int       depth;
+    int             half_ksize;
+    int             fcvFuncType;
+    fcvBorderType   fcvBorder;
+    int             fcvBorderValue;   // stored but not forwarded: the FastCV calls above receive a literal 0
+
+    // Non-copyable: declared private and left undefined.
+    FcvGaussianBlurLoop_Invoker(const FcvGaussianBlurLoop_Invoker &);  // = delete;
+    const FcvGaussianBlurLoop_Invoker& operator= (const FcvGaussianBlurLoop_Invoker &);  // = delete;
+};
+
+void gaussianBlur(cv::InputArray _src, cv::OutputArray _dst, int kernel_size, bool blur_border)
+{
+    INITIALIZATION_CHECK;
+
+    CV_Assert(!_src.empty() && CV_MAT_CN(_src.type()) == 1);
+
+    // The destination always mirrors the source geometry and type.
+    const int type = _src.type();
+    _dst.create(_src.size(), type);
+
+    Mat src = _src.getMat();
+    Mat dst = _dst.getMat();
+
+    // 8-bit input supports the 3x3, 5x5 and 11x11 kernels; 16- and 32-bit signed input only the 5x5 one.
+    const bool is8uKernel = (kernel_size == 3) || (kernel_size == 5) || (kernel_size == 11);
+    const bool supported  = ((type == CV_8UC1) && is8uKernel) ||
+                            (((type == CV_16SC1) || (type == CV_32SC1)) && (kernel_size == 5));
+    if (!supported)
+        CV_Error(cv::Error::StsBadArg, cv::format("Src type %d, kernel size %d is not supported", type, kernel_size));
+
+    // Roughly one stripe per 80 rows, and never fewer than one.
+    const int stripeCount = src.rows < 80 ? 1 : src.rows / 80;
+    const fcvBorderType borderMode = blur_border ? FASTCV_BORDER_ZERO_PADDING : FASTCV_BORDER_UNDEFINED;
+
+    FcvGaussianBlurLoop_Invoker body(src.data, src.step, dst.data, dst.step, src.cols, src.rows, kernel_size,
+        src.depth(), borderMode, 0);
+    cv::parallel_for_(cv::Range(0, src.rows), body, stripeCount);
+}
+
+// Parallel loop body that applies the FastCV NxN correlation filter to one horizontal stripe of an
+// 8-bit image. The FastCV routine is chosen from the destination depth given at construction; the
+// kernel pointer is reinterpreted as int8_t (CV_8U / CV_16S output) or float32_t (CV_32F output).
+class FcvFilter2DLoop_Invoker : public cv::ParallelLoopBody
+{
+    public:
+    FcvFilter2DLoop_Invoker(const uchar* _src_data, size_t _src_step, uchar* _dst_data, size_t _dst_step, const int _ddepth,
+        int _width, int _height, uchar* _kernel,int _ksize ) :
+        cv::ParallelLoopBody(), src_data(_src_data), src_step(_src_step), dst_data(_dst_data), dst_step(_dst_step),
+        ddepth(_ddepth), width(_width),height(_height), kernel(_kernel), ksize(_ksize)
+    {
+        half_ksize = ksize/2;
+    }
+
+    // Filters rows [range.start, range.end) and writes them to the destination image.
+    virtual void operator()(const cv::Range& range) const CV_OVERRIDE
+    {
+        // Extend the stripe by up to half a kernel of real image rows on each side so that rows next to a
+        // stripe seam are computed from their true neighbours instead of being treated as an image border.
+        const int topLines    = (range.start < half_ksize) ? range.start : half_ksize;
+        const int bottomLines = ((height - range.end) < half_ksize) ? (height - range.end) : half_ksize;
+        const int rangeHeight = (range.end - range.start) + topLines + bottomLines;
+
+        const uchar *src = src_data + (range.start - topLines) * src_step;
+        // Heap-backed scratch buffer: a variable-length stack array is not standard C++ and a large
+        // stripe could overflow the stack of a worker thread.
+        cv::AutoBuffer<uchar> dstBuf(dst_step*rangeHeight);
+        uchar* dst = dstBuf.data();
+
+        if (ddepth == CV_8U)
+            fcvFilterCorrNxNu8((int8_t*)kernel, ksize, 0, src, width, rangeHeight, src_step, dst, dst_step);
+        else if (ddepth == CV_16S)
+            fcvFilterCorrNxNu8s16((int8_t*)kernel, ksize, 0, src, width, rangeHeight, src_step, (int16_t*)dst, dst_step);
+        else if (ddepth == CV_32F)
+            fcvFilterCorrNxNu8f32((float32_t*)kernel, ksize, src, width, rangeHeight, src_step, (float32_t*)dst, dst_step);
+
+        // Copy only the rows owned by this stripe, skipping the halo rows at the top of the scratch buffer.
+        uchar* dptr = dst_data+range.start*dst_step;
+        const uchar* sptr = dst+topLines*dst_step;
+        memcpy(dptr, sptr, (range.end - range.start) * dst_step);
+    }
+
+    private:
+    const uchar*    src_data;
+    const size_t    src_step;
+    uchar*          dst_data;
+    const size_t    dst_step;
+    const int       ddepth;
+    const int       width;
+    const int       height;
+    uchar*          kernel;
+    const int       ksize;
+    int             half_ksize;
+
+    // Non-copyable: declared private and left undefined.
+    FcvFilter2DLoop_Invoker(const FcvFilter2DLoop_Invoker &);  // = delete;
+    const FcvFilter2DLoop_Invoker& operator= (const FcvFilter2DLoop_Invoker &);  // = delete;
+};
+
+// Filters a CV_8UC1 image with a square, odd-sized kernel through FastCV.
+// The destination is created with the source size and the requested depth, and the rows are
+// processed as parallel horizontal stripes.
+// The kernel must hold CV_8S data for CV_8U / CV_16S output and CV_32F data for CV_32F output;
+// any other output depth raises cv::Error::StsBadArg.
+void filter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernel)
+{
+    INITIALIZATION_CHECK;
+    CV_Assert(!_src.empty() && _src.type() == CV_8UC1);
+
+    // Only square kernels with an odd side length are accepted.
+    Mat kernel = _kernel.getMat();
+    Size ksize = kernel.size();
+    CV_Assert(ksize.width == ksize.height);
+    CV_Assert(ksize.width % 2 == 1);
+
+    _dst.create(_src.size(), ddepth);
+    Mat src = _src.getMat();
+    Mat dst = _dst.getMat();
+
+    // Validate the kernel element type against the requested output depth before any filtering starts.
+    if (ddepth == CV_8U || ddepth == CV_16S)
+    {
+        CV_Assert(CV_MAT_DEPTH(kernel.type()) == CV_8S);
+    }
+    else if (ddepth == CV_32F)
+    {
+        CV_Assert(CV_MAT_DEPTH(kernel.type()) == CV_32F);
+    }
+    else
+    {
+        CV_Error(cv::Error::StsBadArg, cv::format("Kernel Size:%d, Dst type:%s is not supported", ksize.width,
+            depthToString(ddepth)));
+    }
+
+    // Roughly one stripe per 80 rows, and never fewer than one.
+    const int stripeCount = src.rows < 80 ? 1 : src.rows / 80;
+
+    // Every supported depth shares the same invoker; it picks the FastCV routine from ddepth.
+    FcvFilter2DLoop_Invoker body(src.data, src.step, dst.data, dst.step, ddepth, src.cols, src.rows, kernel.data,
+        ksize.width);
+    cv::parallel_for_(cv::Range(0, src.rows), body, stripeCount);
+}
+
+// Parallel loop body that applies the FastCV separable correlation filter to one horizontal stripe.
+// CV_8U output uses the generic MxN routine; CV_16S output uses a size-specific routine where one exists.
+class FcvSepFilter2DLoop_Invoker : public cv::ParallelLoopBody
+{
+    public:
+    FcvSepFilter2DLoop_Invoker(const uchar* _src_data, size_t _src_step, uchar* _dst_data, size_t _dst_step, const int _ddepth,
+        int _width, int _height, uchar* _kernelX, int _kernelXSize, uchar* _kernelY,int _kernelYSize) :
+        cv::ParallelLoopBody(), src_data(_src_data), src_step(_src_step), dst_data(_dst_data), dst_step(_dst_step), ddepth(_ddepth),
+        width(_width), height(_height), kernelX(_kernelX), kernelXSize(_kernelXSize), kernelY(_kernelY), kernelYSize(_kernelYSize)
+    {
+        half_ksize = kernelYSize/2;
+    }
+
+    // Filters rows [range.start, range.end) and writes them to the destination image.
+    virtual void operator()(const cv::Range& range) const CV_OVERRIDE
+    {
+        // Extend the stripe by up to half a kernel of real image rows on each side so that rows next to a
+        // stripe seam are computed from their true neighbours instead of being treated as an image border.
+        const int topLines    = (range.start < half_ksize) ? range.start : half_ksize;
+        const int bottomLines = ((height - range.end) < half_ksize) ? (height - range.end) : half_ksize;
+        const int rangeHeight = (range.end - range.start) + topLines + bottomLines;
+
+        const uchar *src = src_data + (range.start - topLines) * src_step;
+        // Heap-backed scratch buffer: a variable-length stack array is not standard C++ and a large
+        // stripe could overflow the stack of a worker thread.
+        cv::AutoBuffer<uchar> dstBuf(dst_step*rangeHeight);
+        uchar* dst = dstBuf.data();
+
+        switch (ddepth)
+        {
+            case CV_8U:
+            {
+                fcvFilterCorrSepMxNu8((int8_t*)kernelX, kernelXSize, (int8_t*)kernelY, kernelYSize, 0, src, width, rangeHeight,
+                    src_step, dst, dst_step);
+                break;
+            }
+            case CV_16S:
+            {
+                // Intermediate image passed to the 16-bit FastCV routines; kept off the stack for the same reason.
+                cv::AutoBuffer<int16_t> tmpBuf(static_cast<size_t>(width)*(rangeHeight+kernelXSize-1));
+                int16_t* tmpImage = tmpBuf.data();
+                switch (kernelXSize)
+                {
+                    case 9:
+                    {
+                        fcvFilterCorrSep9x9s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
+                            tmpImage, (int16_t*)dst, dst_step);
+                        break;
+                    }
+                    case 11:
+                    {
+                        fcvFilterCorrSep11x11s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
+                            tmpImage, (int16_t*)dst, dst_step);
+                        break;
+                    }
+                    case 13:
+                    {
+                        fcvFilterCorrSep13x13s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
+                            tmpImage, (int16_t*)dst, dst_step);
+                        break;
+                    }
+                    case 15:
+                    {
+                        fcvFilterCorrSep15x15s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
+                            tmpImage, (int16_t*)dst, dst_step);
+                        break;
+                    }
+                    case 17:
+                    {
+                        fcvFilterCorrSep17x17s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
+                            tmpImage, (int16_t*)dst, dst_step);
+                        break;
+                    }
+
+                    default:
+                    {
+                        fcvFilterCorrSepNxNs16((int16_t*)kernelX, kernelXSize, (int16_t*)src, width, rangeHeight, src_step,
+                            tmpImage, (int16_t*)dst, dst_step);
+                        break;
+                    }
+                }
+                break;
+            }
+            default:
+            {
+                CV_Error(cv::Error::StsBadArg, cv::format("Dst type:%s is not supported", depthToString(ddepth)));
+                break;
+            }
+        }
+
+        // Copy only the rows owned by this stripe, skipping the halo rows at the top of the scratch buffer.
+        uchar* dptr = dst_data+range.start*dst_step;
+        const uchar* sptr = dst+topLines*dst_step;
+        memcpy(dptr, sptr, (range.end - range.start) * dst_step);
+    }
+
+    private:
+    const uchar*    src_data;
+    const size_t    src_step;
+    uchar*          dst_data;
+    const size_t    dst_step;
+    const int       ddepth;
+    const int       width;
+    const int       height;
+    uchar*          kernelX;
+    const int       kernelXSize;
+    uchar*          kernelY;
+    const int       kernelYSize;
+    int             half_ksize;
+
+    FcvSepFilter2DLoop_Invoker(const FcvSepFilter2DLoop_Invoker &);  // = delete;
+    const FcvSepFilter2DLoop_Invoker& operator= (const FcvSepFilter2DLoop_Invoker &);  // = delete;
+};
+
+// Filters an image with a separable kernel through FastCV, splitting the rows into parallel stripes.
+void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY)
+{
+    INITIALIZATION_CHECK;
+    CV_Assert(!_src.empty() && (_src.type() == CV_8UC1 || _src.type() == CV_16SC1));
+    _dst.create(_src.size(), ddepth);
+    Mat src = _src.getMat();
+    Mat dst = _dst.getMat();
+    Mat kernelX = _kernelX.getMat();
+    Mat kernelY = _kernelY.getMat();
+    switch (ddepth)
+    {
+        case CV_8U:
+        {
+            // The 8-bit FastCV routine reads the source as bytes, so a CV_16S source would be misinterpreted.
+            CV_Assert(CV_MAT_DEPTH(src.type()) == CV_8U);
+            break;
+        }
+        case CV_16S:
+        {
+            CV_Assert(CV_MAT_DEPTH(src.type()) == CV_16S);
+            CV_Assert(kernelX.size() == kernelY.size());
+            // kernelX and kernelY must hold identical coefficients.
+            Mat diff;
+            absdiff(kernelX, kernelY, diff);
+            CV_Assert(countNonZero(diff) == 0);
+            break;
+        }
+        default:
+        {
+            CV_Error(cv::Error::StsBadArg, cv::format("Dst type:%s is not supported", depthToString(ddepth)));
+            break;
+        }
+    }
+
+    // Roughly one stripe per 80 rows, and never fewer than one.
+    const int nStripes = src.rows < 80 ? 1 : src.rows / 80;
+    FcvSepFilter2DLoop_Invoker body(src.data, src.step, dst.data, dst.step, ddepth, src.cols, src.rows, kernelX.data,
+        kernelX.size().width, kernelY.data, kernelY.size().width);
+    cv::parallel_for_(cv::Range(0, src.rows), body, nStripes);
+}
+
+} // fastcv::
+} // cv::
\ No newline at end of file
diff --git a/modules/fastcv/src/edges.cpp b/modules/fastcv/src/edges.cpp
new file mode 100644
index 00000000000..76bbc14e889
--- /dev/null
+++ b/modules/fastcv/src/edges.cpp
@@ -0,0 +1,121 @@
+#include "precomp.hpp"
+
+namespace cv {
+namespace fastcv {
+
+// 3x3 Sobel on a CV_8UC1 image. With _dsty requested, _dst and _dsty receive the two gradient planes in depth ddepth.
+void sobel3x3u8(cv::InputArray _src, cv::OutputArray _dst, cv::OutputArray _dsty, int ddepth, bool normalization)
+{
+    INITIALIZATION_CHECK;
+    CV_Assert(!_src.empty() && _src.type() == CV_8UC1);
+    Size size = _src.size();
+    _dst.create(size, ddepth);
+    Mat src = _src.getMat();
+    Mat dst = _dst.getMat();
+    if (_dsty.needed())
+    {
+        _dsty.create(size, ddepth);
+        Mat dsty = _dsty.getMat();
+        // FastCV receives a single stride for both output planes, so pre-allocated outputs must share one step.
+        CV_Assert(dst.step == dsty.step);
+        switch(ddepth)
+        {
+            case CV_8S:
+                if (normalization)
+                    fcvImageGradientSobelPlanars8_v2(src.data, src.cols, src.rows, src.step, (int8_t*)dst.data,
+                        (int8_t*)dsty.data, dst.step);
+                else
+                    CV_Error(cv::Error::StsBadArg,
+                        cv::format("Depth: %d should do normalization, make sure the normalization parameter is true", ddepth));
+                break;
+            case CV_16S:
+                if (normalization)
+                    fcvImageGradientSobelPlanars16_v2(src.data, src.cols, src.rows, src.step, (int16_t*)dst.data,
+                        (int16_t*)dsty.data, dst.step);
+                else
+                    fcvImageGradientSobelPlanars16_v3(src.data, src.cols, src.rows, src.step, (int16_t*)dst.data,
+                        (int16_t*)dsty.data, dst.step);
+                break;
+            case CV_32F:
+                if (normalization)
+                    fcvImageGradientSobelPlanarf32_v2(src.data, src.cols, src.rows, src.step, (float32_t*)dst.data,
+                        (float32_t*)dsty.data, dst.step);
+                else
+                    fcvImageGradientSobelPlanarf32_v3(src.data, src.cols, src.rows, src.step, (float32_t*)dst.data,
+                        (float32_t*)dsty.data, dst.step);
+                break;
+            default:
+                CV_Error(cv::Error::StsBadArg, cv::format("depth: %d is not supported", ddepth));
+        }
+    }
+    else
+    {
+        // NOTE(review): dst.data is passed uncast as the output while dst has depth ddepth - confirm CV_8U is expected.
+        fcvFilterSobel3x3u8_v2(src.data, src.cols, src.rows, src.step, dst.data, dst.step);
+    }
+}
+
+// Sobel derivatives of a CV_8UC1 image computed by FastCV; _dx and _dy are created as CV_16SC1 with the source size.
+// Accepts kernel sizes 3, 5 and 7 with BORDER_CONSTANT or BORDER_REPLICATE; other arguments raise StsBadArg.
+void sobel(cv::InputArray _src, cv::OutputArray _dx, cv::OutputArray _dy, int kernel_size, int borderType, int borderValue)
+{
+    INITIALIZATION_CHECK;
+
+    CV_Assert(!_src.empty() && _src.type() == CV_8UC1);
+    const Size imageSize = _src.size();
+    _dx.create(imageSize, CV_16SC1);
+    _dy.create(imageSize, CV_16SC1);
+
+    Mat src = _src.getMat();
+    Mat gradX = _dx.getMat();
+    Mat gradY = _dy.getMat();
+
+    // Map the OpenCV border mode onto the FastCV one. cv::Sobel has no border-value argument (OpenCV uses 0 for a
+    // constant border), whereas here borderValue is handed to FastCV as given.
+    fcvBorderType fcvBorder;
+    if (borderType == cv::BorderTypes::BORDER_CONSTANT)
+    {
+        fcvBorder = fcvBorderType::FASTCV_BORDER_CONSTANT;
+    }
+    else if (borderType == cv::BorderTypes::BORDER_REPLICATE)
+    {
+        fcvBorder = fcvBorderType::FASTCV_BORDER_REPLICATE;
+    }
+    else
+    {
+        CV_Error(cv::Error::StsBadArg, cv::format("Border type: %d is not supported", borderType));
+    }
+
+    fcvStatus status = FASTCV_SUCCESS;
+    int16_t* const dxData = (int16_t*)gradX.data;
+    int16_t* const dyData = (int16_t*)gradY.data;
+    switch (kernel_size)
+    {
+        case 3:
+            status = fcvFilterSobel3x3u8s16(src.data, src.cols, src.rows, src.step, dxData, dyData, gradX.step,
+                fcvBorder, borderValue);
+            break;
+        case 5:
+            status = fcvFilterSobel5x5u8s16(src.data, src.cols, src.rows, src.step, dxData, dyData, gradX.step,
+                fcvBorder, borderValue);
+            break;
+        case 7:
+            status = fcvFilterSobel7x7u8s16(src.data, src.cols, src.rows, src.step, dxData, dyData, gradX.step,
+                fcvBorder, borderValue);
+            break;
+        default:
+            CV_Error(cv::Error::StsBadArg, cv::format("Kernel size %d is not supported", kernel_size));
+            break;
+    }
+
+    // Report a FastCV failure together with its textual description when one is registered.
+    if (status != FASTCV_SUCCESS)
+    {
+        const bool known = fcvStatusStrings.count(status) != 0;
+        const std::string s = known ? fcvStatusStrings.at(status) : "unknown";
+        CV_Error( cv::Error::StsInternal, "FastCV error: " + s);
+    }
+}
+
+} // fastcv::
+} // cv::
\ No newline at end of file
diff --git a/modules/fastcv/src/warp.cpp b/modules/fastcv/src/warp.cpp
new file mode 100644
index 00000000000..09cfc09e1aa
--- /dev/null
+++ b/modules/fastcv/src/warp.cpp
@@ -0,0 +1,71 @@
+#include "precomp.hpp"
+
+namespace cv {
+namespace fastcv {
+
+class FcvWarpPerspectiveLoop_Invoker : public cv::ParallelLoopBody
+{
+    public:
+    // Keeps both source planes, allocates both outputs with size _dsize and stores the 3x3 matrix as CV_32F.
+    FcvWarpPerspectiveLoop_Invoker(InputArray _src1, InputArray _src2, OutputArray _dst1, OutputArray _dst2, InputArray _M0,
+        Size _dsize) : cv::ParallelLoopBody()
+    {
+        src1 = _src1.getMat();
+        src2 = _src2.getMat();
+        dsize = _dsize;
+        _dst1.create(dsize, src1.type());
+        _dst2.create(dsize, src2.type());
+        dst1 = _dst1.getMat();
+        dst2 = _dst2.getMat();
+        // operator() reads M via at<float>(), so convert here: a CV_64F matrix would otherwise be misread.
+        _M0.getMat().convertTo(M, CV_32F);
+        CV_Assert(M.rows == 3 && M.cols == 3 && M.channels() == 1);
+    }
+
+    // Warps destination rows [range.start, range.end) of both planes.
+    virtual void operator()(const cv::Range& range) const CV_OVERRIDE
+    {
+        uchar* dst1_ptr = dst1.data + range.start*dst1.step;
+        uchar* dst2_ptr = dst2.data + range.start*dst2.step;
+        int rangeHeight = range.end - range.start;
+        // The stripe is passed as an image starting at row 0, so shift y by range.start: column 2 += start * column 1.
+        float rangeMatrix[9];
+        rangeMatrix[0] = M.at<float>(0,0);
+        rangeMatrix[1] = M.at<float>(0,1);
+        rangeMatrix[2] = M.at<float>(0,2)+range.start*M.at<float>(0,1);
+        rangeMatrix[3] = M.at<float>(1,0);
+        rangeMatrix[4] = M.at<float>(1,1);
+        rangeMatrix[5] = M.at<float>(1,2)+range.start*M.at<float>(1,1);
+        rangeMatrix[6] = M.at<float>(2,0);
+        rangeMatrix[7] = M.at<float>(2,1);
+        rangeMatrix[8] = M.at<float>(2,2)+range.start*M.at<float>(2,1);
+        fcv2PlaneWarpPerspectiveu8(src1.data, src2.data, src1.cols, src1.rows, src1.step, src2.step, dst1_ptr, dst2_ptr,
+            dsize.width, rangeHeight, dst1.step, dst2.step, rangeMatrix);
+    }
+
+    private:
+    Mat         src1;
+    Mat         src2;
+    Mat         dst1;
+    Mat         dst2;
+    Mat         M;
+    Size        dsize;
+
+    FcvWarpPerspectiveLoop_Invoker(const FcvWarpPerspectiveLoop_Invoker &);  // = delete;
+    const FcvWarpPerspectiveLoop_Invoker& operator= (const FcvWarpPerspectiveLoop_Invoker &);  // = delete;
+};
+
+// Warps two CV_8UC1 planes with the same matrix _M0 into _dst1/_dst2 of size dsize; parallel_for_ is asked for one stripe.
+void warpPerspective2Plane(InputArray _src1, InputArray _src2, OutputArray _dst1, OutputArray _dst2, InputArray _M0,
+        Size dsize)
+{
+    INITIALIZATION_CHECK;
+    CV_Assert(!_src1.empty() && _src1.type() == CV_8UC1);
+    CV_Assert(!_src2.empty() && _src2.type() == CV_8UC1);
+    CV_Assert(!_M0.empty());
+    FcvWarpPerspectiveLoop_Invoker warpBody(_src1, _src2, _dst1, _dst2, _M0, dsize);
+    cv::parallel_for_(cv::Range(0, dsize.height), warpBody, 1);
+}
+
+} // fastcv::
+} // cv::
\ No newline at end of file
diff --git a/modules/fastcv/test/test_blur.cpp b/modules/fastcv/test/test_blur.cpp
new file mode 100644
index 00000000000..1dde0261f28
--- /dev/null
+++ b/modules/fastcv/test/test_blur.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+typedef testing::TestWithParam<tuple<Size, int, int, bool>> GaussianBlurTest;
+
+// Compares cv::fastcv::gaussianBlur with cv::GaussianBlur on random data, tolerating a bounded number of mismatches.
+TEST_P(GaussianBlurTest, accuracy)
+{
+    const auto params = GetParam();
+    const cv::Size imageSize = get<0>(params);
+    const int depth = get<1>(params);
+    const int ksize = get<2>(params);
+    const bool blurBorder = get<3>(params);
+
+    // Combinations other than a 5x5 kernel or a CV_8U image are not supported by FastCV and are skipped.
+    const bool supported = (ksize == 5) || (depth == CV_8U);
+    if (!supported)
+        return;
+
+    cv::Mat src(imageSize, depth), dst, ref;
+    cvtest::randUni(cv::theRNG(), src, Scalar::all(0), Scalar::all(255));
+    cv::fastcv::gaussianBlur(src, dst, ksize, blurBorder);
+
+    // Reference: a 32-bit integer input is filtered as float, and the result is converted back to the tested depth.
+    if (depth == CV_32S)
+        src.convertTo(src, CV_32F);
+    // The bool flag becomes the OpenCV border type: true -> BORDER_REPLICATE (1), false -> BORDER_CONSTANT (0).
+    cv::GaussianBlur(src, ref, Size(ksize, ksize), 0, 0, blurBorder ? cv::BORDER_REPLICATE : cv::BORDER_CONSTANT);
+    ref.convertTo(ref, depth);
+
+    cv::Mat difference;
+    cv::absdiff(dst, ref, difference);
+    const int mismatches = cv::countNonZero(difference);
+    EXPECT_LT(mismatches, (src.rows + src.cols) * ksize);
+}
+
+typedef testing::TestWithParam<tuple<Size, int, int>> Filter2DTest;
+
+// Compares cv::fastcv::filter2D with cv::filter2D for a random square kernel; both results are compared as CV_8U.
+TEST_P(Filter2DTest, accuracy)
+{
+    const auto params = GetParam();
+    const Size imageSize = get<0>(params);
+    const int ddepth = get<1>(params);
+    const int ksize = get<2>(params);
+
+    // Kernel element type per destination depth: signed 8-bit for CV_8U / CV_16S output, float for CV_32F output.
+    int kernelType;
+    if (ddepth == CV_8U || ddepth == CV_16S)
+    {
+        kernelType = CV_8S;
+    }
+    else if (ddepth == CV_32F)
+    {
+        kernelType = CV_32F;
+    }
+    else
+    {
+        // Any other destination depth is not exercised.
+        return;
+    }
+
+    cv::Mat src(imageSize, CV_8U);
+    cv::Mat kernel(ksize, ksize, kernelType);
+    cv::Mat dst, ref;
+
+    RNG& rng = cv::theRNG();
+    cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255));
+    cvtest::randUni(rng, kernel, Scalar::all(INT8_MIN), Scalar::all(INT8_MAX));
+
+    cv::fastcv::filter2D(src, dst, ddepth, kernel);
+    cv::filter2D(src, ref, ddepth, kernel);
+
+    // Bring both results to CV_8U before counting the differing pixels.
+    dst.convertTo(dst, CV_8U);
+    ref.convertTo(ref, CV_8U);
+    cv::Mat difference;
+    cv::absdiff(dst, ref, difference);
+    EXPECT_LT(cv::countNonZero(difference), (src.rows + src.cols) * ksize);
+}
+
+typedef testing::TestWithParam<tuple<Size, int>> SepFilter2DTest;
+
+// Compares cv::fastcv::sepFilter2D with cv::sepFilter2D, using one random 1 x ksize CV_8S kernel for both directions.
+TEST_P(SepFilter2DTest, accuracy)
+{
+    const auto params = GetParam();
+    const Size imageSize = get<0>(params);
+    const int ksize = get<1>(params);
+
+    cv::Mat src(imageSize, CV_8U), kernel(1, ksize, CV_8S), dst, ref;
+    RNG& rng = cv::theRNG();
+    cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255));
+    cvtest::randUni(rng, kernel, Scalar::all(INT8_MIN), Scalar::all(INT8_MAX));
+
+    cv::fastcv::sepFilter2D(src, dst, CV_8U, kernel, kernel);
+    cv::sepFilter2D(src, ref, CV_8U, kernel, kernel);
+
+    cv::Mat difference;
+    cv::absdiff(dst, ref, difference);
+    const int mismatches = cv::countNonZero(difference);
+    EXPECT_LT(mismatches, (src.rows + src.cols) * ksize);
+}
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, GaussianBlurTest, Combine(
+/*image size*/     ::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p),
+/*image depth*/    ::testing::Values(CV_8U,CV_16S,CV_32S),
+/*kernel size*/    ::testing::Values(3, 5),
+/*blur border*/    ::testing::Values(true,false)
+));
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, Filter2DTest, Combine(
+/*image size*/     Values(perf::szVGA, perf::sz720p, perf::sz1080p),
+/*dst depth*/      Values(CV_8U,CV_16S,CV_32F),
+/*kernel size*/    Values(3, 5, 7, 9, 11)
+));
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, SepFilter2DTest, Combine(
+/*image size*/      Values(perf::szVGA, perf::sz720p, perf::sz1080p),
+/*kernel size*/    Values(3, 5, 7, 9, 11)
+));
+
+}} // namespaces opencv_test, ::
\ No newline at end of file
diff --git a/modules/fastcv/test/test_edges.cpp b/modules/fastcv/test/test_edges.cpp
new file mode 100644
index 00000000000..e1e1576ef15
--- /dev/null
+++ b/modules/fastcv/test/test_edges.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+typedef testing::TestWithParam<tuple<Size, int, int, int>> Sobel;
+typedef testing::TestWithParam<tuple<Size, int>> Sobel3x3u8;
+
+// Compares cv::fastcv::sobel with cv::Sobel (CV_16S, unit scale); fewer than 10% of the pixels may differ per plane.
+TEST_P(Sobel,accuracy)
+{
+    const auto params = GetParam();
+    const Size imageSize = get<0>(params);
+    const int ksize = get<1>(params);
+    const int border = get<2>(params);
+    const int borderValue = get<3>(params);
+
+    cv::Mat src(imageSize, CV_8U), dx, dy, refx, refy;
+    cvtest::randUni(cv::theRNG(), src, Scalar::all(0), Scalar::all(255));
+
+    cv::fastcv::sobel(src, dx, dy, ksize, border, borderValue);
+    cv::Sobel(src, refx, CV_16S, 1, 0, ksize, 1.0, 0.0, border);
+    cv::Sobel(src, refy, CV_16S, 0, 1, ksize, 1.0, 0.0, border);
+
+    cv::Mat difference_x, difference_y;
+    cv::absdiff(dx, refx, difference_x);
+    cv::absdiff(dy, refy, difference_y);
+
+    const double maxMismatches = src.size().area()*0.1;
+    EXPECT_LT(cv::countNonZero(difference_x), maxMismatches);
+    EXPECT_LT(cv::countNonZero(difference_y), maxMismatches);
+}
+
+// Compares the unnormalized cv::fastcv::sobel3x3u8 gradients with cv::Sobel for the tested output depth.
+TEST_P(Sobel3x3u8,accuracy)
+{
+    const auto params = GetParam();
+    const Size imageSize = get<0>(params);
+    const int ddepth = get<1>(params);
+
+    cv::Mat src(imageSize, CV_8U), dx, dy, refx, refy;
+    cvtest::randUni(cv::theRNG(), src, Scalar::all(0), Scalar::all(255));
+
+    cv::fastcv::sobel3x3u8(src, dx, dy, ddepth, false);
+    cv::Sobel(src, refx, ddepth, 1, 0);
+    cv::Sobel(src, refy, ddepth, 0, 1);
+
+    cv::Mat difference_x, difference_y;
+    cv::absdiff(dx, refx, difference_x);
+    cv::absdiff(dy, refy, difference_y);
+
+    const double maxMismatches = src.size().area()*0.1;
+    EXPECT_LT(cv::countNonZero(difference_x), maxMismatches);
+    EXPECT_LT(cv::countNonZero(difference_y), maxMismatches);
+}
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, Sobel, Combine(
+/*image size*/      Values(perf::szVGA, perf::sz720p, perf::sz1080p),
+/*kernel size*/     Values(3,5,7),
+/*border*/          Values(BORDER_CONSTANT, BORDER_REPLICATE),
+/*border value*/    Values(0)
+));
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, Sobel3x3u8, Combine(
+/*image size*/      Values(perf::szVGA, perf::sz720p, perf::sz1080p),
+/*dst depth*/       Values(CV_16S, CV_32F)
+));
+
+}
+}
diff --git a/modules/fastcv/test/test_warp.cpp b/modules/fastcv/test/test_warp.cpp
new file mode 100644
index 00000000000..38ce2752f60
--- /dev/null
+++ b/modules/fastcv/test/test_warp.cpp
@@ -0,0 +1,68 @@
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+typedef testing::TestWithParam<cv::Size> WarpPerspective2Plane;
+
+// Warps the same grayscale image through both planes of warpPerspective2Plane and compares with cv::warpPerspective.
+TEST_P(WarpPerspective2Plane, accuracy)
+{
+    const cv::Size dstSize = GetParam();
+    cv::Mat img = imread(cvtest::findDataFile("cv/shared/baboon.png"));
+    Mat src(img.rows, img.cols, CV_8UC1);
+    cvtColor(img,src,cv::COLOR_BGR2GRAY);
+    cv::Mat dst1(dstSize, CV_8UC1), dst2(dstSize, CV_8UC1), ref1, ref2;
+
+    // Source and destination quadrilaterals start at the corners of the source and destination images ...
+    Point2f s[4] =
+    {
+        Point2f(0,0),
+        Point2f(src.cols-1.f,0),
+        Point2f(src.cols-1.f,src.rows-1.f),
+        Point2f(0,src.rows-1.f)
+    };
+    Point2f d[4] =
+    {
+        Point2f(0,0),
+        Point2f(dst1.cols-1.f,0),
+        Point2f(dst1.cols-1.f,dst1.rows-1.f),
+        Point2f(0,dst1.rows-1.f)
+    };
+
+    // ... and are then perturbed with normally distributed offsets (mean 0, sigma 0.1) drawn from a fixed seed.
+    RNG rng((uint64)-1);
+    float buffer[16];
+    Mat tmp( 1, 16, CV_32FC1, buffer );
+    rng.fill( tmp, RNG::NORMAL, Scalar::all(0.), Scalar::all(0.1) );
+    for(int i = 0; i < 4; i++ )
+    {
+        const float* offsets = buffer + i*4;
+        s[i].x += offsets[0]*src.cols/2;
+        s[i].y += offsets[1]*src.rows/2;
+        d[i].x += offsets[2]*dst1.cols/2;
+        d[i].y += offsets[3]*dst1.rows/2;
+    }
+
+    // The transform is converted to CV_32F and inverted; the reference applies it with WARP_INVERSE_MAP.
+    cv::Mat mat;
+    cv::getPerspectiveTransform( s, d ).convertTo( mat, CV_32F );
+    invert(mat,mat);
+
+    cv::fastcv::warpPerspective2Plane(src, src, dst1, dst2, mat, dstSize);
+    cv::warpPerspective(src,ref1,mat,dstSize,(cv::INTER_LINEAR | cv::WARP_INVERSE_MAP));
+    cv::warpPerspective(src,ref2,mat,dstSize,(cv::INTER_LINEAR | cv::WARP_INVERSE_MAP));
+
+    // Count the pixels whose absolute difference exceeds 5 in each plane.
+    cv::Mat difference1, difference2, mask1, mask2;
+    cv::absdiff(dst1, ref1, difference1);
+    cv::absdiff(dst2, ref2, difference2);
+    cv::threshold(difference1, mask1, 5, 255, cv::THRESH_BINARY);
+    cv::threshold(difference2, mask2, 5, 255, cv::THRESH_BINARY);
+    EXPECT_LT(cv::countNonZero(mask1), src.size().area()*0.02);
+    EXPECT_LT(cv::countNonZero(mask2), src.size().area()*0.02);
+}
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, WarpPerspective2Plane, Values(perf::szVGA, perf::sz720p, perf::sz1080p));
+
+}
+}
\ No newline at end of file

From 1c35506b2ad68a93313047616489516435063b7a Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Sun, 8 Dec 2024 18:05:25 +0530
Subject: [PATCH 02/11] Add extension APIs from Rostislav Vasilikhin

---
 modules/fastcv/include/opencv2/fastcv.hpp     |   3 +
 .../fastcv/include/opencv2/fastcv/cluster.hpp |   2 +-
 .../fastcv/include/opencv2/fastcv/hough.hpp   |  23 +-
 .../include/opencv2/fastcv/ipptransform.hpp   |  38 +++
 .../fastcv/include/opencv2/fastcv/moments.hpp |   5 +-
 .../fastcv/include/opencv2/fastcv/mser.hpp    | 181 ++++++------
 .../fastcv/include/opencv2/fastcv/pyramid.hpp |  50 ++++
 .../include/opencv2/fastcv/tracking.hpp       |  65 +++++
 modules/fastcv/perf/perf_bilateral.cpp        |  21 +-
 modules/fastcv/perf/perf_fft_dct.cpp          | 113 ++++++++
 modules/fastcv/perf/perf_hough.cpp            |  47 ++++
 modules/fastcv/perf/perf_mser.cpp             |  15 +-
 modules/fastcv/perf/perf_pyramid.cpp          |  76 +++++
 modules/fastcv/perf/perf_tracking.cpp         |  98 +++++++
 modules/fastcv/src/bilateralFilter.cpp        |  78 +++--
 modules/fastcv/src/hough.cpp                  |  33 +++
 modules/fastcv/src/ipptransform.cpp           |  48 ++++
 modules/fastcv/src/moments.cpp                |  46 ++-
 modules/fastcv/src/mser.cpp                   | 145 +++++++---
 modules/fastcv/src/precomp.hpp                |   3 +
 modules/fastcv/src/pyramid.cpp                | 180 ++++++++++++
 modules/fastcv/src/remap.cpp                  |   2 +-
 modules/fastcv/src/tracking.cpp               | 266 ++++++++++++++++++
 modules/fastcv/test/test_bilateral.cpp        |  12 +-
 modules/fastcv/test/test_fft.cpp              |   1 -
 modules/fastcv/test/test_hough.cpp            |  93 ++++++
 modules/fastcv/test/test_ipptransform.cpp     |  80 ++++++
 modules/fastcv/test/test_moments.cpp          |  20 +-
 modules/fastcv/test/test_mser.cpp             |  14 +-
 modules/fastcv/test/test_precomp.hpp          |   1 +
 modules/fastcv/test/test_pyramid.cpp          | 171 +++++++++++
 modules/fastcv/test/test_remap.cpp            |  15 +-
 modules/fastcv/test/test_scale.cpp            |  23 +-
 modules/fastcv/test/test_tracking.cpp         | 142 ++++++++++
 34 files changed, 1833 insertions(+), 277 deletions(-)
 create mode 100644 modules/fastcv/include/opencv2/fastcv/ipptransform.hpp
 create mode 100644 modules/fastcv/include/opencv2/fastcv/pyramid.hpp
 create mode 100644 modules/fastcv/include/opencv2/fastcv/tracking.hpp
 create mode 100644 modules/fastcv/perf/perf_fft_dct.cpp
 create mode 100644 modules/fastcv/perf/perf_pyramid.cpp
 create mode 100644 modules/fastcv/perf/perf_tracking.cpp
 create mode 100644 modules/fastcv/src/ipptransform.cpp
 create mode 100644 modules/fastcv/src/pyramid.cpp
 create mode 100644 modules/fastcv/src/tracking.cpp
 create mode 100644 modules/fastcv/test/test_ipptransform.cpp
 create mode 100644 modules/fastcv/test/test_pyramid.cpp
 create mode 100644 modules/fastcv/test/test_tracking.cpp

diff --git a/modules/fastcv/include/opencv2/fastcv.hpp b/modules/fastcv/include/opencv2/fastcv.hpp
index a2129ffbb86..6ed8eba4a33 100644
--- a/modules/fastcv/include/opencv2/fastcv.hpp
+++ b/modules/fastcv/include/opencv2/fastcv.hpp
@@ -17,13 +17,16 @@
 #include "opencv2/fastcv/fast10.hpp"
 #include "opencv2/fastcv/fft.hpp"
 #include "opencv2/fastcv/hough.hpp"
+#include "opencv2/fastcv/ipptransform.hpp"
 #include "opencv2/fastcv/moments.hpp"
 #include "opencv2/fastcv/mser.hpp"
+#include "opencv2/fastcv/pyramid.hpp"
 #include "opencv2/fastcv/remap.hpp"
 #include "opencv2/fastcv/scale.hpp"
 #include "opencv2/fastcv/shift.hpp"
 #include "opencv2/fastcv/smooth.hpp"
 #include "opencv2/fastcv/thresh.hpp"
+#include "opencv2/fastcv/tracking.hpp"
 #include "opencv2/fastcv/warp.hpp"
 
 /**
diff --git a/modules/fastcv/include/opencv2/fastcv/cluster.hpp b/modules/fastcv/include/opencv2/fastcv/cluster.hpp
index f90deeae465..65f4540862e 100644
--- a/modules/fastcv/include/opencv2/fastcv/cluster.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/cluster.hpp
@@ -16,7 +16,7 @@ namespace fastcv {
 
 /**
  * @brief Clusterizes N input points in D-dimensional space into K clusters
- * 
+ *
  * @param points            Points array of type 8u, each row represets a point.
  *                          Size is N rows by D columns, can be non-continuous.
  * @param clusterCenters    Initial cluster centers array of type 32f, each row represents a center.
diff --git a/modules/fastcv/include/opencv2/fastcv/hough.hpp b/modules/fastcv/include/opencv2/fastcv/hough.hpp
index 74f78a10841..e43323903cb 100644
--- a/modules/fastcv/include/opencv2/fastcv/hough.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/hough.hpp
@@ -16,7 +16,7 @@ namespace fastcv {
 
 /**
  * @brief Performs Hough Line detection
- * 
+ *
  * @param src Input 8-bit image containing binary contour. Width and step should be divisible by 8
  * @param lines Output array containing detected lines in a form of (x1, y1, x2, y2) where all numbers are 32-bit floats
  * @param threshold Controls the minimal length of a detected line. Value must be between 0.0 and 1.0
@@ -25,6 +25,27 @@ namespace fastcv {
  */
 CV_EXPORTS_W void houghLines(InputArray src, OutputArray lines, double threshold = 0.25);
 
+
+/**
+ * @brief Finds circles in a grayscale image using Hough transform.
+ *        The radius of circle varies from 0 to max(srcWidth, srcHeight).
+ *
+ * @param src Input 8-bit image containing binary contour. Step should be divisible by 8, data start should be 128-bit aligned
+ * @param circles Output array containing detected circles in a form (x, y, r) where all numbers are 32-bit integers
+ * @param minDist Minimum distance between the centers of the detected circles
+ * @param cannyThreshold The higher threshold of the two passed to the Canny() edge detector
+ *                       (the lower one is half of it). Default is 100.
+ * @param accThreshold The accumulator threshold for the circle centers at the detection
+ *                     stage. The smaller it is, the more false circles may be detected.
+ *                     Circles, corresponding to the larger accumulator values, will be
+ *                     returned first. Default is 100.
+ * @param minRadius Minimum circle radius, default is 0
+ * @param maxRadius Maximum circle radius, default is 0
+ */
+CV_EXPORTS_W void houghCircles(InputArray src, OutputArray circles, uint32_t minDist,
+                               uint32_t cannyThreshold = 100, uint32_t accThreshold = 100,
+                               uint32_t minRadius = 0, uint32_t maxRadius = 0);
+
 //! @}
 
 } // fastcv::
diff --git a/modules/fastcv/include/opencv2/fastcv/ipptransform.hpp b/modules/fastcv/include/opencv2/fastcv/ipptransform.hpp
new file mode 100644
index 00000000000..cba87d69af7
--- /dev/null
+++ b/modules/fastcv/include/opencv2/fastcv/ipptransform.hpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#ifndef OPENCV_FASTCV_IPPTRANSFORM_HPP
+#define OPENCV_FASTCV_IPPTRANSFORM_HPP
+
+#include <opencv2/core.hpp>
+
+namespace cv {
+namespace fastcv {
+
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Performs an 8x8 forward discrete cosine transform (DCT) on the input image
+ *
+ * @param src Input image of type CV_8UC1
+ * @param dst Output image of type CV_16SC1
+ */
+CV_EXPORTS_W void DCT(InputArray src, OutputArray dst);
+
+/**
+ * @brief Performs an 8x8 inverse discrete cosine transform (IDCT) on the input image
+ *
+ * @param src Input image of type CV_16SC1
+ * @param dst Output image of type CV_8UC1
+ */
+CV_EXPORTS_W void IDCT(InputArray src, OutputArray dst);
+
+//! @}
+
+} // fastcv::
+} // cv::
+
+#endif // OPENCV_FASTCV_IPPTRANSFORM_HPP
diff --git a/modules/fastcv/include/opencv2/fastcv/moments.hpp b/modules/fastcv/include/opencv2/fastcv/moments.hpp
index 3cffa62f767..90034548571 100644
--- a/modules/fastcv/include/opencv2/fastcv/moments.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/moments.hpp
@@ -17,8 +17,9 @@ namespace fastcv {
 /**
  * @brief Calculates all of the moments up to the third order of the image pixels' intensities
           The results are returned in the structure cv::Moments.
- * @param _src Input image with type CV_8UC1, CV_32SC1, CV_32FC1
- * @param binary If 1, binary image (0x00-black, oxff-white); if 0, grayscale image
+ * @param _src      Input image with type CV_8UC1, CV_32SC1, CV_32FC1
+ * @param binary    If true, assumes the image to be binary (0x00 for black, 0xff for white), otherwise assumes the image to be
+ *                  grayscale.
  */
 CV_EXPORTS cv::Moments moments(InputArray _src, bool binary);
 
diff --git a/modules/fastcv/include/opencv2/fastcv/mser.hpp b/modules/fastcv/include/opencv2/fastcv/mser.hpp
index 78282b66fdd..08b751fe81d 100644
--- a/modules/fastcv/include/opencv2/fastcv/mser.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/mser.hpp
@@ -15,107 +15,98 @@ namespace fastcv {
 //! @{
 
 /**
- * @brief Structure containing additional information about found contour
+ * @brief MSER blob detector for grayscale images
  *
  */
-struct ContourData
+class CV_EXPORTS_W MSER
 {
-    uint32_t variation;   //!< Variation of a contour from previous grey level
-    int32_t  polarity;    //!< Polarity for a contour. This value is 1 if this is a MSER+ region, -1 if this is a MSER- region.
-    uint32_t nodeId;      //!< Node ID for a contour
-    uint32_t nodeCounter; //!< Node counter for a contour
-};
+public:
 
-/**
- * @brief This is an overload for MSER() function
- *
- * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5.
-              Pixels at the image boundary are not processed. If boundary pixels are important
-              for a particular application, please consider padding the input image with dummy
-              pixels of one pixel wide.
- * @param contours Array containing found contours
- * @param numNeighbors Number of neighbors in contours, can be 4 or 8
- * @param delta Delta to be used in MSER algorithm (the difference in grayscale values
-                within which the region is stable ).
-                Typical value range [0.8 8], typical value 2
- * @param minArea Minimum area (number of pixels) of a mser contour.
-                Typical value range [10 50], typical value 30
- * @param maxArea Maximum area (number of pixels) of a  mser contour.
-                Typical value 14400 or 0.25*width*height
- * @param maxVariation Maximum variation in grayscale between 2 levels allowed.
-                Typical value range [0.1 1.0], typical value 0.15
- * @param minDiversity Minimum diversity in grayscale between 2 levels allowed.
-                Typical value range [0.1 1.0], typical value 0.2
- */
-CV_EXPORTS void MSER(InputArray src, std::vector<std::vector<Point>>& contours,
-                       unsigned int numNeighbors = 4,
-                       unsigned int delta = 2,
-                       unsigned int minArea = 30,
-                       unsigned int maxArea = 14400,
-                       float        maxVariation = 0.15f,
-                       float        minDiversity = 0.2f);
+    /**
+     * @brief Structure containing additional information about found contour
+     *
+     */
+    struct ContourData
+    {
+        uint32_t variation;   //!< Variation of a contour from previous grey level
+        int32_t  polarity;    //!< Polarity for a contour. This value is 1 if this is a MSER+ region, -1 if this is a MSER- region.
+        uint32_t nodeId;      //!< Node ID for a contour
+        uint32_t nodeCounter; //!< Node counter for a contour
+    };
 
-/**
- * @brief This is an overload for MSER() function
- *
- * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5.
-              Pixels at the image boundary are not processed. If boundary pixels are important
-              for a particular application, please consider padding the input image with dummy
-              pixels of one pixel wide.
- * @param contours Array containing found contours
- * @param boundingBoxes Array containing bounding boxes of found contours
- * @param numNeighbors Number of neighbors in contours, can be 4 or 8
- * @param delta Delta to be used in MSER algorithm (the difference in grayscale values
-                within which the region is stable ).
-                Typical value range [0.8 8], typical value 2
- * @param minArea Minimum area (number of pixels) of a mser contour.
-                Typical value range [10 50], typical value 30
- * @param maxArea Maximum area (number of pixels) of a  mser contour.
-                Typical value 14400 or 0.25*width*height
- * @param maxVariation Maximum variation in grayscale between 2 levels allowed.
-                Typical value range [0.1 1.0], typical value 0.15
- * @param minDiversity Minimum diversity in grayscale between 2 levels allowed.
-                Typical value range [0.1 1.0], typical value 0.2
- */
-CV_EXPORTS void MSER(InputArray src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes,
-                       unsigned int numNeighbors = 4,
-                       unsigned int delta = 2,
-                       unsigned int minArea = 30,
-                       unsigned int maxArea = 14400,
-                       float        maxVariation = 0.15f,
-                       float        minDiversity = 0.2f);
+    /**
+     * @brief Creates MSER detector
+     *
+     * @param imgSize Image size. Image width has to be greater than 50, and image height has to be greater than 5.
+     * @param numNeighbors Number of neighbors in contours, can be 4 or 8
+     * @param delta Delta to be used in MSER algorithm (the difference in grayscale values
+                    within which the region is stable ).
+                    Typical value range [0.8 8], typical value 2
+     * @param minArea Minimum area (number of pixels) of a mser contour.
+                      Typical value range [10 50], typical value 30
+     * @param maxArea Maximum area (number of pixels) of a  mser contour.
+                      Typical value 14400 or 0.25*width*height
+     * @param maxVariation Maximum variation in grayscale between 2 levels allowed.
+                           Typical value range [0.1 1.0], typical value 0.15
+     * @param minDiversity Minimum diversity in grayscale between 2 levels allowed.
+                           Typical value range [0.1 1.0], typical value 0.2
+     * @return Feature detector object ready for detection
+     */
+    CV_WRAP static Ptr<MSER> create(cv::Size     imgSize,
+                                    unsigned int numNeighbors = 4,
+                                    unsigned int delta = 2,
+                                    unsigned int minArea = 30,
+                                    unsigned int maxArea = 14400,
+                                    float        maxVariation = 0.15f,
+                                    float        minDiversity = 0.2f);
 
-/**
- * @brief Runs MSER blob detector on the grayscale image
- *
- * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5.
-              Pixels at the image boundary are not processed. If boundary pixels are important
-              for a particular application, please consider padding the input image with dummy
-              pixels of one pixel wide.
- * @param contours Array containing found contours
- * @param boundingBoxes Array containing bounding boxes of found contours
- * @param contourData Array containing additional information about found contours
- * @param numNeighbors Number of neighbors in contours, can be 4 or 8
- * @param delta Delta to be used in MSER algorithm (the difference in grayscale values
-                within which the region is stable ).
-                Typical value range [0.8 8], typical value 2
- * @param minArea Minimum area (number of pixels) of a mser contour.
-                Typical value range [10 50], typical value 30
- * @param maxArea Maximum area (number of pixels) of a  mser contour.
-                Typical value 14400 or 0.25*width*height
- * @param maxVariation Maximum variation in grayscale between 2 levels allowed.
-                Typical value range [0.1 1.0], typical value 0.15
- * @param minDiversity Minimum diversity in grayscale between 2 levels allowed.
-                Typical value range [0.1 1.0], typical value 0.2
- */
-CV_EXPORTS void MSER(InputArray src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes,
-                       std::vector<ContourData>& contourData,
-                       unsigned int numNeighbors = 4,
-                       unsigned int delta = 2,
-                       unsigned int minArea = 30,
-                       unsigned int maxArea = 14400,
-                       float        maxVariation = 0.15f,
-                       float        minDiversity = 0.2f);
+    /**
+     * @brief This is an overload of detect() that outputs only the found contours
+     *
+     * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5.
+     *            Pixels at the image boundary are not processed. If boundary pixels are important
+     *            for a particular application, please consider padding the input image with
+     *            a one-pixel-wide border of dummy pixels.
+     * @param contours Array containing found contours
+     */
+    CV_WRAP virtual void detect(InputArray src, std::vector<std::vector<Point>>& contours) = 0;
+
+    /**
+     * @brief This is an overload of detect() that outputs the found contours and their bounding boxes
+     *
+     * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5.
+     *            Pixels at the image boundary are not processed. If boundary pixels are important
+     *            for a particular application, please consider padding the input image with
+     *            a one-pixel-wide border of dummy pixels.
+     * @param contours Array containing found contours
+     * @param boundingBoxes Array containing bounding boxes of found contours
+     */
+    CV_WRAP virtual void detect(InputArray src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes) = 0;
+
+    /**
+     * @brief Runs MSER blob detector on the grayscale image
+     *
+     * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5.
+     *            Pixels at the image boundary are not processed. If boundary pixels are important
+     *            for a particular application, please consider padding the input image with
+     *            a one-pixel-wide border of dummy pixels.
+     * @param contours Array containing found contours
+     * @param boundingBoxes Array containing bounding boxes of found contours
+     * @param contourData Array containing additional information about found contours
+     */
+    CV_WRAP virtual void detect(InputArray src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes,
+                                std::vector<ContourData>& contourData) = 0;
+
+    CV_WRAP virtual cv::Size     getImgSize()      = 0;
+    CV_WRAP virtual unsigned int getNumNeighbors() = 0;
+    CV_WRAP virtual unsigned int getDelta()        = 0;
+    CV_WRAP virtual unsigned int getMinArea()      = 0;
+    CV_WRAP virtual unsigned int getMaxArea()      = 0;
+    CV_WRAP virtual float        getMaxVariation() = 0;
+    CV_WRAP virtual float        getMinDiversity() = 0;
+
+    virtual ~MSER() {}
+};
 
 //! @}
 
diff --git a/modules/fastcv/include/opencv2/fastcv/pyramid.hpp b/modules/fastcv/include/opencv2/fastcv/pyramid.hpp
new file mode 100644
index 00000000000..2e7a89e98f6
--- /dev/null
+++ b/modules/fastcv/include/opencv2/fastcv/pyramid.hpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#ifndef OPENCV_FASTCV_PYRAMID_HPP
+#define OPENCV_FASTCV_PYRAMID_HPP
+
+#include <opencv2/core.hpp>
+
+namespace cv {
+namespace fastcv {
+
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Creates a gradient pyramid from an image pyramid
+ *
+ * @param pyr Input pyramid of 1-channel 8-bit images. Only continuous images are supported.
+ * @param dx Horizontal Sobel gradient pyramid of the same size as pyr
+ * @param dy Vertical Sobel gradient pyramid of the same size as pyr
+ * @param outType Type of output data, can be CV_8S, CV_16S or CV_32F
+ */
+CV_EXPORTS_W void sobelPyramid(InputArrayOfArrays pyr, OutputArrayOfArrays dx, OutputArrayOfArrays dy, int outType = CV_8S);
+
+/**
+ * @brief Builds an image pyramid of float32 arising from a single
+    original image - that are successively downscaled w.r.t. the
+    pre-set levels.
+ *
+ * @param src Input single-channel image of type 8U or 32F
+ * @param pyr Output array containing nLevels downscaled image copies
+ * @param nLevels Number of pyramid levels to produce
+ * @param scaleBy2 to scale images 2x down or by a factor of 1/(2)^(1/4) which is approximated as 0.8408964 (ORB downscaling),
+ *                 ORB scaling is not supported for floating-point images
+ * @param borderType how to process border, the options are BORDER_REFLECT (maps to FASTCV_BORDER_REFLECT),
+ *                   BORDER_REFLECT_101 (maps to FASTCV_BORDER_REFLECT_V2) and BORDER_REPLICATE (maps to FASTCV_BORDER_REPLICATE).
+ *                   Other border types are mapped to FASTCV_BORDER_UNDEFINED. Ignored for floating-point images
+ * @param borderValue what value should be used to fill border, ignored for floating-point images
+ */
+CV_EXPORTS_W void buildPyramid(InputArray src, OutputArrayOfArrays pyr, int nLevels, bool scaleBy2 = true,
+                               int borderType = cv::BORDER_REFLECT, uint8_t borderValue = 0);
+
+//! @}
+
+} // fastcv::
+} // cv::
+
+#endif // OPENCV_FASTCV_PYRAMID_HPP
diff --git a/modules/fastcv/include/opencv2/fastcv/tracking.hpp b/modules/fastcv/include/opencv2/fastcv/tracking.hpp
new file mode 100644
index 00000000000..95b9ab2466e
--- /dev/null
+++ b/modules/fastcv/include/opencv2/fastcv/tracking.hpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#ifndef OPENCV_FASTCV_TRACKING_HPP
+#define OPENCV_FASTCV_TRACKING_HPP
+
+#include <opencv2/core.hpp>
+
+namespace cv {
+namespace fastcv {
+
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Calculates sparse optical flow using Lucas-Kanade algorithm
+ *
+ * @param src Input single-channel image of type 8U, initial motion frame
+ * @param dst Input single-channel image of type 8U, final motion frame, should have the same size and stride as initial frame
+ * @param srcPyr Pyramid built from initial motion frame
+ * @param dstPyr Pyramid built from final motion frame
+ * @param ptsIn Array of initial subpixel coordinates of starting points, should contain 32F 2D elements
+ * @param ptsOut Output array of calculated final points, should contain 32F 2D elements
+ * @param ptsEst Input array of estimations for final points, should contain 32F 2D elements, can be empty
+ * @param statusVec Output array of int32 values indicating status of each feature, can be empty
+ * @param winSize Size of window for optical flow searching. Width and height must be odd numbers. Suggested values are 5, 7 or 9
+ * @param termCriteria Termination criteria containing max number of iterations, max epsilon and stop condition
+ */
+CV_EXPORTS void trackOpticalFlowLK(InputArray src, InputArray dst,
+                                   InputArrayOfArrays srcPyr, InputArrayOfArrays dstPyr,
+                                   InputArray ptsIn, OutputArray ptsOut, InputArray ptsEst,
+                                   OutputArray statusVec, cv::Size winSize = {7, 7},
+                                   cv::TermCriteria termCriteria = {cv::TermCriteria::MAX_ITER | cv::TermCriteria::EPS,
+                                                                    /* maxIterations */ 7,
+                                                                    /* maxEpsilon */ 0.03f * 0.03f});
+
+/**
+ * @brief Overload for v1 of the LK tracking function
+ *
+ * @param src Input single-channel image of type 8U, initial motion frame
+ * @param dst Input single-channel image of type 8U, final motion frame, should have the same size and stride as initial frame
+ * @param srcPyr Pyramid built from initial motion frame
+ * @param dstPyr Pyramid built from final motion frame
+ * @param srcDxPyr Pyramid of Sobel derivative by X of srcPyr
+ * @param srcDyPyr Pyramid of Sobel derivative by Y of srcPyr
+ * @param ptsIn Array of initial subpixel coordinates of starting points, should contain 32F 2D elements
+ * @param ptsOut Output array of calculated final points, should contain 32F 2D elements
+ * @param statusVec Output array of int32 values indicating status of each feature, can be empty
+ * @param winSize Size of window for optical flow searching. Width and height must be odd numbers. Suggested values are 5, 7 or 9
+ * @param maxIterations Maximum number of iterations to try
+ */
+CV_EXPORTS void trackOpticalFlowLK(InputArray src, InputArray dst,
+                                   InputArrayOfArrays srcPyr, InputArrayOfArrays dstPyr,
+                                   InputArrayOfArrays srcDxPyr, InputArrayOfArrays srcDyPyr,
+                                   InputArray ptsIn, OutputArray ptsOut,
+                                   OutputArray statusVec, cv::Size winSize = {7, 7}, int maxIterations = 7);
+
+//! @}
+
+} // fastcv::
+} // cv::
+
+#endif // OPENCV_FASTCV_TRACKING_HPP
diff --git a/modules/fastcv/perf/perf_bilateral.cpp b/modules/fastcv/perf/perf_bilateral.cpp
index bb985da391d..63323d459cc 100644
--- a/modules/fastcv/perf/perf_bilateral.cpp
+++ b/modules/fastcv/perf/perf_bilateral.cpp
@@ -7,10 +7,10 @@
 
 namespace opencv_test {
 
-typedef std::tuple<float /*sigmaColor*/, float /*sigmaSpace*/> BilateralPerfParams;
-typedef perf::TestBaseWithParam<BilateralPerfParams> BilateralPerfTest;
+typedef std::tuple<float /*sigmaColor*/, float /*sigmaSpace*/> BilateralRecursivePerfParams;
+typedef perf::TestBaseWithParam<BilateralRecursivePerfParams> BilateralRecursivePerfTest;
 
-PERF_TEST_P(BilateralPerfTest, run,
+PERF_TEST_P(BilateralRecursivePerfTest, run,
     ::testing::Combine(::testing::Values(0.01f, 0.03f, 0.1f, 1.f, 5.f),
                        ::testing::Values(0.01f, 0.05f, 0.1f, 1.f, 5.f))
            )
@@ -32,14 +32,15 @@ PERF_TEST_P(BilateralPerfTest, run,
     SANITY_CHECK_NOTHING();
 }
 
-typedef std::tuple<float /*sigmaColor*/, float /*sigmaSpace*/, cv::Size, int > BilateralPerfParams2;
-typedef perf::TestBaseWithParam<BilateralPerfParams2> BilateralPerfTest2;
 
+typedef std::tuple<float /*sigmaColor*/, float /*sigmaSpace*/, cv::Size, int > BilateralPerfParams;
+typedef perf::TestBaseWithParam<BilateralPerfParams> BilateralPerfTest;
 
-PERF_TEST_P(BilateralPerfTest2, run,
+
+PERF_TEST_P(BilateralPerfTest, run,
     ::testing::Combine(::testing::Values(0.01f, 0.03f, 0.1f, 1.f, 5.f),
                        ::testing::Values(0.01f, 0.05f, 0.1f, 1.f, 5.f),
-					   ::testing::Values(Size(8, 8), Size(640, 480), Size(800, 600)),
+                       ::testing::Values(Size(8, 8), Size(640, 480), Size(800, 600)),
                        ::testing::Values(5, 7, 9))
            )
 {
@@ -47,17 +48,17 @@ PERF_TEST_P(BilateralPerfTest2, run,
     float sigmaColor = std::get<0>(p);
     float sigmaSpace = std::get<1>(p);
     cv::Size size  = std::get<2>(p);
-	int d = get<3>(p);
+    int d = get<3>(p);
 
     RNG& rng = cv::theRNG();
     Mat src(size, CV_8UC1);
     cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256));
     Mat dst;
 
-    for (;  next(); )
+    while (next())
     {
         startTimer();
-		cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace);
+        cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace);
         stopTimer();
     }
 
diff --git a/modules/fastcv/perf/perf_fft_dct.cpp b/modules/fastcv/perf/perf_fft_dct.cpp
new file mode 100644
index 00000000000..829d2aaa766
--- /dev/null
+++ b/modules/fastcv/perf/perf_fft_dct.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "perf_precomp.hpp"
+
+namespace opencv_test {
+
+typedef perf::TestBaseWithParam<cv::Size> FFTExtPerfTest;
+
+// Measures cv::fastcv::FFT() on a random 8-bit single-channel image of the parameterized size
+PERF_TEST_P_(FFTExtPerfTest, forward)
+{
+    Size size = GetParam();
+
+    RNG& rng = cv::theRNG();
+    Mat src(size, CV_8UC1);
+    cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256));
+
+    // the 8-bit image is passed as is, so no float copy of the input is prepared
+    Mat dst;
+
+    while(next())
+    {
+        startTimer();
+        cv::fastcv::FFT(src, dst);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+// Measures cv::fastcv::IFFT() applied to the forward transform of a random 8-bit image
+PERF_TEST_P_(FFTExtPerfTest, inverse)
+{
+    Size size = GetParam();
+
+    RNG& rng = cv::theRNG();
+    Mat src(size, CV_8UC1);
+    cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256));
+
+    // the forward transform is computed once, outside of the timed loop;
+    // it takes the 8-bit image directly, so no float copy of the input is prepared
+    Mat fwd, back;
+    cv::fastcv::FFT(src, fwd);
+
+    while(next())
+    {
+        startTimer();
+        cv::fastcv::IFFT(fwd, back);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, FFTExtPerfTest,
+    ::testing::Values(Size(8, 8), Size(128, 128), Size(32, 256), Size(512, 512),
+                      Size(32, 1), Size(512, 1)));
+
+/// DCT ///
+
+typedef perf::TestBaseWithParam<cv::Size> DCTExtPerfTest;
+
+// Measures cv::fastcv::DCT() on a random 8-bit single-channel image of the parameterized size
+PERF_TEST_P_(DCTExtPerfTest, forward)
+{
+    Size size = GetParam();
+
+    RNG& rng = cv::theRNG();
+    Mat src(size, CV_8UC1);
+    cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256));
+
+    Mat dst;
+
+    while(next())
+    {
+        startTimer();
+        cv::fastcv::DCT(src, dst);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+// Measures cv::fastcv::IDCT() applied to the forward transform of a random 8-bit image
+PERF_TEST_P_(DCTExtPerfTest, inverse)
+{
+    Size size = GetParam();
+
+    RNG& rng = cv::theRNG();
+    Mat src(size, CV_8UC1);
+    cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256));
+
+    // the forward transform is computed once, outside of the timed loop;
+    // it takes the 8-bit image directly, so no float copy of the input is prepared
+    Mat fwd, back;
+    cv::fastcv::DCT(src, fwd);
+
+    while(next())
+    {
+        startTimer();
+        cv::fastcv::IDCT(fwd, back);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, DCTExtPerfTest,
+    ::testing::Values(Size(8, 8), Size(128, 128), Size(32, 256), Size(512, 512)));
+} // namespace
diff --git a/modules/fastcv/perf/perf_hough.cpp b/modules/fastcv/perf/perf_hough.cpp
index 78424a696dc..53194d3100a 100644
--- a/modules/fastcv/perf/perf_hough.cpp
+++ b/modules/fastcv/perf/perf_hough.cpp
@@ -41,4 +41,51 @@ PERF_TEST_P(HoughLinesPerfTest, run,
     SANITY_CHECK_NOTHING();
 }
 
+
+typedef std::tuple<std::string /* file name */, uint32_t /* minDist */,   uint32_t /* cannyThreshold */,
+                   uint32_t /* accThreshold */, uint32_t /* minRadius */, uint32_t /* maxRadius */> HoughCirclesPerfTestParams;
+typedef ::perf::TestBaseWithParam<HoughCirclesPerfTestParams> HoughCirclesPerf;
+
+// NOTE: test files should be manually loaded to folder on a device, for example like this:
+// adb push fastcv/misc/hough/ /sdcard/testdata/fastcv/hough/
+
+PERF_TEST_P(HoughCirclesPerf, run,
+                ::testing::Values(
+                            HoughCirclesPerfTestParams {"fastcv/hough/kandinsky-circles_2.jpg", 100, 100, 50, 10, 100 },
+                            HoughCirclesPerfTestParams {"fastcv/hough/kandinsky-circles_2.jpg", 100, 100, 50, 30, 100 },
+                            HoughCirclesPerfTestParams {"fastcv/hough/kandinsky-circles_2.jpg", 100, 100, 50, 50, 100 },
+                            HoughCirclesPerfTestParams {"fastcv/hough/kandinsky-circles_2.jpg",  10, 100, 50, 10, 100 },
+                            HoughCirclesPerfTestParams {"fastcv/hough/kandinsky-circles_2.jpg",  10, 100, 50, 30, 100 },
+                            HoughCirclesPerfTestParams {"fastcv/hough/kandinsky-circles_2.jpg",  10, 100, 50, 50, 100 }
+                         )
+           )
+{
+    auto p = GetParam();
+    std::string fname       = std::get<0>(p);
+    uint32_t minDist        = std::get<1>(p);
+    uint32_t cannyThreshold = std::get<2>(p);
+    uint32_t accThreshold   = std::get<3>(p);
+    uint32_t minRadius      = std::get<4>(p);
+    uint32_t maxRadius      = std::get<5>(p);
+
+    cv::Mat src = imread(cvtest::findDataFile(fname), cv::IMREAD_GRAYSCALE);
+    // pad the right side so that the width becomes a multiple of 8 (an already aligned width still gets 8 extra columns)
+    cv::Mat withBorder;
+    int bpix = ((src.cols & 0xfffffff8) + 8) - src.cols;
+    cv::copyMakeBorder(src, withBorder, 0, 0, 0, bpix, BORDER_REFLECT101);
+    src = withBorder;
+
+    while(next())
+    {
+        Mat icircles;
+        startTimer();
+        cv::fastcv::houghCircles(src, icircles, minDist,
+                                 cannyThreshold, accThreshold,
+                                 minRadius, maxRadius);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
 } // namespace
diff --git a/modules/fastcv/perf/perf_mser.cpp b/modules/fastcv/perf/perf_mser.cpp
index 4e1a6ce80af..11787f4c99e 100644
--- a/modules/fastcv/perf/perf_mser.cpp
+++ b/modules/fastcv/perf/perf_mser.cpp
@@ -36,30 +36,31 @@ PERF_TEST_P(MSERPerfTest, run,
     float        maxVariation = 0.15f;
     float        minDiversity = 0.2f;
 
+    cv::Ptr<cv::fastcv::MSER> mser;
+    mser = cv::fastcv::MSER::create(src.size(), numNeighbors, delta, minArea, maxArea,
+                                    maxVariation, minDiversity);
+
     while(next())
     {
         std::vector<std::vector<Point>> contours;
         std::vector<cv::Rect> bboxes;
-        std::vector<cv::fastcv::ContourData> contourData;
+        std::vector<cv::fastcv::MSER::ContourData> contourData;
 
         startTimer();
         if (useBboxes)
         {
             if (useContourData)
             {
-                cv::fastcv::MSER(src, contours, bboxes, contourData, numNeighbors,
-                                 delta, minArea, maxArea, maxVariation, minDiversity);
+                mser->detect(src, contours, bboxes, contourData);
             }
             else
             {
-                cv::fastcv::MSER(src, contours, bboxes, numNeighbors,
-                                 delta, minArea, maxArea, maxVariation, minDiversity);
+                mser->detect(src, contours, bboxes);
             }
         }
         else
         {
-            cv::fastcv::MSER(src, contours, numNeighbors,
-                             delta, minArea, maxArea, maxVariation, minDiversity);
+            mser->detect(src, contours);
         }
         stopTimer();
     }
diff --git a/modules/fastcv/perf/perf_pyramid.cpp b/modules/fastcv/perf/perf_pyramid.cpp
new file mode 100644
index 00000000000..27c0fae8d59
--- /dev/null
+++ b/modules/fastcv/perf/perf_pyramid.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "perf_precomp.hpp"
+
+namespace opencv_test {
+
+typedef std::tuple<bool /*useFloat*/, int /*nLevels*/, bool /*scaleBy2*/> PyramidTestParams;
+class PyramidTest : public ::perf::TestBaseWithParam<PyramidTestParams> { };
+
+PERF_TEST_P(PyramidTest, checkAllVersions, // useFloat, nLevels, scaleBy2
+                        ::testing::Values(
+                            PyramidTestParams { true, 2,  true}, PyramidTestParams { true, 3,  true}, PyramidTestParams { true, 4,  true},
+                            PyramidTestParams {false, 2,  true}, PyramidTestParams {false, 3,  true}, PyramidTestParams {false, 4,  true},
+                            PyramidTestParams {false, 2, false}, PyramidTestParams {false, 3, false}, PyramidTestParams {false, 4, false}
+                            ))
+{
+    auto par = GetParam();
+
+    bool useFloat = std::get<0>(par);
+    int  nLevels  = std::get<1>(par);
+    bool scaleBy2 = std::get<2>(par);
+
+    cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE);
+
+    if (useFloat)
+    {
+        cv::Mat f;
+        src.convertTo(f, CV_32F);
+        src = f;
+    }
+
+    while(next())
+    {
+        std::vector<cv::Mat> pyr;
+        startTimer();
+        cv::fastcv::buildPyramid(src, pyr, nLevels, scaleBy2);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+
+typedef std::tuple<MatType, size_t> SobelPyramidTestParams;
+class SobelPyramidTest : public ::perf::TestBaseWithParam<SobelPyramidTestParams> {};
+
+PERF_TEST_P(SobelPyramidTest, checkAllTypes,
+    ::testing::Combine(::testing::Values(CV_8S, CV_16S, CV_32F),
+                       ::testing::Values(3, 6)))
+{
+    auto p = GetParam();
+    int    type    = std::get<0>(p);
+    size_t nLevels = std::get<1>(p);
+
+    // NOTE: the input image is looked up through cvtest::findDataFile(), so the test data folder
+    // that contains cv/shared/baboon.png has to be available on the device
+    cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE);
+
+    std::vector<cv::Mat> pyr;
+    cv::fastcv::buildPyramid(src, pyr, nLevels);
+
+    while(next())
+    {
+        std::vector<cv::Mat> pyrDx, pyrDy;
+        startTimer();
+        cv::fastcv::sobelPyramid(pyr, pyrDx, pyrDy, type);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+} // namespace
diff --git a/modules/fastcv/perf/perf_tracking.cpp b/modules/fastcv/perf/perf_tracking.cpp
new file mode 100644
index 00000000000..fc5d10eccdf
--- /dev/null
+++ b/modules/fastcv/perf/perf_tracking.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "perf_precomp.hpp"
+
+namespace opencv_test {
+
+typedef std::tuple<int /*winSize*/, bool /*useSobelPyramid*/, bool /*useInitialEstimate*/ > TrackingTestParams;
+class TrackingTest : public ::perf::TestBaseWithParam<TrackingTestParams> {};
+
+PERF_TEST_P(TrackingTest, checkAllVersions,
+    ::testing::Combine(::testing::Values(5, 7, 9), // window size
+                       ::testing::Bool(),          // useSobelPyramid
+                       ::testing::Bool()           // useInitialEstimate
+                      ))
+{
+    auto par = GetParam();
+
+    int winSz               = std::get<0>(par);
+    bool useSobelPyramid    = std::get<1>(par);
+    bool useInitialEstimate = std::get<2>(par);
+
+    cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE);
+
+    double ang = 5.0 * CV_PI / 180.0;
+    cv::Matx33d tr = {
+        cos(ang), -sin(ang), 1,
+        sin(ang),  cos(ang), 2,
+               0,         0, 1
+    };
+    cv::Matx33d orig {
+        1, 0, -(double)src.cols / 2,
+        0, 1, -(double)src.rows / 2,
+        0, 0, 1
+    };
+    cv::Matx33d back {
+        1, 0, (double)src.cols / 2,
+        0, 1, (double)src.rows / 2,
+        0, 0, 1
+    };
+    cv::Matx23d trans = (back * tr * orig).get_minor<2, 3>(0, 0);
+
+    cv::Mat dst;
+    cv::warpAffine(src, dst, trans, src.size());
+
+    int nLevels = 4;
+    std::vector<cv::Mat> srcPyr, dstPyr;
+
+    cv::buildPyramid(src, srcPyr, nLevels - 1);
+    cv::buildPyramid(dst, dstPyr, nLevels - 1);
+
+    // random starting points are taken from the central half of the image in each dimension
+    int nPts = 32;
+    std::vector<cv::Point2f> ptsIn, ptsEst;
+    for (int i = 0; i < nPts; i++)
+    {
+        cv::Point2f p { (((float)cv::theRNG())*0.5f + 0.25f) * src.cols,
+                        (((float)cv::theRNG())*0.5f + 0.25f) * src.rows };
+        ptsIn.push_back(p);
+        // the initial estimate, when it is requested, is the starting point itself
+        ptsEst.push_back(p);
+    }
+
+    cv::TermCriteria termCrit;
+    termCrit.type = cv::TermCriteria::COUNT | cv::TermCriteria::EPS;
+    termCrit.maxCount = 7;
+    termCrit.epsilon = 0.03f * 0.03f;
+
+    std::vector<cv::Mat> srcDxPyr, srcDyPyr;
+    if (useSobelPyramid)
+    {
+        cv::fastcv::sobelPyramid(srcPyr, srcDxPyr, srcDyPyr, CV_8S);
+    }
+
+    while(next())
+    {
+        std::vector<int32_t> statusVec(nPts);
+        std::vector<cv::Point2f> ptsOut(nPts);
+        startTimer();
+        if (useSobelPyramid)
+        {
+            cv::fastcv::trackOpticalFlowLK(src, dst, srcPyr, dstPyr, srcDxPyr, srcDyPyr,
+                                           ptsIn, ptsOut, statusVec, {winSz, winSz});
+        }
+        else
+        {
+            cv::fastcv::trackOpticalFlowLK(src, dst, srcPyr, dstPyr, ptsIn, ptsOut, (useInitialEstimate ? ptsEst : noArray()),
+                                           statusVec, {winSz, winSz}, termCrit);
+        }
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+} // namespace
diff --git a/modules/fastcv/src/bilateralFilter.cpp b/modules/fastcv/src/bilateralFilter.cpp
index 1cd0ece6b14..a0995347b24 100644
--- a/modules/fastcv/src/bilateralFilter.cpp
+++ b/modules/fastcv/src/bilateralFilter.cpp
@@ -12,54 +12,45 @@ class FcvFilterLoop_Invoker : public cv::ParallelLoopBody
 {
 public:
 
-    FcvFilterLoop_Invoker(cv::Mat src_, size_t src_step_, cv::Mat dst_, size_t dst_step_, int width_, int height_,  int bdr_, int knl_, float32_t sigma_color_, float32_t sigma_space_) :
+    FcvFilterLoop_Invoker(cv::Mat src_, size_t src_step_, cv::Mat dst_, size_t dst_step_, int width_, int height_,
+                          int bdr_, int knl_, float32_t sigma_color_, float32_t sigma_space_) :
         cv::ParallelLoopBody(), src_step(src_step_), dst_step(dst_step_), width(width_), height(height_),
         bdr(bdr_), knl(knl_), sigma_color(sigma_color_), sigma_space(sigma_space_), src(src_), dst(dst_)
-    {
-    }
+    { }
 
     virtual void operator()(const cv::Range& range) const CV_OVERRIDE
     {
-
-        fcvStatus status = FASTCV_SUCCESS;
-		int height_ = range.end - range.start;
+        int height_ = range.end - range.start;
         int width_  = width;
 		cv::Mat src_;
 		int n = knl/2;
 
-		if(range.start == 0 && range.end == height)
-		{
-			src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U);
-			cv::copyMakeBorder(src, src_, n, n, n, n, bdr);
-		}
-		else if(range.start == 0)
-		{
-			src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U);
-			cv::copyMakeBorder(src(cv::Rect(0, 0, width_, height_ + n)), src_, n, 0, n, n, bdr);
-		}
-		else if(range.end == (height))
+        src_ = cv::Mat(height_ + 2 * n, width_ + 2 * n, CV_8U);
+        if (range.start == 0 && range.end == height)
         {
-			src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U);
-			cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + n)), src_, 0, n, n, n, bdr);
-		}
-		else
-		{
-			src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U);
-			cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + 2*n)), src_, 0, 0, n, n, bdr);
-		}
-
+            cv::copyMakeBorder(src, src_, n, n, n, n, bdr);
+        }
+        else if (range.start == 0)
+        {
+            cv::copyMakeBorder(src(cv::Rect(0, 0, width_, height_ + n)), src_, n, 0, n, n, bdr);
+        }
+        else if (range.end == (height))
+        {
+            cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + n)), src_, 0, n, n, n, bdr);
+        }
+        else
+        {
+            cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + 2 * n)), src_, 0, 0, n, n, bdr);
+        }
 
 		cv::Mat dst_padded = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U);
 
-		if(knl == 5)
-		    status = fcvBilateralFilter5x5u8_v3(src_.data, width_ + 2*n, height_ + 2*n, width_ + 2*n,
-		                                        dst_padded.data, width_ + 2*n, sigma_color, sigma_space, 0);
-		else if(knl == 7)
-		    status = fcvBilateralFilter7x7u8_v3(src_.data, width_ + 2*n, height_ + 2*n, width_ + 2*n,
-		                                        dst_padded.data, width_ + 2*n, sigma_color, sigma_space, 0);
-		else if(knl == 9)
-		    status = fcvBilateralFilter9x9u8_v3(src_.data, width_ + 2*n, height_ + 2*n, width_ + 2*n,
-		                                        dst_padded.data, width_ + 2*n, sigma_color, sigma_space, 0);
+        auto func = (knl == 5) ? fcvBilateralFilter5x5u8_v3 :
+                    (knl == 7) ? fcvBilateralFilter7x7u8_v3 :
+                    (knl == 9) ? fcvBilateralFilter9x9u8_v3 :
+                    nullptr;
+        func(src_.data, width_ + 2 * n, height_ + 2 * n, width_ + 2 * n,
+             dst_padded.data, width_ + 2 * n, sigma_color, sigma_space, 0);
 
 		cv::Mat dst_temp1 = dst_padded(cv::Rect(n, n, width_, height_));
 		cv::Mat dst_temp2 = dst(cv::Rect(0, range.start, width_, height_));
@@ -97,20 +88,21 @@ void bilateralFilter( InputArray _src, OutputArray _dst, int d,
     Size size = _src.size();
 	_dst.create( size, type );
     Mat src = _src.getMat();
-	Mat dst = _dst.getMat();
+    Mat dst = _dst.getMat();
+
+    CV_Assert(src.data != dst.data);
 
     if( sigmaColor <= 0 )
+	{
         sigmaColor = 1;
+	}
     if( sigmaSpace <= 0 )
+	{
         sigmaSpace = 1;
+	}
 
-	int nStripes = 1;
-	if(src.rows/20 == 0)
-		nStripes = 1;
-	else
-		nStripes = (src.rows/20);
-
-	cv::parallel_for_(cv::Range(0, src.rows),
+    int nStripes = (src.rows / 20 == 0) ? 1 : (src.rows / 20);
+    cv::parallel_for_(cv::Range(0, src.rows),
               FcvFilterLoop_Invoker(src, src.step, dst, dst.step, src.cols, src.rows, borderType, d, sigmaColor, sigmaSpace), nStripes);
 }
 
diff --git a/modules/fastcv/src/hough.cpp b/modules/fastcv/src/hough.cpp
index 248f6b3517a..e46f64ac3f1 100644
--- a/modules/fastcv/src/hough.cpp
+++ b/modules/fastcv/src/hough.cpp
@@ -31,5 +31,38 @@ void houghLines(InputArray _src, OutputArray _lines, double threshold)
     lines(Range::all(), Range(0, nLines)).copyTo(_lines);
 }
 
+
+void houghCircles(InputArray _src, OutputArray _circles, uint32_t minDist,
+                  uint32_t cannyThreshold, uint32_t accThreshold,
+                  uint32_t minRadius, uint32_t maxRadius)
+{
+    INITIALIZATION_CHECK;
+    CV_Assert(!_src.empty() && _src.type() == CV_8UC1);
+    CV_Assert(_src.step() % 8 == 0);
+
+    Mat src = _src.getMat();
+    // the source data pointer has to be 16-byte aligned
+    CV_Assert((size_t)(src.data) % 16 == 0);
+
+    // capacity of the temporary result array handed to FastCV
+    const uint32_t maxCircles = 16384;
+    Mat circles(1, maxCircles, CV_32SC3);
+    // Start from zero: the result array is not initialized, so nothing must be copied
+    // out of it unless FastCV actually reports detections through this counter
+    // NOTE(review): assumes the counter is a pure output of fcvHoughCircleu8 - confirm
+    uint32_t nCircles = 0;
+
+    AutoBuffer<uint8_t> tempBuf;
+    tempBuf.allocate(16 * src.step * src.rows);
+    CV_Assert((size_t)(tempBuf.data()) % 16 == 0);
+
+    fcvHoughCircleu8(src.data, src.cols, src.rows, src.step,
+                     (fcvCircle*)circles.data, &nCircles, maxCircles,
+                     minDist, cannyThreshold, accThreshold,
+                     minRadius, maxRadius, tempBuf.data());
+    _circles.create(1, nCircles, CV_32SC3);
+    circles(Range::all(), Range(0, nCircles)).copyTo(_circles);
+}
+
 } // fastcv::
 } // cv::
diff --git a/modules/fastcv/src/ipptransform.cpp b/modules/fastcv/src/ipptransform.cpp
new file mode 100644
index 00000000000..d5bfb259074
--- /dev/null
+++ b/modules/fastcv/src/ipptransform.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "precomp.hpp"
+
+namespace cv {
+namespace fastcv {
+
+void DCT(InputArray _src, OutputArray _dst)
+{
+    INITIALIZATION_CHECK;
+    // accepted input: non-empty CV_8UC1 whose width and row stride are both multiples of 8
+    CV_Assert(!_src.empty() && _src.type() == CV_8UC1);
+    CV_Assert(_src.cols() % 8 == 0);
+    CV_Assert(_src.step() % 8 == 0);
+    Mat input = _src.getMat();
+
+    // the result has the size of the input and holds signed 16-bit values
+    _dst.create(_src.rows(), _src.cols(), CV_16SC1);
+    // a fixed-layout destination cannot be re-strided here, so an unsuitable stride can only be rejected
+    CV_Assert(_dst.step() % 8 == 0);
+    Mat output = _dst.getMat();
+
+    fcvDCTu8(input.data, input.cols, input.rows, input.step, output.ptr<short>(), output.step);
+}
+
+void IDCT(InputArray _src, OutputArray _dst)
+{
+    INITIALIZATION_CHECK;
+    // accepted input: non-empty CV_16SC1 whose width and row stride are both multiples of 8
+    CV_Assert(!_src.empty() && _src.type() == CV_16SC1);
+    CV_Assert(_src.cols() % 8 == 0);
+    CV_Assert(_src.step() % 8 == 0);
+    Mat input = _src.getMat();
+
+    // the result has the size of the input and holds unsigned 8-bit values
+    _dst.create(_src.rows(), _src.cols(), CV_8UC1);
+    // a fixed-layout destination cannot be re-strided here, so an unsuitable stride can only be rejected
+    CV_Assert(_dst.step() % 8 == 0);
+    Mat output = _dst.getMat();
+
+    fcvIDCTs16(input.ptr<short>(), input.cols, input.rows, input.step, output.data, output.step);
+}
+
+} // fastcv::
+} // cv::
diff --git a/modules/fastcv/src/moments.cpp b/modules/fastcv/src/moments.cpp
index 3a0c4249eef..e40c85a1bc8 100644
--- a/modules/fastcv/src/moments.cpp
+++ b/modules/fastcv/src/moments.cpp
@@ -20,36 +20,30 @@ cv::Moments moments(InputArray _src, bool binary)
     Mat src = _src.getMat();
 
     cv::Moments m;
-	if( size.width == 0 || size.height == 0 )
-        return m;
-
-	fcvMoments* mFCV = new fcvMoments();
+    fcvMoments mFCV;
     fcvStatus status = FASTCV_SUCCESS;
 	if(binary)
     {
-		cv::Mat src_binary(size, CV_8UC1);
-		cv::compare( src, 0, src_binary, cv::CMP_NE );
-		fcvImageMomentsu8(src_binary.data, src_binary.cols,
-		                  src_binary.rows, src_binary.step, mFCV, binary);
+        cv::Mat src_binary(size, CV_8UC1);
+        cv::compare( src, 0, src_binary, cv::CMP_NE );
+        fcvImageMomentsu8(src_binary.data, src_binary.cols,
+                        src_binary.rows, src_binary.step, &mFCV, binary);
+    }
+    else
+    {
+        switch(type)
+        {
+            case CV_8UC1:
+                fcvImageMomentsu8(src.data, src.cols, src.rows, src.step[0], &mFCV, binary);
+                break;
+            case CV_32SC1:
+                fcvImageMomentss32(src.ptr<int>(), src.cols, src.rows, src.step[0], &mFCV, binary);
+                break;
+            case CV_32FC1:
+                fcvImageMomentsf32(src.ptr<float>(), src.cols, src.rows, src.step[0], &mFCV, binary);
+                break;
+        }
     }
-	else
-	{
-		switch(type)
-		{
-			case CV_8UC1:
-			    fcvImageMomentsu8(src.data, src.cols, src.rows,
-				                  src.step, mFCV, binary);
-				break;
-			case CV_32SC1:
-			    fcvImageMomentss32((const int*)src.data, src.cols, src.rows,
-				                  src.step, mFCV, binary);
-				break;
-			case CV_32FC1:
-			    fcvImageMomentsf32((const float*)src.data, src.cols, src.rows,
-				                  src.step, mFCV, binary);
-				break;
-		}
-	}
 
 	if (status != FASTCV_SUCCESS)
     {
diff --git a/modules/fastcv/src/mser.cpp b/modules/fastcv/src/mser.cpp
index ae8519313be..a564e007a12 100644
--- a/modules/fastcv/src/mser.cpp
+++ b/modules/fastcv/src/mser.cpp
@@ -8,46 +8,99 @@
 namespace cv {
 namespace fastcv {
 
-static void runMSER(InputArray _src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes,
-                    std::vector<ContourData>& contourData,
-                    bool useBoundingBoxes = true,
-                    bool useContourData = true,
-                    unsigned int numNeighbors = 4,
-                    unsigned int delta = 2,
-                    unsigned int minArea = 30,
-                    unsigned int maxArea = 14400,
-                    float        maxVariation = 0.15f,
-                    float        minDiversity = 0.2f)
+class MSER_Impl CV_FINAL : public cv::fastcv::MSER  // concrete object handed out by MSER::create(); holds one FastCV MSER handle
 {
-    INITIALIZATION_CHECK;
+public:
+    explicit MSER_Impl(cv::Size     imgSize,  // validates the arguments and initializes mserHandle
+                       unsigned int numNeighbors,
+                       unsigned int delta,
+                       unsigned int minArea,
+                       unsigned int maxArea,
+                       float        maxVariation,
+                       float        minDiversity);
 
-    CV_Assert(!_src.empty() && _src.type() == CV_8UC1);
-    CV_Assert(_src.cols() > 50);
-    CV_Assert(_src.rows() > 5);
+    ~MSER_Impl() CV_OVERRIDE;  // releases mserHandle
 
-    Mat src = _src.getMat();
+    cv::Size     getImgSize()      CV_OVERRIDE { return imgSize;      };  // getters return the values given at construction
+    unsigned int getNumNeighbors() CV_OVERRIDE { return numNeighbors; };
+    unsigned int getDelta()        CV_OVERRIDE { return delta;        };
+    unsigned int getMinArea()      CV_OVERRIDE { return minArea;      };
+    unsigned int getMaxArea()      CV_OVERRIDE { return maxArea;      };
+    float        getMaxVariation() CV_OVERRIDE { return maxVariation; };
+    float        getMinDiversity() CV_OVERRIDE { return minDiversity; };
 
-    CV_Assert(numNeighbors == 4 || numNeighbors == 8);
-    bool useNN4 = (numNeighbors == 4);
+    void detect(InputArray src, std::vector<std::vector<Point>>& contours) CV_OVERRIDE;  // all three overloads forward to detectRegions()
+    void detect(InputArray src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes) CV_OVERRIDE;
+    void detect(InputArray src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes,
+                                std::vector<ContourData>& contourData) CV_OVERRIDE;
 
-    bool usePointsArray = !useNN4;
+    void detectRegions(InputArray src,  // shared worker; the two flags select which optional outputs are requested
+                       std::vector<std::vector<Point>>& contours,
+                       std::vector<cv::Rect>& boundingBoxes,
+                       std::vector<ContourData>& contourData,
+                       bool useBoundingBoxes = true,
+                       bool useContourData = true);
+
+    cv::Size imgSize;  // detectRegions() asserts that every input image has exactly this size
+    unsigned int numNeighbors;  // 4 or 8 (asserted in the constructor)
+    unsigned int delta;
+    unsigned int minArea;
+    unsigned int maxArea;
+    float        maxVariation;
+    float        minDiversity;
 
     void *mserHandle;
+};
 
-    bool isInitOk = false;
-    if (useNN4)
-    {
-        isInitOk = fcvMserInit(src.cols, src.rows, delta, minArea, maxArea, maxVariation, minDiversity, &mserHandle);
-    }
-    else
-    {
-        isInitOk = fcvMserNN8Init(src.cols, src.rows, delta, minArea, maxArea, maxVariation, minDiversity, &mserHandle);
-    }
 
-    if (!isInitOk)
+MSER_Impl::MSER_Impl(cv::Size     _imgSize,
+                     unsigned int _numNeighbors,
+                     unsigned int _delta,
+                     unsigned int _minArea,
+                     unsigned int _maxArea,
+                     float        _maxVariation,
+                     float        _minDiversity)
+{
+    CV_Assert(_imgSize.width > 50);  // the image must be wider than 50 and taller than 5 pixels
+    CV_Assert(_imgSize.height > 5);
+
+    CV_Assert(_numNeighbors == 4 || _numNeighbors == 8);  // only the 4- and 8-neighbor modes are accepted
+
+    INITIALIZATION_CHECK;
+
+    this->imgSize       = _imgSize;
+    this->numNeighbors  = _numNeighbors;
+    this->delta         = _delta;
+    this->minArea       = _minArea;
+    this->maxArea       = _maxArea;
+    this->maxVariation  = _maxVariation;
+    this->minDiversity  = _minDiversity;
+
+    auto initFunc = (this->numNeighbors == 4) ? fcvMserInit : fcvMserNN8Init;  // each neighborhood mode has its own FastCV initializer
+
+    if (!initFunc(this->imgSize.width, this->imgSize.height, this->delta, this->minArea, this->maxArea,
+                  this->maxVariation, this->minDiversity, &this->mserHandle))  // a zero result is treated as failure
     {
         CV_Error(cv::Error::StsInternal, "Failed to initialize MSER");
     }
+}
+
+
+MSER_Impl::~MSER_Impl()
+{
+    fcvMserRelease(this->mserHandle);  // give the handle obtained in the constructor back to FastCV
+}
+
+
+void MSER_Impl::detectRegions(InputArray _src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes,
+                              std::vector<ContourData>& contourData, bool useBoundingBoxes, bool useContourData)
+{
+    CV_Assert(!_src.empty() && _src.type() == CV_8UC1);
+    CV_Assert(_src.size() == this->imgSize);
+
+    Mat src = _src.getMat();
+
+    bool usePointsArray = (this->numNeighbors == 8);
 
     //bufSize for pts and bboxes
     const unsigned int maxContours = 16384;
@@ -76,7 +129,7 @@ static void runMSER(InputArray _src, std::vector<std::vector<Point>>& contours,
     std::vector<int8_t> contourPolarity(maxContours);
 
     int mserRetcode = -1;
-    if (useNN4)
+    if (this->numNeighbors == 4)
     {
         mserRetcode = fcvMserExtu8_v3(mserHandle, src.data, src.cols, src.rows, src.step,
                                       maxContours, &numContours,
@@ -170,33 +223,37 @@ static void runMSER(InputArray _src, std::vector<std::vector<Point>>& contours,
             contourData.push_back(data);
         }
     }
-
-    fcvMserRelease(mserHandle);
 }
 
-void MSER(InputArray _src, std::vector<std::vector<Point>> &contours,
-          unsigned int numNeighbors, unsigned int delta, unsigned int minArea, unsigned int maxArea, float maxVariation, float minDiversity)
+void MSER_Impl::detect(InputArray src, std::vector<std::vector<Point>> &contours)  // contours-only overload
 {
     std::vector<cv::Rect> boundingBoxes;
     std::vector<ContourData> contourData;
-    runMSER(_src, contours, boundingBoxes, contourData, false, false, numNeighbors,
-            delta, minArea, maxArea, maxVariation, minDiversity);
+    this->detectRegions(src, contours, boundingBoxes, contourData, /*useBoundingBoxes*/ false, /*useContourData*/ false);  // both locals are discarded on return
 }
 
-void MSER(InputArray _src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes,
-          unsigned int numNeighbors, unsigned int delta, unsigned int minArea, unsigned int maxArea, float maxVariation, float minDiversity)
+void MSER_Impl::detect(InputArray src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes)  // contours + bounding boxes overload
 {
     std::vector<ContourData> contourData;
-    runMSER(_src, contours, boundingBoxes, contourData, true, false, numNeighbors,
-            delta, minArea, maxArea, maxVariation, minDiversity);
+    this->detectRegions(src, contours, boundingBoxes, contourData, /*useBoundingBoxes*/ true, /*useContourData*/ false);  // the local contourData is discarded on return
+}
+
+void MSER_Impl::detect(InputArray src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes,
+                       std::vector<ContourData>& contourData)  // overload that passes all three caller-owned outputs through
+{
+    this->detectRegions(src, contours, boundingBoxes, contourData, /*useBoundingBoxes*/ true, /*useContourData*/ true);
 }
 
-void MSER(InputArray _src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes, std::vector<ContourData>& contourData,
-          unsigned int numNeighbors, unsigned int delta, unsigned int minArea, unsigned int maxArea, float maxVariation, float minDiversity)
+Ptr<MSER> MSER::create(cv::Size     imgSize,
+                       unsigned int numNeighbors,
+                       unsigned int delta,
+                       unsigned int minArea,
+                       unsigned int maxArea,
+                       float        maxVariation,
+                       float        minDiversity)
 {
-    runMSER(_src, contours, boundingBoxes, contourData, true, true, numNeighbors,
-            delta, minArea, maxArea, maxVariation, minDiversity);
+    return makePtr<MSER_Impl>(imgSize, numNeighbors, delta, minArea, maxArea, maxVariation, minDiversity);  // argument checks and FastCV initialization happen in the MSER_Impl constructor
 }
 
 } // fastcv::
-} // cv::
+} // cv::
\ No newline at end of file
diff --git a/modules/fastcv/src/precomp.hpp b/modules/fastcv/src/precomp.hpp
index d33cb25bafb..c2929d76cc1 100644
--- a/modules/fastcv/src/precomp.hpp
+++ b/modules/fastcv/src/precomp.hpp
@@ -28,6 +28,9 @@ namespace fastcv {
     CV_INSTRUMENT_REGION();                                                 \
 }
 
+#define FCV_KernelSize_SHIFT 3 // number of low bits left for the depth when a kernel size and a depth are combined
+#define FCV_MAKETYPE(ksize,depth) (((ksize) << FCV_KernelSize_SHIFT) + (depth)) // both arguments are parenthesized so any expression expands correctly
+
 const std::map<fcvStatus, std::string> fcvStatusStrings =
 {
     { FASTCV_SUCCESS,       "Success"},
diff --git a/modules/fastcv/src/pyramid.cpp b/modules/fastcv/src/pyramid.cpp
new file mode 100644
index 00000000000..24dd4928899
--- /dev/null
+++ b/modules/fastcv/src/pyramid.cpp
@@ -0,0 +1,180 @@
+// License text goes here
+
+#include "precomp.hpp"
+
+namespace cv {
+namespace fastcv {
+
+void sobelPyramid(InputArrayOfArrays _pyr, OutputArrayOfArrays _dx, OutputArrayOfArrays _dy, int outType)
+{
+    INITIALIZATION_CHECK;
+
+    // the input and both outputs have to be containers of matrices
+    CV_Assert(_pyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT ||
+              _pyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT ||
+              _pyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT);
+    CV_Assert(_dx.kind() == _InputArray::KindFlag::STD_ARRAY_MAT ||
+              _dx.kind() == _InputArray::KindFlag::STD_VECTOR_MAT ||
+              _dx.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT);
+    CV_Assert(_dy.kind() == _InputArray::KindFlag::STD_ARRAY_MAT ||
+              _dy.kind() == _InputArray::KindFlag::STD_VECTOR_MAT ||
+              _dy.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT);
+
+    std::vector<cv::Mat> pyr;
+    _pyr.getMatVector(pyr);
+    size_t nLevels = pyr.size();
+
+    // this should be smaller I guess
+    CV_Assert(nLevels > 0 && nLevels < 16);
+
+    // fcvPyramidLevel does not support other cases: every level must be continuous CV_8UC1
+    std::vector<fcvPyramidLevel> lpyr(nLevels);
+    for (size_t i = 0; i < nLevels; i++)
+    {
+        CV_Assert(pyr[i].isContinuous());
+        CV_Assert(pyr[i].type() == CV_8UC1);
+        lpyr[i].width  = pyr[i].cols;
+        lpyr[i].height = pyr[i].rows;
+        lpyr[i].ptr    = pyr[i].data;
+    }
+
+    CV_Assert(outType == CV_8S || outType == CV_16S || outType == CV_32F);
+
+    // the gradient pyramids are allocated by FastCV; the element size in bytes follows the requested depth
+    std::vector<fcvPyramidLevel> ldx(nLevels), ldy(nLevels);
+    int pyrElemSz = (outType == CV_8S ) ? 1 :
+                    (outType == CV_16S) ? 2 :
+                    (outType == CV_32F) ? 4 : 0;
+    int retCodex = fcvPyramidAllocate(ldx.data(), pyr[0].cols, pyr[0].rows, pyrElemSz, nLevels, 1);
+    if (retCodex != 0)
+    {
+        CV_Error(cv::Error::StsInternal, cv::format("fcvPyramidAllocate returned code %d", retCodex));
+    }
+    int retCodey = fcvPyramidAllocate(ldy.data(), pyr[0].cols, pyr[0].rows, pyrElemSz, nLevels, 1);
+    if (retCodey != 0)
+    {
+        // CV_Error throws: release the already allocated dx pyramid first so it does not leak
+        fcvPyramidDelete(ldx.data(), nLevels, 0);
+        CV_Error(cv::Error::StsInternal, cv::format("fcvPyramidAllocate returned code %d", retCodey));
+    }
+
+    int returnCode = -1;
+    switch (outType)
+    {
+    case CV_8S:  returnCode = fcvPyramidSobelGradientCreatei8 (lpyr.data(), ldx.data(), ldy.data(), nLevels);
+        break;
+    case CV_16S: returnCode = fcvPyramidSobelGradientCreatei16(lpyr.data(), ldx.data(), ldy.data(), nLevels);
+        break;
+    case CV_32F: returnCode = fcvPyramidSobelGradientCreatef32(lpyr.data(), ldx.data(), ldy.data(), nLevels);
+        break;
+    default:
+        break;
+    }
+
+    if (returnCode != 0)
+    {
+        // same reason as above: free both gradient pyramids before throwing
+        fcvPyramidDelete(ldx.data(), nLevels, 0);
+        fcvPyramidDelete(ldy.data(), nLevels, 0);
+        CV_Error(cv::Error::StsInternal, cv::format("FastCV returned code %d", returnCode));
+    }
+
+    // resize arrays of Mats
+    _dx.create(1, nLevels, /* type does not matter here */ -1, -1);
+    _dy.create(1, nLevels, /* type does not matter here */ -1, -1);
+
+    // copy every level out of the FastCV-allocated buffers, which are released right after the loop
+    for (size_t i = 0; i < nLevels; i++)
+    {
+        cv::Mat dx((int)ldx[i].height, (int)ldx[i].width, outType, (uchar*)ldx[i].ptr);
+        _dx.create(pyr[i].size(), outType, i);
+        dx.copyTo(_dx.getMat(i));
+
+        cv::Mat dy((int)ldy[i].height, (int)ldy[i].width, outType, (uchar*)ldy[i].ptr);
+        _dy.create(pyr[i].size(), outType, i);
+        dy.copyTo(_dy.getMat(i));
+    }
+
+    fcvPyramidDelete(ldx.data(), nLevels, 0);
+    fcvPyramidDelete(ldy.data(), nLevels, 0);
+}
+
+
+void buildPyramid(InputArray _src, OutputArrayOfArrays _pyr, int nLevels, bool scaleBy2, int borderType, uint8_t borderValue)
+{
+    INITIALIZATION_CHECK;
+
+    CV_Assert(!_src.empty() && (_src.type() == CV_8UC1 || _src.type() == CV_32FC1));
+    CV_Assert(_src.step() % 8 == 0);
+
+    cv::Mat src = _src.getMat();
+    bool useFloat = src.depth() == CV_32F;
+    int bytesPerPixel = useFloat ? 4 : 1;
+
+    CV_Assert(_pyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT ||
+              _pyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT ||
+              _pyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT);
+
+    // this should be smaller I guess
+    CV_Assert(nLevels > 0 && nLevels < 16);
+
+    if (useFloat && !scaleBy2)
+    {
+        CV_Error( cv::Error::StsBadArg, "ORB scale is not supported for float images (fcvPyramidCreatef32_v2)");
+    }
+
+    fcvPyramidScale scaleOption = scaleBy2 ? FASTCV_PYRAMID_SCALE_HALF : FASTCV_PYRAMID_SCALE_ORB;
+    fcvBorderType borderOption;
+    switch (borderType)
+    {
+    case cv::BORDER_REFLECT:     borderOption = FASTCV_BORDER_REFLECT;    break;
+    case cv::BORDER_REFLECT_101: borderOption = FASTCV_BORDER_REFLECT_V2; break;
+    case cv::BORDER_REPLICATE:   borderOption = FASTCV_BORDER_REPLICATE;  break;
+    default:                     borderOption = FASTCV_BORDER_UNDEFINED;  break;
+    }
+
+    std::vector<fcvPyramidLevel_v2> lpyrSrc2(nLevels);
+    int alignment = 8;
+    if (useFloat)
+    {
+        // use version 2
+        CV_Assert(fcvPyramidAllocate_v2(lpyrSrc2.data(), src.cols, src.rows, src.step, bytesPerPixel, nLevels, 0) == 0);
+        int createCode = fcvPyramidCreatef32_v2((const float*)src.data, src.cols, src.rows, src.step, nLevels, lpyrSrc2.data());
+        if (createCode != 0) fcvPyramidDelete_v2(lpyrSrc2.data(), nLevels, 1); // do not leak the levels allocated above
+        CV_Assert(createCode == 0);
+    }
+    else
+    {
+        // use version 4
+        fcvStatus statusAlloc = fcvPyramidAllocate_v3(lpyrSrc2.data(), src.cols, src.rows, src.step,
+                                                      bytesPerPixel, alignment, nLevels, scaleOption, 0);
+        if (statusAlloc != FASTCV_SUCCESS)
+        {
+            std::string s = fcvStatusStrings.count(statusAlloc) ? fcvStatusStrings.at(statusAlloc) : "unknown";
+            CV_Error( cv::Error::StsInternal, "fcvPyramidAllocate_v3 error: " + s);
+        }
+        fcvStatus statusPyr = fcvPyramidCreateu8_v4(src.data, src.cols, src.rows, src.step, nLevels, scaleOption,
+                                                    lpyrSrc2.data(), borderOption, borderValue);
+        if (statusPyr != FASTCV_SUCCESS)
+        {
+            fcvPyramidDelete_v2(lpyrSrc2.data(), nLevels, 1); // allocation succeeded above: release it before throwing
+            std::string s = fcvStatusStrings.count(statusPyr) ? fcvStatusStrings.at(statusPyr) : "unknown";
+            CV_Error( cv::Error::StsInternal, "fcvPyramidCreateu8_v4 error: " + s);
+        }
+    }
+
+    // create vector, then copy every level out of the FastCV buffers (honouring the per-level stride)
+    _pyr.create(nLevels, 1, src.type(), -1);
+    for (int i = 0; i < nLevels; i++)
+    {
+        cv::Mat m = cv::Mat((unsigned int)lpyrSrc2[i].height, (unsigned int)lpyrSrc2[i].width,
+                             src.type(), (void*)lpyrSrc2[i].ptr, (size_t)lpyrSrc2[i].stride);
+        _pyr.create(m.size(), m.type(), i);
+        m.copyTo(_pyr.getMat(i));
+    }
+
+    fcvPyramidDelete_v2(lpyrSrc2.data(), nLevels, 1);
+}
+
+} // namespace fastcv
+} // namespace cv
diff --git a/modules/fastcv/src/remap.cpp b/modules/fastcv/src/remap.cpp
index a0b4849ac72..0c86d65c97e 100644
--- a/modules/fastcv/src/remap.cpp
+++ b/modules/fastcv/src/remap.cpp
@@ -43,7 +43,7 @@ class RemapParallel : public cv::ParallelLoopBody {
 
         if(status!=FASTCV_SUCCESS)
         {
-			std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown";
+            std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown";
             CV_Error( cv::Error::StsInternal, "FastCV error: " + s);
         }
     }
diff --git a/modules/fastcv/src/tracking.cpp b/modules/fastcv/src/tracking.cpp
new file mode 100644
index 00000000000..dee6b17ee55
--- /dev/null
+++ b/modules/fastcv/src/tracking.cpp
@@ -0,0 +1,266 @@
+// License text goes here
+
+#include "precomp.hpp"
+
+namespace cv {
+namespace fastcv {
+
+static void trackOpticalFlowLKInternal(InputArray _src, InputArray _dst,
+                                       InputArrayOfArrays _srcPyr, InputArrayOfArrays _dstPyr,
+                                       InputArrayOfArrays _srcDxPyr, InputArrayOfArrays _srcDyPyr,
+                                       InputArray _ptsIn, OutputArray _ptsOut, InputArray _ptsEst,
+                                       OutputArray _statusVec, cv::Size winSize,
+                                       cv::TermCriteria termCriteria)
+{
+    INITIALIZATION_CHECK;
+    // the tracking window must have odd width and odd height
+    CV_Assert(winSize.width % 2 == 1 && winSize.height % 2 == 1);
+
+    CV_Assert(!_src.empty() && _src.type() == CV_8UC1);
+    CV_Assert(!_dst.empty() && _dst.type() == CV_8UC1);
+    CV_Assert(_src.size() == _dst.size());
+    CV_Assert(_src.step() % 8 == 0);
+    CV_Assert(_dst.step() == _src.step());
+
+    cv::Mat src = _src.getMat(), dst = _dst.getMat();
+
+    CV_Assert(_srcPyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT ||
+              _srcPyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT ||
+              _srcPyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT);
+    CV_Assert(_dstPyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT ||
+              _dstPyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT ||
+              _dstPyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT);
+    CV_Assert(_srcPyr.size() == _dstPyr.size());
+    // the level count is taken from the source pyramid (both pyramids were just asserted to have equal size)
+    int nLevels = _srcPyr.size().area();
+
+    std::vector<cv::Mat> srcPyr, dstPyr;
+    _srcPyr.getMatVector(srcPyr);
+    _dstPyr.getMatVector(dstPyr);
+    // level 0 must match the image size; each following level must be half the previous one (integer division)
+    cv::Size imSz = src.size();
+    for (int i = 0; i < nLevels; i++)
+    {
+        const cv::Mat& s = srcPyr[i];
+        const cv::Mat& d = dstPyr[i];
+
+        CV_Assert(!s.empty() && s.type() == CV_8UC1);
+        CV_Assert(!d.empty() && d.type() == CV_8UC1);
+        CV_Assert(s.size() == imSz);
+        CV_Assert(d.size() == imSz);
+
+        imSz.width /= 2; imSz.height /= 2;
+    }
+    // the gradient-based FastCV call (version 1) is used only when both dx and dy pyramids are supplied
+    bool useDxDy = !_srcDxPyr.empty() && !_srcDyPyr.empty();
+    int version = useDxDy ? 1 : 3;
+
+    std::vector<cv::Mat> srcDxPyr, srcDyPyr;
+    if (version == 1)
+    {
+        CV_Assert(_srcDxPyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT ||
+                  _srcDxPyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT ||
+                  _srcDxPyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT);
+        CV_Assert(_srcDyPyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT ||
+                  _srcDyPyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT ||
+                  _srcDyPyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT);
+
+        CV_Assert(_srcDxPyr.size() == _srcDyPyr.size() && _srcDxPyr.size().area() >= nLevels); // levels [0, nLevels) are indexed below
+        _srcDxPyr.getMatVector(srcDxPyr);
+        _srcDyPyr.getMatVector(srcDyPyr);
+
+        imSz = src.size();
+        for (int i = 0; i < nLevels; i++)
+        {
+            const cv::Mat& dx = srcDxPyr[i];
+            const cv::Mat& dy = srcDyPyr[i];
+
+            CV_Assert(!dx.empty() && dx.type() == CV_8SC1);
+            CV_Assert(!dy.empty() && dy.type() == CV_8SC1);
+            CV_Assert(dx.size() == imSz);
+            CV_Assert(dy.size() == imSz);
+
+            imSz.width /= 2; imSz.height /= 2;
+        }
+    }
+    // describe the pyramids for FastCV in both level formats: only the _v2 struct gets a stride
+    std::vector<fcvPyramidLevel> lpyrSrc1, lpyrDst1, lpyrDxSrc, lpyrDySrc;
+    std::vector<fcvPyramidLevel_v2> lpyrSrc2, lpyrDst2;
+    for (int i = 0; i < nLevels; i++)
+    {
+        fcvPyramidLevel lsrc1, ldst1;
+        fcvPyramidLevel_v2 lsrc2, ldst2;
+        lsrc1.width  = srcPyr[i].cols;
+        lsrc1.height = srcPyr[i].rows;
+        lsrc1.ptr    = srcPyr[i].data;
+
+        lsrc2.width  = srcPyr[i].cols;
+        lsrc2.height = srcPyr[i].rows;
+        lsrc2.stride = srcPyr[i].step;
+        lsrc2.ptr    = srcPyr[i].data;
+
+        ldst1.width  = dstPyr[i].cols;
+        ldst1.height = dstPyr[i].rows;
+        ldst1.ptr    = dstPyr[i].data;
+        ldst2.width  = dstPyr[i].cols;
+        ldst2.height = dstPyr[i].rows;
+        ldst2.stride = dstPyr[i].step;
+        ldst2.ptr    = dstPyr[i].data;
+        lpyrSrc1.push_back(lsrc1); lpyrDst1.push_back(ldst1);
+        lpyrSrc2.push_back(lsrc2); lpyrDst2.push_back(ldst2);
+
+        if (version == 1)
+        {
+            fcvPyramidLevel ldx, ldy;
+            CV_Assert(srcDxPyr[i].isContinuous());
+            ldx.width  = srcDxPyr[i].cols;
+            ldx.height = srcDxPyr[i].rows;
+            ldx.ptr    = srcDxPyr[i].data;
+            CV_Assert(srcDyPyr[i].isContinuous());
+            ldy.width  = srcDyPyr[i].cols;
+            ldy.height = srcDyPyr[i].rows;
+            ldy.ptr    = srcDyPyr[i].data;
+            lpyrDxSrc.push_back(ldx); lpyrDySrc.push_back(ldy);
+        }
+    }
+
+    CV_Assert(!_ptsIn.empty() && (_ptsIn.type() == CV_32FC1 || _ptsIn.type() == CV_32FC2));
+    CV_Assert(_ptsIn.isContinuous());
+    CV_Assert(_ptsIn.total() * _ptsIn.channels() % 2 == 0);
+
+    cv::Mat ptsIn = _ptsIn.getMat();
+    int nPts = ptsIn.total() * ptsIn.channels() / 2;
+    // without an initial estimate the input points are passed in its place, so the pointer is always valid
+    bool useInitialEstimate;
+    cv::Mat ptsEst;
+    const float32_t* ptsEstData;
+    if (!_ptsEst.empty())
+    {
+        CV_Assert(_ptsEst.type() == CV_32FC1 || _ptsEst.type() == CV_32FC2);
+        CV_Assert(_ptsEst.isContinuous());
+        int estElems = _ptsEst.total() * _ptsEst.channels();
+        CV_Assert(estElems % 2 == 0);
+        CV_Assert(estElems / 2 == nPts);
+
+        ptsEst = _ptsEst.getMat();
+        ptsEstData = (const float32_t*)ptsEst.data;
+        useInitialEstimate = true;
+    }
+    else
+    {
+        useInitialEstimate = false;
+        ptsEstData = (const float32_t*)ptsIn.data;
+    }
+
+    CV_Assert(_ptsOut.needed());
+    _ptsOut.create(1, nPts, CV_32FC2);
+    cv::Mat ptsOut = _ptsOut.getMat();
+    // the status buffer is always passed to FastCV; write into the caller's array whenever one was supplied
+    cv::Mat statusVec;
+    if (_statusVec.needed())
+    {
+        _statusVec.create(1, nPts, CV_32SC1);
+        statusVec = _statusVec.getMat();
+    }
+    else
+    {
+        statusVec = cv::Mat(1, nPts, CV_32SC1);
+    }
+    // translate the cv::TermCriteria flags into the FastCV termination mode
+    fcvTerminationCriteria termCrit;
+    if (termCriteria.type & cv::TermCriteria::COUNT)
+    {
+        if (termCriteria.type & cv::TermCriteria::EPS)
+        {
+            termCrit = FASTCV_TERM_CRITERIA_BOTH;
+        }
+        else
+        {
+            termCrit = FASTCV_TERM_CRITERIA_ITERATIONS;
+        }
+    }
+    else
+    {
+        if (termCriteria.type & cv::TermCriteria::EPS)
+        {
+            termCrit = FASTCV_TERM_CRITERIA_EPSILON;
+        }
+        else
+        {
+            CV_Error(cv::Error::StsBadArg, "Incorrect termination criteria");
+        }
+    }
+    int maxIterations = termCriteria.maxCount;
+    double maxEpsilon = termCriteria.epsilon;
+
+    fcvStatus status = FASTCV_SUCCESS;
+
+    if (version == 3)
+    {
+        status = fcvTrackLKOpticalFlowu8_v3(src.data, dst.data, src.cols, src.rows, src.step,
+                                            lpyrSrc2.data(), lpyrDst2.data(),
+                                            (const float32_t*)ptsIn.data,
+                                            ptsEstData,
+                                            (float32_t*)ptsOut.data,
+                                            (int32_t*)statusVec.data,
+                                            nPts,
+                                            winSize.width, winSize.height,
+                                            nLevels,
+                                            termCrit, maxIterations, maxEpsilon,
+                                            useInitialEstimate);
+    }
+    else // if (version == 1)
+    {
+        CV_Assert(src.isContinuous() && dst.isContinuous());
+        // Obsolete parameters, set to 0
+        float maxResidue = 0, minDisplacement = 0, minEigenvalue = 0;
+        int lightingNormalized = 0;
+        fcvTrackLKOpticalFlowu8(src.data, dst.data, src.cols, src.rows,
+                                lpyrSrc1.data(), lpyrDst1.data(),
+                                lpyrDxSrc.data(), lpyrDySrc.data(),
+                                (const float32_t*)ptsIn.data,
+                                (float32_t*)ptsOut.data,
+                                (int32_t*)statusVec.data,
+                                nPts,
+                                winSize.width, winSize.height,
+                                maxIterations,
+                                nLevels,
+                                maxResidue, minDisplacement, minEigenvalue, lightingNormalized);
+    }
+
+    if (status != FASTCV_SUCCESS)
+    {
+        std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown";
+        CV_Error( cv::Error::StsInternal, "FastCV error: " + s);
+    }
+}
+
+
+void trackOpticalFlowLK(InputArray _src, InputArray _dst,
+                        InputArrayOfArrays _srcPyr, InputArrayOfArrays _dstPyr,
+                        InputArray _ptsIn, OutputArray _ptsOut, InputArray _ptsEst,
+                        OutputArray _statusVec, cv::Size winSize,
+                        cv::TermCriteria termCriteria)
+{
+    // no gradient pyramids are passed here, so the internal routine selects its non-gradient FastCV call
+    trackOpticalFlowLKInternal(_src, _dst, _srcPyr, _dstPyr,
+                               /* _srcDxPyr */ cv::noArray(), /* _srcDyPyr */ cv::noArray(),
+                               _ptsIn, _ptsOut, _ptsEst, _statusVec, winSize, termCriteria);
+}
+
+void trackOpticalFlowLK(InputArray _src, InputArray _dst,
+                        InputArrayOfArrays _srcPyr, InputArrayOfArrays _dstPyr,
+                        InputArrayOfArrays _srcDxPyr, InputArrayOfArrays _srcDyPyr,
+                        InputArray _ptsIn, OutputArray _ptsOut,
+                        OutputArray _statusVec, cv::Size winSize, int maxIterations)
+{
+    // the iteration limit comes from the caller, the epsilon is fixed at 0.03^2; no initial estimate is passed
+    const cv::TermCriteria criteria(cv::TermCriteria::MAX_ITER | cv::TermCriteria::EPS,
+                                    maxIterations, /* maxEpsilon */ 0.03f * 0.03f);
+    trackOpticalFlowLKInternal(_src, _dst, _srcPyr, _dstPyr,
+                               _srcDxPyr, _srcDyPyr,
+                               _ptsIn, _ptsOut, cv::noArray(), _statusVec, winSize, criteria);
+}
+
+} // fastcv::
+} // cv::
diff --git a/modules/fastcv/test/test_bilateral.cpp b/modules/fastcv/test/test_bilateral.cpp
index 4f582c2ed37..5c883801a92 100644
--- a/modules/fastcv/test/test_bilateral.cpp
+++ b/modules/fastcv/test/test_bilateral.cpp
@@ -10,20 +10,20 @@ namespace opencv_test { namespace {
 typedef testing::TestWithParam<tuple<cv::Size,int,int>> fcv_bilateralFilterTest;
 
 TEST_P(fcv_bilateralFilterTest, accuracy)
-{	
+{
     cv::Size size  = get<0>(GetParam());
 	int d = get<1>(GetParam());
     double sigmaColor = get<2>(GetParam());
-	double sigmaSpace = sigmaColor;
-	
-	RNG& rng = cv::theRNG();
+    double sigmaSpace = sigmaColor;
+
+    RNG& rng = cv::theRNG();
     Mat src(size, CV_8UC1);
     cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256));
 
     cv::Mat dst;
 
-	cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace);
-	
+    cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace);
+
     EXPECT_FALSE(dst.empty());
 }
 
diff --git a/modules/fastcv/test/test_fft.cpp b/modules/fastcv/test/test_fft.cpp
index 18b53d88ba0..ef70f8e12f5 100644
--- a/modules/fastcv/test/test_fft.cpp
+++ b/modules/fastcv/test/test_fft.cpp
@@ -39,7 +39,6 @@ TEST_P(FFTExtTest, inverse)
     RNG& rng = cv::theRNG();
     Mat src(size, CV_8UC1);
     cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256));
-    //cv::Mat src = imread(cvtest::findDataFile("cv/shared/lena.png"), IMREAD_GRAYSCALE);
 
     Mat srcFloat;
     src.convertTo(srcFloat, CV_32F);
diff --git a/modules/fastcv/test/test_hough.cpp b/modules/fastcv/test/test_hough.cpp
index 31bfca6430c..dd1068d661d 100644
--- a/modules/fastcv/test/test_hough.cpp
+++ b/modules/fastcv/test/test_hough.cpp
@@ -102,4 +102,97 @@ INSTANTIATE_TEST_CASE_P(FastCV_Extension, HoughLinesTest,
                                            ::testing::Values(0.05, 0.25, 0.5, 0.75) // threshold
                                            ));
 
+
+typedef std::tuple<std::string /* file name */, uint32_t /* minDist */,   uint32_t /* cannyThreshold */,
+                   uint32_t /* accThreshold */, uint32_t /* minRadius */, uint32_t /* maxRadius */> HoughCirclesTestParams;
+class HoughCirclesTest : public ::testing::TestWithParam<HoughCirclesTestParams> {};
+
+TEST_P(HoughCirclesTest, accuracy)
+{
+    auto p = GetParam();
+    std::string fname       = std::get<0>(p);
+    uint32_t minDist        = std::get<1>(p);
+    uint32_t cannyThreshold = std::get<2>(p);
+    uint32_t accThreshold   = std::get<3>(p);
+    uint32_t minRadius      = std::get<4>(p);
+    uint32_t maxRadius      = std::get<5>(p);
+
+    cv::Mat src = imread(cvtest::findDataFile(fname), cv::IMREAD_GRAYSCALE);
+    // make it aligned by 8
+    cv::Mat withBorder;
+    int bpix = ((src.cols & 0xfffffff8) + 8) - src.cols;
+    cv::copyMakeBorder(src, withBorder, 0, 0, 0, bpix, BORDER_REFLECT101);
+    src = withBorder;
+
+    std::vector<cv::Vec3f> refCircles;
+    cv::HoughCircles(src, refCircles, HOUGH_GRADIENT, 1.5, minDist,
+                     cannyThreshold, accThreshold,
+                     minRadius, maxRadius);
+
+    Mat icircles;
+    cv::fastcv::houghCircles(src, icircles, minDist,
+                             cannyThreshold, accThreshold,
+                             minRadius, maxRadius);
+
+    std::vector<cv::Vec3f> circles;
+    icircles.convertTo(circles, CV_32FC3);
+
+    // usually the number of detected circles is small, brute force is OK
+    float totalDist = 0;
+    for (size_t i = 0; i < circles.size(); i++)
+    {
+        cv::Vec3f c = circles[i];
+        float dist = std::numeric_limits<float>::max();
+        for (size_t j = 0; j < refCircles.size(); j++)
+        {
+            cv::Vec3f rc = refCircles[i];
+            float d = (rc - c).ddot(rc - c);
+            if (d < dist)
+            {
+                dist = d;
+            }
+        }
+        totalDist += dist;
+    }
+    totalDist = std::sqrt(totalDist);
+
+    EXPECT_LT(totalDist, 554.0);
+
+    if (cvtest::debugLevel > 0)
+    {
+        cv::Mat draw;
+        cvtColor(src, draw, COLOR_GRAY2BGR);
+        cv::Mat refDraw = draw.clone();
+        for (const cv::Vec3f& c : refCircles)
+        {
+            cv::Point center(c[0], c[1]);
+            cv::circle(refDraw, center, c[2], Scalar(0, 255, 0));
+        }
+        for (const cv::Vec3f& c : circles)
+        {
+            cv::Point center(c[0], c[1]);
+            cv::circle(draw, center, c[2], Scalar(0, 255, 0));
+        }
+        std::cout << "circles: " << circles.size() << std::endl;
+        size_t idx = fname.find_last_of("/\\");
+        std::string fout = fname.substr(idx+1, fname.length() - idx - 5);
+        cv::imwrite(cv::format("circle_%s_mdt%d_can%d_acc%d_rf%d_rt%d_ref.png", fout.c_str(),
+                               minDist, cannyThreshold, accThreshold, minRadius, maxRadius), refDraw);
+        cv::imwrite(cv::format("circle_%s_mdt%d_can%d_acc%d_rf%d_rt%d_fcv.png", fout.c_str(),
+                               minDist, cannyThreshold, accThreshold, minRadius, maxRadius), draw);
+    }
+}
+
+// NOTE: test files should be manually loaded to folder on a device, for example like this:
+// adb push fastcv/misc/hough/ /sdcard/testdata/fastcv/hough/
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, HoughCirclesTest,
+                        ::testing::Values(
+                            HoughCirclesTestParams {"fastcv/hough/kandinsky-circles_2.jpg", 100, 100, 50, 10, 100 },
+                            HoughCirclesTestParams {"fastcv/hough/kandinsky-circles_2.jpg", 100, 100, 50, 30, 100 },
+                            HoughCirclesTestParams {"fastcv/hough/kandinsky-circles_2.jpg", 100, 100, 50, 50, 100 },
+                            HoughCirclesTestParams {"fastcv/hough/kandinsky-circles_2.jpg",  10, 100, 50, 10, 100 },
+                            HoughCirclesTestParams {"fastcv/hough/kandinsky-circles_2.jpg",  10, 100, 50, 30, 100 },
+                            HoughCirclesTestParams {"fastcv/hough/kandinsky-circles_2.jpg",  10, 100, 50, 50, 100 }
+                         ));
+
 }} // namespaces opencv_test, ::
diff --git a/modules/fastcv/test/test_ipptransform.cpp b/modules/fastcv/test/test_ipptransform.cpp
new file mode 100644
index 00000000000..66ff8cbd59d
--- /dev/null
+++ b/modules/fastcv/test/test_ipptransform.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+class DCTExtTest : public ::testing::TestWithParam<cv::Size> {};
+
+TEST_P(DCTExtTest, forward) // Intended check: cv::fastcv::DCT against cv::dct on one random 8-bit image
+{
+    Size size = GetParam();
+
+    RNG& rng = cv::theRNG();
+    Mat src(size, CV_8UC1);
+    cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255));
+    Mat srcFloat;
+    src.convertTo(srcFloat, CV_32F);
+
+    Mat dst, ref;
+    cv::fastcv::DCT(src, dst); // result under test
+
+    cv::dct(srcFloat, ref); // reference result
+
+    Mat dstFloat;
+    ref.convertTo(dstFloat, CV_32F); // FIXME(review): converts ref, not dst, so dstFloat is derived from ref and the norms below never see the FastCV output
+
+    double normInf = cvtest::norm(dstFloat, ref, cv::NORM_INF);
+    double normL2  = cvtest::norm(dstFloat, ref, cv::NORM_L2)  / dst.size().area();
+
+    if (cvtest::debugLevel > 0)
+    {
+        std::cout << "dst:" << std::endl << dst << std::endl;
+        std::cout << "ref:" << std::endl << ref << std::endl;
+    }
+
+    EXPECT_EQ(normInf, 0); // NOTE(review): exact-zero expectations only hold while ref is compared with itself; they will presumably need real tolerances once dst is used
+    EXPECT_EQ(normL2, 0);
+}
+
+TEST_P(DCTExtTest, inverse)
+{
+    Size size = GetParam();
+
+    RNG& rng = cv::theRNG();
+    Mat src(size, CV_8UC1);
+    cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256));
+
+    Mat srcFloat;
+    src.convertTo(srcFloat, CV_32F);
+
+    Mat fwd, back;
+    cv::fastcv::DCT(src, fwd);
+    cv::fastcv::IDCT(fwd, back);
+    Mat backFloat;
+    back.convertTo(backFloat, CV_32F);
+
+    Mat fwdRef, backRef;
+    cv::dct(srcFloat, fwdRef);
+    cv::idct(fwdRef, backRef);
+
+    double normInf = cvtest::norm(backFloat, backRef, cv::NORM_INF);
+    double normL2  = cvtest::norm(backFloat, backRef, cv::NORM_L2)  / src.size().area();
+
+    if (cvtest::debugLevel > 0)
+    {
+        std::cout << "src:"     << std::endl << src     << std::endl;
+        std::cout << "back:"    << std::endl << back    << std::endl;
+        std::cout << "backRef:" << std::endl << backRef << std::endl;
+    }
+
+    EXPECT_LE(normInf, 7.00005);
+    EXPECT_LT(normL2,  0.13);
+}
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, DCTExtTest, ::testing::Values(Size(8, 8), Size(128, 128), Size(32, 256), Size(512, 512)));
+
+}} // namespaces opencv_test, ::
diff --git a/modules/fastcv/test/test_moments.cpp b/modules/fastcv/test/test_moments.cpp
index 1d23156dcf2..50bbc812554 100644
--- a/modules/fastcv/test/test_moments.cpp
+++ b/modules/fastcv/test/test_moments.cpp
@@ -3,8 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
 */
 
-#include "opencv2/ts.hpp"
-#include "opencv2/fastcv/moments.hpp"
+#include "test_precomp.hpp"
 
 namespace opencv_test { namespace {
 
@@ -30,14 +29,25 @@ TEST_P(fcv_momentsTest, accuracy)
 
 	cv::Moments m = cv::fastcv::moments(src, binaryImage);
 
-    int len_m = sizeof(m)/sizeof(m.m00);
-    EXPECT_FALSE(len_m != 24);
+    cv::Scalar mean_val, stdDev;
+    float mean_val_fcv = m.m00/(srcSize.width * srcSize.height);
+    if(binaryImage)
+    {
+        cv::Mat src_binary(srcSize, CV_8UC1);
+        cv::compare( src, 0, src_binary, cv::CMP_NE );
+        mean_val = cv::mean(src_binary);
+        mean_val_fcv *= 255;
+    }
+    else
+        mean_val = cv::mean(src);
+
+    EXPECT_NEAR(mean_val[0], mean_val_fcv, 2);
 }
 
 INSTANTIATE_TEST_CASE_P(/*nothing*/, fcv_momentsTest, Combine(
                    Values(false, true),
                    Values(TYPICAL_MAT_SIZES),
-                   Values(CV_8UC1, CV_32SC1, CV_32FC1)			   
+                   Values(CV_8UC1, CV_32SC1, CV_32FC1)
 ));
 
 }
diff --git a/modules/fastcv/test/test_mser.cpp b/modules/fastcv/test/test_mser.cpp
index ebacbad32f3..6f2bf78c4cd 100644
--- a/modules/fastcv/test/test_mser.cpp
+++ b/modules/fastcv/test/test_mser.cpp
@@ -31,24 +31,24 @@ TEST_P(MSERTest, accuracy)
 
     std::vector<std::vector<Point>> contours;
     std::vector<cv::Rect> bboxes;
-    std::vector<cv::fastcv::ContourData> contourData;
+    std::vector<cv::fastcv::MSER::ContourData> contourData;
+    cv::Ptr<cv::fastcv::MSER> mser;
+    mser = cv::fastcv::MSER::create(src.size(), numNeighbors, delta, minArea, maxArea,
+                                    maxVariation, minDiversity);
     if (useBboxes)
     {
         if (useContourData)
         {
-            cv::fastcv::MSER(src, contours, bboxes, contourData, numNeighbors,
-                             delta, minArea, maxArea, maxVariation, minDiversity);
+            mser->detect(src, contours, bboxes, contourData);
         }
         else
         {
-            cv::fastcv::MSER(src, contours, bboxes, numNeighbors,
-                             delta, minArea, maxArea, maxVariation, minDiversity);
+            mser->detect(src, contours, bboxes);
         }
     }
     else
     {
-        cv::fastcv::MSER(src, contours, numNeighbors,
-                         delta, minArea, maxArea, maxVariation, minDiversity);
+        mser->detect(src, contours);
     }
 
     Rect imgRect(0, 0, src.cols, src.rows);
diff --git a/modules/fastcv/test/test_precomp.hpp b/modules/fastcv/test/test_precomp.hpp
index 1b4c23eca30..7ff8ed78049 100644
--- a/modules/fastcv/test/test_precomp.hpp
+++ b/modules/fastcv/test/test_precomp.hpp
@@ -4,6 +4,7 @@
 */
 
 #include <opencv2/ts.hpp>
+#include <opencv2/core/affine.hpp>
 #include <opencv2/features2d.hpp>
 #include <opencv2/video.hpp>
 
diff --git a/modules/fastcv/test/test_pyramid.cpp b/modules/fastcv/test/test_pyramid.cpp
new file mode 100644
index 00000000000..29acf9ab9a7
--- /dev/null
+++ b/modules/fastcv/test/test_pyramid.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+typedef std::tuple<bool /*useFloat*/, int /*nLevels*/, bool /*scaleBy2*/> PyramidTestParams;
+class PyramidTest : public ::testing::TestWithParam<PyramidTestParams> { };
+
+TEST_P(PyramidTest, accuracy) // Compares cv::fastcv::buildPyramid with an OpenCV-built reference, level by level
+{
+    auto par = GetParam();
+
+    bool useFloat = std::get<0>(par);
+    int  nLevels  = std::get<1>(par);
+    bool scaleBy2 = std::get<2>(par);
+
+    cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE);
+
+    if (useFloat)
+    {
+        cv::Mat f;
+        src.convertTo(f, CV_32F);
+        src = f;
+    }
+
+    std::vector<cv::Mat> pyr;
+    cv::fastcv::buildPyramid(src, pyr, nLevels, scaleBy2);
+
+    ASSERT_EQ(pyr.size(), (size_t)nLevels); // the loops below index both pyramids up to nLevels
+
+    std::vector<cv::Mat> refPyr;
+    if (scaleBy2)
+    {
+        cv::buildPyramid(src, refPyr, nLevels - 1); // maxlevel is the 0-based index of the last layer, so this yields nLevels images
+    }
+    else // ORB downscaling
+    {
+        for (int i = 0; i < nLevels; i++)
+        {
+            // we don't know how exactly the bit-accurate size is calculated, so reuse the size FastCV produced
+            cv::Mat level;
+            cv::resize(src, level, pyr[i].size(), 0, 0, cv::INTER_AREA);
+            refPyr.push_back(level);
+        }
+    }
+
+    for (int i = 0; i < nLevels; i++)
+    {
+        cv::Mat ref = refPyr[i];
+        cv::Mat m = pyr[i];
+        ASSERT_EQ(m.size(), ref.size());
+        double l2diff   = cv::norm(m, ref, cv::NORM_L2);
+        double linfdiff = cv::norm(m, ref, cv::NORM_INF);
+
+        double l2Thresh   = scaleBy2 ? 178.0 : 5216.0; // the same thresholds are applied to every level
+        double linfThresh = scaleBy2 ?  16.0 :  116.0;
+        EXPECT_LE(l2diff,   l2Thresh);
+        EXPECT_LE(linfdiff, linfThresh);
+    }
+
+    if (cvtest::debugLevel > 0) // debug only: dump one difference image per level
+    {
+        for (int i = 0; i < nLevels; i++)
+        { // NOTE(review): for 8-bit levels the subtraction below saturates at 0, so negative differences are lost; cv::absdiff would keep them
+            char tchar = useFloat ? 'f' : 'i';
+            std::string scaleStr = scaleBy2 ? "x2" : "xORB";
+            cv::imwrite(cv::format("pyr_diff_%c_%d_%s_l%d.png", tchar, nLevels, scaleStr.c_str(), i), cv::abs(pyr[i] - refPyr[i]));
+        }
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, PyramidTest,
+                        // useFloat, nLevels, scaleBy2
+                        ::testing::Values(
+                            PyramidTestParams { true, 2,  true}, PyramidTestParams { true, 3,  true}, PyramidTestParams { true, 4,  true},
+                            PyramidTestParams {false, 2,  true}, PyramidTestParams {false, 3,  true}, PyramidTestParams {false, 4,  true},
+                            PyramidTestParams {false, 2, false}, PyramidTestParams {false, 3, false}, PyramidTestParams {false, 4, false}
+                            ));
+
+typedef std::tuple<MatType, size_t> SobelPyramidTestParams;
+class SobelPyramidTest : public ::testing::TestWithParam<SobelPyramidTestParams> {};
+
+TEST_P(SobelPyramidTest, accuracy) // Compares cv::fastcv::sobelPyramid with cv::Sobel applied to each pyramid level
+{
+    auto p = GetParam();
+    int    type    = std::get<0>(p);
+    size_t nLevels = std::get<1>(p);
+
+    // The only input is cv/shared/baboon.png, located through cvtest::findDataFile();
+    // no FastCV-specific test files are read by this test.
+    cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE);
+
+    std::vector<cv::Mat> pyr;
+    cv::fastcv::buildPyramid(src, pyr, nLevels);
+
+    std::vector<cv::Mat> pyrDx, pyrDy;
+    cv::fastcv::sobelPyramid(pyr, pyrDx, pyrDy, type);
+
+    ASSERT_EQ(pyrDx.size(), nLevels);
+    ASSERT_EQ(pyrDy.size(), nLevels);
+
+    for (size_t i = 0; i < nLevels; i++) // each derivative level must match the requested type and the source level size
+    {
+        ASSERT_EQ(pyrDx[i].type(), type);
+        ASSERT_EQ(pyrDx[i].size(), pyr[i].size());
+        ASSERT_EQ(pyrDy[i].type(), type);
+        ASSERT_EQ(pyrDy[i].size(), pyr[i].size());
+    }
+
+    std::vector<cv::Mat> refPyrDx(nLevels), refPyrDy(nLevels);
+    for (size_t i = 0; i < nLevels; i++)
+    {
+        int stype = (type == CV_8S) ? CV_16S : type; // presumably because cv::Sobel offers no 8-bit signed output depth; computed in 16S and converted below
+        cv::Mat dx, dy;
+        cv::Sobel(pyr[i], dx, stype, 1, 0);
+        cv::Sobel(pyr[i], dy, stype, 0, 1);
+        dx.convertTo(refPyrDx[i], type, 1.0/8.0, 0.0); // NOTE(review): the 1/8 scale presumably mirrors FastCV's normalized Sobel output; confirm
+        dy.convertTo(refPyrDy[i], type, 1.0/8.0, 0.0);
+    }
+
+    for (size_t i = 0; i < nLevels; i++)
+    {
+        cv::Mat ref, dst;
+        double normInf, normL2;
+        ref = refPyrDx[i];
+        dst = pyrDx[i];
+        normInf = cvtest::norm(dst, ref, cv::NORM_INF);
+        normL2  = cvtest::norm(dst, ref, cv::NORM_L2) / dst.total();
+
+        EXPECT_LE(normInf, 76.1);
+        EXPECT_LT(normL2,   0.4);
+
+        ref = refPyrDy[i];
+        dst = pyrDy[i];
+        normInf = cvtest::norm(dst, ref, cv::NORM_INF);
+        normL2  = cvtest::norm(dst, ref, cv::NORM_L2) / dst.total();
+
+        EXPECT_LE(normInf, 66.6);
+        EXPECT_LT(normL2,   0.4);
+    }
+
+    if (cvtest::debugLevel > 0) // debug only: dump the pyramid and both derivative sets as images
+    {
+        std::map<int, std::string> typeToString =
+        {
+            {CV_8U,   "8u"}, {CV_8S,   "8s"}, {CV_16U, "16u"}, {CV_16S, "16s"},
+            {CV_32S, "32s"}, {CV_32F, "32f"}, {CV_64F, "64f"}, {CV_16F, "16f"},
+        };
+
+        for (size_t i = 0; i < nLevels; i++)
+        {
+            cv::imwrite(cv::format("pyr_l%zu.png", i), pyr[i]);
+            cv::imwrite(cv::format("pyr_sobel_x_t%s_l%zu.png", typeToString.at(type).c_str(), i), pyrDx[i] + 128); // +128 offsets the signed derivatives before writing
+            cv::imwrite(cv::format("pyr_sobel_y_t%s_l%zu.png", typeToString.at(type).c_str(), i), pyrDy[i] + 128);
+
+            cv::imwrite(cv::format("ref_pyr_sobel_x_t%s_l%zu.png", typeToString.at(type).c_str(), i), refPyrDx[i] + 128);
+            cv::imwrite(cv::format("ref_pyr_sobel_y_t%s_l%zu.png", typeToString.at(type).c_str(), i), refPyrDy[i] + 128);
+        }
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, SobelPyramidTest, ::testing::Combine(
+    ::testing::Values(CV_8S, CV_16S, CV_32F), // depth
+    ::testing::Values(3, 6))); // nLevels
+
+
+}} // namespaces opencv_test, ::
diff --git a/modules/fastcv/test/test_remap.cpp b/modules/fastcv/test/test_remap.cpp
index 6fa5ccdabfd..28501534a5d 100644
--- a/modules/fastcv/test/test_remap.cpp
+++ b/modules/fastcv/test/test_remap.cpp
@@ -3,8 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
 */
 
-#include "opencv2/ts.hpp"
-#include "opencv2/fastcv/remap.hpp"
+#include "test_precomp.hpp"
 
 namespace opencv_test { namespace {
 
@@ -77,12 +76,8 @@ TEST_P(RemapTest, accuracy)
     cv::Mat remapOpenCV;
     cv::remap(src_converted, remapOpenCV, map_x, map_y, interpolation);
 
-    cv::Mat diffImage;
-    cv::absdiff(dst, remapOpenCV, diffImage);
-
     // Calculate the maximum difference
-    double maxVal=0.0;
-    cv::minMaxLoc(diffImage, nullptr, &maxVal);
+    double maxVal = cv::norm(dst, remapOpenCV, cv::NORM_INF);
 
     // Assert if the difference is acceptable (max difference should be less than 10)
     CV_Assert(maxVal < 10 && "Difference between images is too high!");
@@ -105,12 +100,8 @@ TEST_P(RemapTestRGBA, accuracy)
     cv::Mat remapOpenCV;
     cv::remap(src_converted, remapOpenCV, map_x, map_y, interpolation);
 
-    cv::Mat diffImage;
-    cv::absdiff(dst, remapOpenCV, diffImage);
-
     // Calculate the maximum difference
-    double maxVal=0.0;
-    cv::minMaxLoc(diffImage, nullptr, &maxVal);
+    double maxVal = cv::norm(dst, remapOpenCV, cv::NORM_INF);
 
     // Assert if the difference is acceptable (max difference should be less than 10)
     CV_Assert(maxVal < 10 && "Difference between images is too high!");
diff --git a/modules/fastcv/test/test_scale.cpp b/modules/fastcv/test/test_scale.cpp
index 394fd907cc9..f98c9cc4ceb 100644
--- a/modules/fastcv/test/test_scale.cpp
+++ b/modules/fastcv/test/test_scale.cpp
@@ -3,8 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
 */
 
-#include "opencv2/ts.hpp"
-#include "opencv2/fastcv/scale.hpp"
+#include "test_precomp.hpp"
 
 namespace opencv_test { namespace {
 
@@ -25,12 +24,8 @@ TEST(resizeDownBy2, accuracy)
     cv::Mat resizedImageOpenCV;
     cv::resize(inputImage, resizedImageOpenCV, cv::Size(inputImage.cols / 2, inputImage.rows / 2), 0, 0, INTER_AREA);
 
-    cv::Mat diffImage;
-    cv::absdiff(resized_image, resizedImageOpenCV, diffImage);
-
     // Calculate the maximum difference
-    double maxVal=0.0;
-    cv::minMaxLoc(diffImage, nullptr, &maxVal);
+    double maxVal = cv::norm(resized_image, resizedImageOpenCV, cv::NORM_INF);
 
     // Assert if the difference is acceptable (max difference should be less than 10)
     CV_Assert(maxVal < 10 && "Difference between images is too high!");
@@ -50,12 +45,8 @@ TEST(resizeDownBy4, accuracy)
     cv::Mat resizedImageOpenCV;
     cv::resize(inputImage, resizedImageOpenCV, cv::Size(inputImage.cols / 4, inputImage.rows / 4), 0, 0, INTER_AREA);
 
-    cv::Mat diffImage;
-    cv::absdiff(resized_image, resizedImageOpenCV, diffImage);
-
     // Calculate the maximum difference
-    double maxVal=0.0;
-    cv::minMaxLoc(diffImage, nullptr, &maxVal);
+    double maxVal = cv::norm(resized_image, resizedImageOpenCV, cv::NORM_INF);
 
     // Assert if the difference is acceptable (max difference should be less than 10)
     CV_Assert(maxVal < 10 && "Difference between images is too high!");
@@ -98,14 +89,14 @@ TEST_P(ResizeBy4Test, ResizeBy2) {
 }
 
 INSTANTIATE_TEST_CASE_P(
-    ResizeTests, 
-    ResizeBy2Test, 
+    ResizeTests,
+    ResizeBy2Test,
     ::testing::Values(cv::Size(640, 480), cv::Size(1280, 720), cv::Size(1920, 1080)
 ));
 
 INSTANTIATE_TEST_CASE_P(
-    ResizeTests, 
-    ResizeBy4Test, 
+    ResizeTests,
+    ResizeBy4Test,
     ::testing::Values(cv::Size(640, 480), cv::Size(1280, 720), cv::Size(1920, 1080)
 ));
 
diff --git a/modules/fastcv/test/test_tracking.cpp b/modules/fastcv/test/test_tracking.cpp
new file mode 100644
index 00000000000..7833c71b1ec
--- /dev/null
+++ b/modules/fastcv/test/test_tracking.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+typedef std::tuple<int /*winSize*/, bool /*useSobelPyramid*/, bool /*useFastCvPyramids*/, bool /*useInitialEstimate*/ > TrackingTestParams;
+class TrackingTest : public ::testing::TestWithParam<TrackingTestParams> {};
+
+TEST_P(TrackingTest, accuracy)
+{
+    auto par = GetParam();
+
+    int winSz               = std::get<0>(par);
+    bool useSobelPyramid    = std::get<1>(par);
+    bool useFastCvPyramids  = std::get<2>(par);
+    bool useInitialEstimate = std::get<3>(par);
+
+    cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE);
+
+    double ang = 5.0 * CV_PI / 180.0;
+    cv::Matx33d tr = {
+        cos(ang), -sin(ang), 1,
+        sin(ang),  cos(ang), 2,
+               0,         0, 1
+    };
+    cv::Matx33d orig {
+        1, 0, -(double)src.cols / 2,
+        0, 1, -(double)src.rows / 2,
+        0, 0, 1
+    };
+    cv::Matx33d back {
+        1, 0, (double)src.cols / 2,
+        0, 1, (double)src.rows / 2,
+        0, 0, 1
+    };
+    cv::Matx23d trans = (back * tr * orig).get_minor<2, 3>(0, 0);
+
+    cv::Mat dst;
+    cv::warpAffine(src, dst, trans, src.size());
+
+    int nLevels = 4;
+    std::vector<cv::Mat> srcPyr, dstPyr;
+
+    if (useFastCvPyramids)
+    {
+        cv::fastcv::buildPyramid(src, srcPyr, nLevels);
+        cv::fastcv::buildPyramid(dst, dstPyr, nLevels);
+    }
+    else
+    {
+        cv::buildPyramid(src, srcPyr, nLevels - 1);
+        cv::buildPyramid(dst, dstPyr, nLevels - 1);
+    }
+
+    cv::Matx23f transf = trans;
+    int nPts = 32;
+    std::vector<cv::Point2f> ptsIn, ptsOut, ptsEst, ptsExpected;
+    for (int i = 0; i < nPts; i++)
+    {
+        cv::Point2f p { (((float)cv::theRNG())*0.5f + 0.25f) * src.cols,
+                        (((float)cv::theRNG())*0.5f + 0.25f) * src.rows };
+        ptsIn.push_back(p);
+        ptsExpected.push_back(transf * cv::Vec3f(p.x, p.y, 1.0));
+        ptsOut.push_back({ });
+        ptsEst.push_back(p);
+    }
+
+    std::vector<int32_t> statusVec(nPts);
+
+    cv::TermCriteria termCrit;
+    termCrit.type = cv::TermCriteria::COUNT | cv::TermCriteria::EPS;
+    termCrit.maxCount = 7;
+    termCrit.epsilon = 0.03f * 0.03f;
+
+    if (useSobelPyramid)
+    {
+        std::vector<cv::Mat> srcDxPyr, srcDyPyr;
+        cv::fastcv::sobelPyramid(srcPyr, srcDxPyr, srcDyPyr, CV_8S);
+        cv::fastcv::trackOpticalFlowLK(src, dst, srcPyr, dstPyr, srcDxPyr, srcDyPyr,
+                                       ptsIn, ptsOut, statusVec, {winSz, winSz});
+    }
+    else
+    {
+        cv::fastcv::trackOpticalFlowLK(src, dst, srcPyr, dstPyr, ptsIn, ptsOut, (useInitialEstimate ? ptsEst : noArray()),
+                                        statusVec, {winSz, winSz}, termCrit);
+    }
+
+    std::vector<cv::Point2f> ocvPtsOut;
+    std::vector<uint8_t> ocvStatusVec;
+    std::vector<float> ocvErrVec;
+    cv::calcOpticalFlowPyrLK(src, dst, ptsIn, ocvPtsOut, ocvStatusVec, ocvErrVec, {winSz, winSz}, nLevels - 1, termCrit);
+
+    cv::Mat refStatusVec(nPts, 1, CV_32S, Scalar::all(1));
+    cv::Mat ocvStatusVecInt;
+    cv::Mat(ocvStatusVec).convertTo(ocvStatusVecInt, CV_32S);
+
+    double statusNormOcv = cv::norm(ocvStatusVecInt, refStatusVec, NORM_INF);
+    double statusNorm = cv::norm(cv::Mat(statusVec), refStatusVec, NORM_INF);
+
+    EXPECT_EQ(statusNormOcv, 0);
+    EXPECT_EQ(statusNorm, 0);
+
+    double diffNormOcv = cv::norm(ocvPtsOut, ptsExpected, NORM_L2);
+    double diffNorm = cv::norm(ptsOut, ptsExpected, NORM_L2);
+
+    EXPECT_LT(diffNormOcv, 31.92);
+    EXPECT_LT(diffNorm, 6.69);
+
+    if (cvtest::debugLevel > 0)
+    {
+        auto drawPts = [ptsIn, dst](const std::vector<cv::Point2f>& ptsRes, const std::string fname)
+        {
+            cv::Mat draw = dst.clone();
+            for (size_t i = 0; i < ptsIn.size(); i++)
+            {
+                cv::line(draw, ptsIn[i], ptsRes[i], Scalar::all(255));
+                cv::circle(draw, ptsIn[i], 1, Scalar::all(255));
+                cv::circle(draw, ptsRes[i], 3, Scalar::all(255));
+            }
+            cv::imwrite(fname, draw);
+        };
+
+        drawPts(ptsOut, "track_w"+std::to_string(winSz)+"_warped.png");
+        drawPts(ocvPtsOut, "track_ocv_warped.png");
+
+        std::cout << "status vec:"   << std::endl << cv::Mat(statusVec).t()   << std::endl;
+        std::cout << "status vec ocv:" << std::endl << cv::Mat(ocvStatusVec).t() << std::endl;
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, TrackingTest,
+                        ::testing::Combine(::testing::Values(5, 7, 9), // window size
+                                           ::testing::Bool(),          // useSobelPyramid
+                                           ::testing::Bool(),          // useFastCvPyramids
+                                           ::testing::Bool()           // useInitialEstimate
+                        ));
+
+}} // namespaces opencv_test, ::

From 1c5751bbbafd7ef97e09fa38a3dd717a344a6e90 Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Tue, 10 Dec 2024 17:23:37 +0100
Subject: [PATCH 03/11] HoughCircle tests: test images updated

---
 modules/fastcv/perf/perf_hough.cpp | 12 ++++++------
 modules/fastcv/test/test_hough.cpp | 15 ++++++++-------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/modules/fastcv/perf/perf_hough.cpp b/modules/fastcv/perf/perf_hough.cpp
index 53194d3100a..0815cb26f87 100644
--- a/modules/fastcv/perf/perf_hough.cpp
+++ b/modules/fastcv/perf/perf_hough.cpp
@@ -51,12 +51,12 @@ typedef ::perf::TestBaseWithParam<HoughCirclesPerfTestParams> HoughCirclesPerf;
 
 PERF_TEST_P(HoughCirclesPerf, run,
                 ::testing::Values(
-                            HoughCirclesPerfTestParams {"fastcv/hough/kandinsky-circles_2.jpg", 100, 100, 50, 10, 100 },
-                            HoughCirclesPerfTestParams {"fastcv/hough/kandinsky-circles_2.jpg", 100, 100, 50, 30, 100 },
-                            HoughCirclesPerfTestParams {"fastcv/hough/kandinsky-circles_2.jpg", 100, 100, 50, 50, 100 },
-                            HoughCirclesPerfTestParams {"fastcv/hough/kandinsky-circles_2.jpg",  10, 100, 50, 10, 100 },
-                            HoughCirclesPerfTestParams {"fastcv/hough/kandinsky-circles_2.jpg",  10, 100, 50, 30, 100 },
-                            HoughCirclesPerfTestParams {"fastcv/hough/kandinsky-circles_2.jpg",  10, 100, 50, 50, 100 }
+                            HoughCirclesPerfTestParams {"cv/cameracalibration/circles/circles4.png", 100, 100, 50, 10, 100 },
+                            HoughCirclesPerfTestParams {"cv/cameracalibration/circles/circles4.png", 100, 100, 50, 30, 100 },
+                            HoughCirclesPerfTestParams {"cv/cameracalibration/circles/circles4.png", 100, 100, 50, 50, 100 },
+                            HoughCirclesPerfTestParams {"cv/cameracalibration/circles/circles4.png",  10, 100, 50, 10, 100 },
+                            HoughCirclesPerfTestParams {"cv/cameracalibration/circles/circles4.png",  10, 100, 50, 30, 100 },
+                            HoughCirclesPerfTestParams {"cv/cameracalibration/circles/circles4.png",  10, 100, 50, 50, 100 }
                          )
            )
 {
diff --git a/modules/fastcv/test/test_hough.cpp b/modules/fastcv/test/test_hough.cpp
index dd1068d661d..4713ae6df39 100644
--- a/modules/fastcv/test/test_hough.cpp
+++ b/modules/fastcv/test/test_hough.cpp
@@ -156,7 +156,7 @@ TEST_P(HoughCirclesTest, accuracy)
     }
     totalDist = std::sqrt(totalDist);
 
-    EXPECT_LT(totalDist, 554.0);
+    EXPECT_LT(totalDist, 811.0);
 
     if (cvtest::debugLevel > 0)
     {
@@ -187,12 +187,13 @@ TEST_P(HoughCirclesTest, accuracy)
 // adb push fastcv/misc/hough/ /sdcard/testdata/fastcv/hough/
 INSTANTIATE_TEST_CASE_P(FastCV_Extension, HoughCirclesTest,
                         ::testing::Values(
-                            HoughCirclesTestParams {"fastcv/hough/kandinsky-circles_2.jpg", 100, 100, 50, 10, 100 },
-                            HoughCirclesTestParams {"fastcv/hough/kandinsky-circles_2.jpg", 100, 100, 50, 30, 100 },
-                            HoughCirclesTestParams {"fastcv/hough/kandinsky-circles_2.jpg", 100, 100, 50, 50, 100 },
-                            HoughCirclesTestParams {"fastcv/hough/kandinsky-circles_2.jpg",  10, 100, 50, 10, 100 },
-                            HoughCirclesTestParams {"fastcv/hough/kandinsky-circles_2.jpg",  10, 100, 50, 30, 100 },
-                            HoughCirclesTestParams {"fastcv/hough/kandinsky-circles_2.jpg",  10, 100, 50, 50, 100 }
+                            // gpu/connectedcomponents/concentric_circles.png
+                            HoughCirclesTestParams {"cv/cameracalibration/circles/circles4.png", 100, 100, 50, 10, 100 },
+                            HoughCirclesTestParams {"cv/cameracalibration/circles/circles4.png", 100, 100, 50, 30, 100 },
+                            HoughCirclesTestParams {"cv/cameracalibration/circles/circles4.png", 100, 100, 50, 50, 100 },
+                            HoughCirclesTestParams {"cv/cameracalibration/circles/circles4.png",  10, 100, 50, 10, 100 },
+                            HoughCirclesTestParams {"cv/cameracalibration/circles/circles4.png",  10, 100, 50, 30, 100 },
+                            HoughCirclesTestParams {"cv/cameracalibration/circles/circles4.png",  10, 100, 50, 50, 100 }
                          ));
 
 }} // namespaces opencv_test, ::

From 5e82b141d9408848262cfbdd1251ddcfb3b5557c Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Tue, 10 Dec 2024 22:33:18 +0100
Subject: [PATCH 04/11] BilateralRecursive: test files updated

---
 modules/fastcv/test/test_smooth.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/modules/fastcv/test/test_smooth.cpp b/modules/fastcv/test/test_smooth.cpp
index 0b73baa5cd5..47c85152ebf 100644
--- a/modules/fastcv/test/test_smooth.cpp
+++ b/modules/fastcv/test/test_smooth.cpp
@@ -39,7 +39,14 @@ TEST_P(BilateralRecursiveTest, accuracy)
 }
 
 INSTANTIATE_TEST_CASE_P(FastCV_Extension, BilateralRecursiveTest,
-                        ::testing::Combine(::testing::Values(0.01f, 0.03f, 0.1f, 1.f, 5.f),
-                                           ::testing::Values(0.01f, 0.05f, 0.1f, 1.f, 5.f)));
+                        ::testing::Values(
+                            BilateralTestParams {0.01f, 1.00f},
+                            BilateralTestParams {0.10f, 0.01f},
+                            BilateralTestParams {1.00f, 0.01f},
+                            BilateralTestParams {1.00f, 1.00f},
+                            BilateralTestParams {5.00f, 0.01f},
+                            BilateralTestParams {5.00f, 0.10f},
+                            BilateralTestParams {5.00f, 5.00f}
+                        ));
 
 }} // namespaces opencv_test, ::

From dc02ae4f544e7c656989208a384da3024429fda4 Mon Sep 17 00:00:00 2001
From: Xue Zhang <quic_xuezha@quicinc.com>
Date: Wed, 11 Dec 2024 16:00:36 +0530
Subject: [PATCH 05/11] remove hough circle for segmentation fault

---
 modules/fastcv/perf/perf_hough.cpp | 47 ---------------
 modules/fastcv/src/hough.cpp       | 33 -----------
 modules/fastcv/test/test_hough.cpp | 94 ------------------------------
 3 files changed, 174 deletions(-)

diff --git a/modules/fastcv/perf/perf_hough.cpp b/modules/fastcv/perf/perf_hough.cpp
index 0815cb26f87..78424a696dc 100644
--- a/modules/fastcv/perf/perf_hough.cpp
+++ b/modules/fastcv/perf/perf_hough.cpp
@@ -41,51 +41,4 @@ PERF_TEST_P(HoughLinesPerfTest, run,
     SANITY_CHECK_NOTHING();
 }
 
-
-typedef std::tuple<std::string /* file name */, uint32_t /* minDist */,   uint32_t /* cannyThreshold */,
-                   uint32_t /* accThreshold */, uint32_t /* minRadius */, uint32_t /* maxRadius */> HoughCirclesPerfTestParams;
-typedef ::perf::TestBaseWithParam<HoughCirclesPerfTestParams> HoughCirclesPerf;
-
-// NOTE: test files should be manually loaded to folder on a device, for example like this:
-// adb push fastcv/misc/hough/ /sdcard/testdata/fastcv/hough/
-
-PERF_TEST_P(HoughCirclesPerf, run,
-                ::testing::Values(
-                            HoughCirclesPerfTestParams {"cv/cameracalibration/circles/circles4.png", 100, 100, 50, 10, 100 },
-                            HoughCirclesPerfTestParams {"cv/cameracalibration/circles/circles4.png", 100, 100, 50, 30, 100 },
-                            HoughCirclesPerfTestParams {"cv/cameracalibration/circles/circles4.png", 100, 100, 50, 50, 100 },
-                            HoughCirclesPerfTestParams {"cv/cameracalibration/circles/circles4.png",  10, 100, 50, 10, 100 },
-                            HoughCirclesPerfTestParams {"cv/cameracalibration/circles/circles4.png",  10, 100, 50, 30, 100 },
-                            HoughCirclesPerfTestParams {"cv/cameracalibration/circles/circles4.png",  10, 100, 50, 50, 100 }
-                         )
-           )
-{
-    auto p = GetParam();
-    std::string fname       = std::get<0>(p);
-    uint32_t minDist        = std::get<1>(p);
-    uint32_t cannyThreshold = std::get<2>(p);
-    uint32_t accThreshold   = std::get<3>(p);
-    uint32_t minRadius      = std::get<4>(p);
-    uint32_t maxRadius      = std::get<5>(p);
-
-    cv::Mat src = imread(cvtest::findDataFile(fname), cv::IMREAD_GRAYSCALE);
-    // make it aligned by 8
-    cv::Mat withBorder;
-    int bpix = ((src.cols & 0xfffffff8) + 8) - src.cols;
-    cv::copyMakeBorder(src, withBorder, 0, 0, 0, bpix, BORDER_REFLECT101);
-    src = withBorder;
-
-    while(next())
-    {
-        Mat icircles;
-        startTimer();
-        cv::fastcv::houghCircles(src, icircles, minDist,
-                                 cannyThreshold, accThreshold,
-                                 minRadius, maxRadius);
-        stopTimer();
-    }
-
-    SANITY_CHECK_NOTHING();
-}
-
 } // namespace
diff --git a/modules/fastcv/src/hough.cpp b/modules/fastcv/src/hough.cpp
index e46f64ac3f1..248f6b3517a 100644
--- a/modules/fastcv/src/hough.cpp
+++ b/modules/fastcv/src/hough.cpp
@@ -31,38 +31,5 @@ void houghLines(InputArray _src, OutputArray _lines, double threshold)
     lines(Range::all(), Range(0, nLines)).copyTo(_lines);
 }
 
-
-void houghCircles(InputArray _src, OutputArray _circles, uint32_t minDist,
-                  uint32_t cannyThreshold, uint32_t accThreshold,
-                  uint32_t minRadius, uint32_t maxRadius)
-{
-    INITIALIZATION_CHECK;
-    CV_Assert(!_src.empty() && _src.type() == CV_8UC1);
-    CV_Assert(_src.step() % 8 == 0);
-
-    Mat src = _src.getMat();
-
-    CV_Assert((size_t)(src.data) % 16 == 0);
-
-    const uint32_t maxCircles = 16384;
-
-    Mat circles(1, maxCircles, CV_32SC3);
-
-    uint32_t nCircles = maxCircles;
-
-    AutoBuffer<uint8_t> tempBuf;
-    tempBuf.allocate(16 * src.step * src.rows);
-
-    CV_Assert((size_t)(tempBuf.data()) % 16 == 0);
-
-    fcvHoughCircleu8(src.data, src.cols, src.rows, src.step,
-                     (fcvCircle*)circles.data, &nCircles, maxCircles,
-                     minDist, cannyThreshold, accThreshold,
-                     minRadius, maxRadius, tempBuf.data());
-
-    _circles.create(1, nCircles, CV_32SC3);
-    circles(Range::all(), Range(0, nCircles)).copyTo(_circles);
-}
-
 } // fastcv::
 } // cv::
diff --git a/modules/fastcv/test/test_hough.cpp b/modules/fastcv/test/test_hough.cpp
index 4713ae6df39..31bfca6430c 100644
--- a/modules/fastcv/test/test_hough.cpp
+++ b/modules/fastcv/test/test_hough.cpp
@@ -102,98 +102,4 @@ INSTANTIATE_TEST_CASE_P(FastCV_Extension, HoughLinesTest,
                                            ::testing::Values(0.05, 0.25, 0.5, 0.75) // threshold
                                            ));
 
-
-typedef std::tuple<std::string /* file name */, uint32_t /* minDist */,   uint32_t /* cannyThreshold */,
-                   uint32_t /* accThreshold */, uint32_t /* minRadius */, uint32_t /* maxRadius */> HoughCirclesTestParams;
-class HoughCirclesTest : public ::testing::TestWithParam<HoughCirclesTestParams> {};
-
-TEST_P(HoughCirclesTest, accuracy)
-{
-    auto p = GetParam();
-    std::string fname       = std::get<0>(p);
-    uint32_t minDist        = std::get<1>(p);
-    uint32_t cannyThreshold = std::get<2>(p);
-    uint32_t accThreshold   = std::get<3>(p);
-    uint32_t minRadius      = std::get<4>(p);
-    uint32_t maxRadius      = std::get<5>(p);
-
-    cv::Mat src = imread(cvtest::findDataFile(fname), cv::IMREAD_GRAYSCALE);
-    // make it aligned by 8
-    cv::Mat withBorder;
-    int bpix = ((src.cols & 0xfffffff8) + 8) - src.cols;
-    cv::copyMakeBorder(src, withBorder, 0, 0, 0, bpix, BORDER_REFLECT101);
-    src = withBorder;
-
-    std::vector<cv::Vec3f> refCircles;
-    cv::HoughCircles(src, refCircles, HOUGH_GRADIENT, 1.5, minDist,
-                     cannyThreshold, accThreshold,
-                     minRadius, maxRadius);
-
-    Mat icircles;
-    cv::fastcv::houghCircles(src, icircles, minDist,
-                             cannyThreshold, accThreshold,
-                             minRadius, maxRadius);
-
-    std::vector<cv::Vec3f> circles;
-    icircles.convertTo(circles, CV_32FC3);
-
-    // usually the number of detected circles is small, brute force is OK
-    float totalDist = 0;
-    for (size_t i = 0; i < circles.size(); i++)
-    {
-        cv::Vec3f c = circles[i];
-        float dist = std::numeric_limits<float>::max();
-        for (size_t j = 0; j < refCircles.size(); j++)
-        {
-            cv::Vec3f rc = refCircles[i];
-            float d = (rc - c).ddot(rc - c);
-            if (d < dist)
-            {
-                dist = d;
-            }
-        }
-        totalDist += dist;
-    }
-    totalDist = std::sqrt(totalDist);
-
-    EXPECT_LT(totalDist, 811.0);
-
-    if (cvtest::debugLevel > 0)
-    {
-        cv::Mat draw;
-        cvtColor(src, draw, COLOR_GRAY2BGR);
-        cv::Mat refDraw = draw.clone();
-        for (const cv::Vec3f& c : refCircles)
-        {
-            cv::Point center(c[0], c[1]);
-            cv::circle(refDraw, center, c[2], Scalar(0, 255, 0));
-        }
-        for (const cv::Vec3f& c : circles)
-        {
-            cv::Point center(c[0], c[1]);
-            cv::circle(draw, center, c[2], Scalar(0, 255, 0));
-        }
-        std::cout << "circles: " << circles.size() << std::endl;
-        size_t idx = fname.find_last_of("/\\");
-        std::string fout = fname.substr(idx+1, fname.length() - idx - 5);
-        cv::imwrite(cv::format("circle_%s_mdt%d_can%d_acc%d_rf%d_rt%d_ref.png", fout.c_str(),
-                               minDist, cannyThreshold, accThreshold, minRadius, maxRadius), refDraw);
-        cv::imwrite(cv::format("circle_%s_mdt%d_can%d_acc%d_rf%d_rt%d_fcv.png", fout.c_str(),
-                               minDist, cannyThreshold, accThreshold, minRadius, maxRadius), draw);
-    }
-}
-
-// NOTE: test files should be manually loaded to folder on a device, for example like this:
-// adb push fastcv/misc/hough/ /sdcard/testdata/fastcv/hough/
-INSTANTIATE_TEST_CASE_P(FastCV_Extension, HoughCirclesTest,
-                        ::testing::Values(
-                            // gpu/connectedcomponents/concentric_circles.png
-                            HoughCirclesTestParams {"cv/cameracalibration/circles/circles4.png", 100, 100, 50, 10, 100 },
-                            HoughCirclesTestParams {"cv/cameracalibration/circles/circles4.png", 100, 100, 50, 30, 100 },
-                            HoughCirclesTestParams {"cv/cameracalibration/circles/circles4.png", 100, 100, 50, 50, 100 },
-                            HoughCirclesTestParams {"cv/cameracalibration/circles/circles4.png",  10, 100, 50, 10, 100 },
-                            HoughCirclesTestParams {"cv/cameracalibration/circles/circles4.png",  10, 100, 50, 30, 100 },
-                            HoughCirclesTestParams {"cv/cameracalibration/circles/circles4.png",  10, 100, 50, 50, 100 }
-                         ));
-
 }} // namespaces opencv_test, ::

From 7554d15e72f9c9c8265d716621fb4012db5dbf1e Mon Sep 17 00:00:00 2001
From: Xue Zhang <quic_xuezha@quicinc.com>
Date: Wed, 11 Dec 2024 16:15:13 +0530
Subject: [PATCH 06/11] fix warnings

---
 modules/fastcv/src/blur.cpp | 52 ++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/modules/fastcv/src/blur.cpp b/modules/fastcv/src/blur.cpp
index f092da96e0a..7c2efc06268 100644
--- a/modules/fastcv/src/blur.cpp
+++ b/modules/fastcv/src/blur.cpp
@@ -38,22 +38,22 @@ class FcvGaussianBlurLoop_Invoker : public cv::ParallelLoopBody
         }
 
         const uchar* src = src_data + (range.start-topLines)*src_step;
-        uchar dst[dst_step*rangeHeight];
+        std::vector<uchar> dst(dst_step*rangeHeight);
 
         if (fcvFuncType == FCV_MAKETYPE(3,CV_8U))
-            fcvFilterGaussian3x3u8_v4(src, width, rangeHeight, src_step, dst, dst_step, fcvBorder, 0);
+            fcvFilterGaussian3x3u8_v4(src, width, rangeHeight, src_step, dst.data(), dst_step, fcvBorder, 0);
         else if (fcvFuncType == FCV_MAKETYPE(5,CV_8U))
-            fcvFilterGaussian5x5u8_v3(src, width, rangeHeight, src_step, dst, dst_step, fcvBorder, 0);
+            fcvFilterGaussian5x5u8_v3(src, width, rangeHeight, src_step, dst.data(), dst_step, fcvBorder, 0);
         else if (fcvFuncType == FCV_MAKETYPE(5,CV_16S))
-            fcvFilterGaussian5x5s16_v3((int16_t*)src, width, rangeHeight, src_step, (int16_t*)dst, dst_step, fcvBorder, 0);
+            fcvFilterGaussian5x5s16_v3((int16_t*)src, width, rangeHeight, src_step, (int16_t*)dst.data(), dst_step, fcvBorder, 0);
         else if (fcvFuncType == FCV_MAKETYPE(5,CV_32S))
-            fcvFilterGaussian5x5s32_v3((int32_t*)src, width, rangeHeight, src_step, (int32_t*)dst, dst_step, fcvBorder, 0);
+            fcvFilterGaussian5x5s32_v3((int32_t*)src, width, rangeHeight, src_step, (int32_t*)dst.data(), dst_step, fcvBorder, 0);
         else if (fcvFuncType == FCV_MAKETYPE(11,CV_8U))
-            fcvFilterGaussian11x11u8_v2(src, width, rangeHeight, src_step, dst, dst_step, fcvBorder);
+            fcvFilterGaussian11x11u8_v2(src, width, rangeHeight, src_step, dst.data(), dst_step, fcvBorder);
 
-        uchar* dptr = dst_data+range.start*dst_step;
-        uchar* sptr = dst+topLines*dst_step;
-        memcpy(dptr,sptr, (range.end-range.start)*dst_step);
+        uchar *dptr = dst_data + range.start * dst_step;
+        uchar *sptr = dst.data() + topLines * dst_step;
+        memcpy(dptr, sptr, (range.end - range.start) * dst_step);
     }
 
     private:
@@ -132,17 +132,17 @@ class FcvFilter2DLoop_Invoker : public cv::ParallelLoopBody
         }
 
         const uchar *src = src_data + (range.start - topLines) * src_step;
-        uchar dst[dst_step*rangeHeight];
+        std::vector<uchar> dst(dst_step*rangeHeight);
 
         if (ddepth == CV_8U)
-            fcvFilterCorrNxNu8((int8_t*)kernel, ksize, 0, src, width, rangeHeight, src_step, dst, dst_step);
+            fcvFilterCorrNxNu8((int8_t*)kernel, ksize, 0, src, width, rangeHeight, src_step, dst.data(), dst_step);
         else if (ddepth == CV_16S)
-            fcvFilterCorrNxNu8s16((int8_t*)kernel, ksize, 0, src, width, rangeHeight, src_step, (int16_t*)dst, dst_step);
+            fcvFilterCorrNxNu8s16((int8_t*)kernel, ksize, 0, src, width, rangeHeight, src_step, (int16_t*)dst.data(), dst_step);
         else if (ddepth == CV_32F)
-            fcvFilterCorrNxNu8f32((float32_t*)kernel, ksize, src, width, rangeHeight, src_step, (float32_t*)dst, dst_step);
+            fcvFilterCorrNxNu8f32((float32_t*)kernel, ksize, src, width, rangeHeight, src_step, (float32_t*)dst.data(), dst_step);
 
-        uchar* dptr = dst_data+range.start*dst_step;
-        uchar* sptr = dst+topLines*dst_step;
+        uchar *dptr = dst_data + range.start * dst_step;
+        uchar *sptr = dst.data() + topLines * dst_step;
         memcpy(dptr, sptr, (range.end - range.start) * dst_step);
     }
 
@@ -237,56 +237,56 @@ class FcvSepFilter2DLoop_Invoker : public cv::ParallelLoopBody
         }
 
         const uchar *src = src_data + (range.start - topLines) * src_step;
-        uchar dst[dst_step*rangeHeight];
+        std::vector<uchar> dst(dst_step*rangeHeight);
 
         switch (ddepth)
         {
             case CV_8U:
             {
                 fcvFilterCorrSepMxNu8((int8_t*)kernelX, kernelXSize, (int8_t*)kernelY, kernelYSize, 0, src, width, rangeHeight,
-                    src_step, dst, dst_step);
+                    src_step, dst.data(), dst_step);
                 break;
             }
             case CV_16S:
             {
-                int16_t tmpImage[width*(rangeHeight+kernelXSize-1)];
+                std::vector<int16_t> tmpImage(width*(rangeHeight+kernelXSize-1));
                 switch (kernelXSize)
                 {
                     case 9:
                     {
                         fcvFilterCorrSep9x9s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
-                            tmpImage, (int16_t*)dst, dst_step);
+                            tmpImage.data(), (int16_t*)dst.data(), dst_step);
                         break;
                     }
                     case 11:
                     {
                         fcvFilterCorrSep11x11s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
-                            tmpImage, (int16_t*)dst, dst_step);
+                            tmpImage.data(), (int16_t*)dst.data(), dst_step);
                         break;
                     }
                     case 13:
                     {
                         fcvFilterCorrSep13x13s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
-                            tmpImage, (int16_t*)dst, dst_step);
+                            tmpImage.data(), (int16_t*)dst.data(), dst_step);
                         break;
                     }
                     case 15:
                     {
                         fcvFilterCorrSep15x15s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
-                            tmpImage, (int16_t*)dst, dst_step);
+                            tmpImage.data(), (int16_t*)dst.data(), dst_step);
                         break;
                     }
                     case 17:
                     {
                         fcvFilterCorrSep17x17s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
-                            tmpImage, (int16_t*)dst, dst_step);
+                            tmpImage.data(), (int16_t*)dst.data(), dst_step);
                         break;
                     }
 
                     default:
                     {
                         fcvFilterCorrSepNxNs16((int16_t*)kernelX, kernelXSize, (int16_t*)src, width, rangeHeight, src_step,
-                            tmpImage, (int16_t*)dst, dst_step);
+                            tmpImage.data(), (int16_t*)dst.data(), dst_step);
                         break;
                     }
                 }
@@ -299,8 +299,8 @@ class FcvSepFilter2DLoop_Invoker : public cv::ParallelLoopBody
             }
         }
 
-        uchar* dptr = dst_data+range.start*dst_step;
-        uchar* sptr = dst+topLines*dst_step;
+        uchar *dptr = dst_data + range.start * dst_step;
+        uchar *sptr = dst.data() + topLines * dst_step;
         memcpy(dptr, sptr, (range.end - range.start) * dst_step);
     }
 

From b6f67e162a4b319e39aed8163571ccd1e2eb168b Mon Sep 17 00:00:00 2001
From: xuezha <quic_xuezha@quicinc.com>
Date: Fri, 13 Dec 2024 07:54:50 +0530
Subject: [PATCH 07/11] remove some moments changes

---
 modules/fastcv/src/moments.cpp       | 46 ++++++++++++++++------------
 modules/fastcv/test/test_moments.cpp | 15 ++-------
 2 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/modules/fastcv/src/moments.cpp b/modules/fastcv/src/moments.cpp
index e40c85a1bc8..3a0c4249eef 100644
--- a/modules/fastcv/src/moments.cpp
+++ b/modules/fastcv/src/moments.cpp
@@ -20,30 +20,36 @@ cv::Moments moments(InputArray _src, bool binary)
     Mat src = _src.getMat();
 
     cv::Moments m;
-    fcvMoments mFCV;
+	if( size.width == 0 || size.height == 0 )
+        return m;
+
+	fcvMoments* mFCV = new fcvMoments();
     fcvStatus status = FASTCV_SUCCESS;
 	if(binary)
     {
-        cv::Mat src_binary(size, CV_8UC1);
-        cv::compare( src, 0, src_binary, cv::CMP_NE );
-        fcvImageMomentsu8(src_binary.data, src_binary.cols,
-                        src_binary.rows, src_binary.step, &mFCV, binary);
-    }
-    else
-    {
-        switch(type)
-        {
-            case CV_8UC1:
-                fcvImageMomentsu8(src.data, src.cols, src.rows, src.step[0], &mFCV, binary);
-                break;
-            case CV_32SC1:
-                fcvImageMomentss32(src.ptr<int>(), src.cols, src.rows, src.step[0], &mFCV, binary);
-                break;
-            case CV_32FC1:
-                fcvImageMomentsf32(src.ptr<float>(), src.cols, src.rows, src.step[0], &mFCV, binary);
-                break;
-        }
+		cv::Mat src_binary(size, CV_8UC1);
+		cv::compare( src, 0, src_binary, cv::CMP_NE );
+		fcvImageMomentsu8(src_binary.data, src_binary.cols,
+		                  src_binary.rows, src_binary.step, mFCV, binary);
     }
+	else
+	{
+		switch(type)
+		{
+			case CV_8UC1:
+			    fcvImageMomentsu8(src.data, src.cols, src.rows,
+				                  src.step, mFCV, binary);
+				break;
+			case CV_32SC1:
+			    fcvImageMomentss32((const int*)src.data, src.cols, src.rows,
+				                  src.step, mFCV, binary);
+				break;
+			case CV_32FC1:
+			    fcvImageMomentsf32((const float*)src.data, src.cols, src.rows,
+				                  src.step, mFCV, binary);
+				break;
+		}
+	}
 
 	if (status != FASTCV_SUCCESS)
     {
diff --git a/modules/fastcv/test/test_moments.cpp b/modules/fastcv/test/test_moments.cpp
index 50bbc812554..d4ef89f98db 100644
--- a/modules/fastcv/test/test_moments.cpp
+++ b/modules/fastcv/test/test_moments.cpp
@@ -29,19 +29,8 @@ TEST_P(fcv_momentsTest, accuracy)
 
 	cv::Moments m = cv::fastcv::moments(src, binaryImage);
 
-    cv::Scalar mean_val, stdDev;
-    float mean_val_fcv = m.m00/(srcSize.width * srcSize.height);
-    if(binaryImage)
-    {
-        cv::Mat src_binary(srcSize, CV_8UC1);
-        cv::compare( src, 0, src_binary, cv::CMP_NE );
-        mean_val = cv::mean(src_binary);
-        mean_val_fcv *= 255;
-    }
-    else
-        mean_val = cv::mean(src);
-
-    EXPECT_NEAR(mean_val[0], mean_val_fcv, 2);
+    int len_m = sizeof(m)/sizeof(m.m00);
+    EXPECT_FALSE(len_m != 24);
 }
 
 INSTANTIATE_TEST_CASE_P(/*nothing*/, fcv_momentsTest, Combine(

From a749bd48683c7dfa1f7f793772e9501c19890606 Mon Sep 17 00:00:00 2001
From: Xue Zhang <quic_xuezha@quicinc.com>
Date: Fri, 13 Dec 2024 11:33:06 +0530
Subject: [PATCH 08/11] Add copyright and updated Description

---
 .../fastcv/include/opencv2/fastcv/arithm.hpp  |  3 +-
 .../fastcv/include/opencv2/fastcv/blur.hpp    | 19 +++++++++----
 .../fastcv/include/opencv2/fastcv/cluster.hpp |  3 +-
 .../fastcv/include/opencv2/fastcv/draw.hpp    |  2 +-
 .../fastcv/include/opencv2/fastcv/edges.hpp   | 28 +++++++++++++------
 .../fastcv/include/opencv2/fastcv/fast10.hpp  |  7 +++--
 modules/fastcv/include/opencv2/fastcv/fft.hpp |  2 +-
 .../include/opencv2/fastcv/ipptransform.hpp   |  5 ++--
 .../fastcv/include/opencv2/fastcv/pyramid.hpp |  5 ++--
 .../fastcv/include/opencv2/fastcv/scale.hpp   |  2 ++
 .../fastcv/include/opencv2/fastcv/shift.hpp   |  7 +++--
 .../fastcv/include/opencv2/fastcv/smooth.hpp  |  1 +
 .../fastcv/include/opencv2/fastcv/thresh.hpp  |  2 +-
 .../include/opencv2/fastcv/tracking.hpp       |  3 +-
 .../fastcv/include/opencv2/fastcv/warp.hpp    | 18 ++++++++----
 modules/fastcv/perf/perf_fft_dct.cpp          | 11 +-------
 modules/fastcv/src/edges.cpp                  |  6 +++-
 modules/fastcv/src/pyramid.cpp                |  5 +++-
 modules/fastcv/src/tracking.cpp               |  5 +++-
 modules/fastcv/src/warp.cpp                   |  5 ++++
 modules/fastcv/test/test_scale.cpp            |  4 +--
 modules/fastcv/test/test_warp.cpp             |  8 ++++--
 22 files changed, 99 insertions(+), 52 deletions(-)

diff --git a/modules/fastcv/include/opencv2/fastcv/arithm.hpp b/modules/fastcv/include/opencv2/fastcv/arithm.hpp
index e479d970b1d..5a0c43b2408 100644
--- a/modules/fastcv/include/opencv2/fastcv/arithm.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/arithm.hpp
@@ -16,7 +16,8 @@ namespace fastcv {
 
 /**
  * @brief Matrix multiplication of two int8_t type matrices
-
+ *        Uses signed integer input/output, whereas cv::gemm uses floating point input/output.
+ *        matmuls8s32 provides enhanced speed on Qualcomm's processors.
  * @param src1 First source matrix of type CV_8S
  * @param src2 Second source matrix of type CV_8S
  * @param dst Resulting matrix of type CV_32S
diff --git a/modules/fastcv/include/opencv2/fastcv/blur.hpp b/modules/fastcv/include/opencv2/fastcv/blur.hpp
index 424a11fa53b..9fb59156dc5 100644
--- a/modules/fastcv/include/opencv2/fastcv/blur.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/blur.hpp
@@ -1,3 +1,8 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
 #ifndef OPENCV_FASTCV_BLUR_HPP
 #define OPENCV_FASTCV_BLUR_HPP
 
@@ -14,18 +19,20 @@ namespace fastcv {
 //! @{
 
 /**
- * @brief Gaussian blur with sigma = 0 and square kernel size
+ * @brief Gaussian blur with sigma = 0 and square kernel size. Borders are handled differently from cv::GaussianBlur,
+ *        leading to slight variations in the output.
  * @param _src Intput image with type CV_8UC1
  * @param _dst Output image with type CV_8UC1
  * @param kernel_size Filer kernel size. One of 3, 5, 11
- * @param blur_border Blur border or not
+ * @param blur_border If set to true, the border is blurred by 0-padding adjacent values (a variant of the constant border).
+ *                    If set to false, borders up to half-kernel width are ignored (e.g. 1 pixel in the 3x3 case).
  *
  * @sa GaussianBlur
  */
 CV_EXPORTS_W void gaussianBlur(cv::InputArray _src, cv::OutputArray _dst, int kernel_size = 3, bool blur_border = true);
 
 /**
- * @brief Filter an image with non-separable kernel
+ * @brief NxN correlation with non-separable kernel. Borders up to half-kernel width are ignored
  * @param _src Intput image with type CV_8UC1
  * @param _dst Output image with type CV_8UC1, CV_16SC1 or CV_32FC1
  * @param ddepth The depth of output image
@@ -36,8 +43,10 @@ CV_EXPORTS_W void gaussianBlur(cv::InputArray _src, cv::OutputArray _dst, int ke
 CV_EXPORTS_W void filter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernel);
 
 /**
- * @brief sepFilter an image with separable kernel.The way of handling overflow is different with OpenCV, this function will
- * do right shift for the intermediate results and final result.
+ * @brief NxN correlation with separable kernel. If srcImg and dstImg point to the same address and srcStride equals dstStride,
+ *        the operation is done in-place. Borders up to half-kernel width are ignored.
+ *        Overflow is handled differently from OpenCV: this function right-shifts
+ *        the intermediate results and the final result.
  * @param _src Intput image with type CV_8UC1
  * @param _dst Output image with type CV_8UC1, CV_16SC1
  * @param ddepth The depth of output image
diff --git a/modules/fastcv/include/opencv2/fastcv/cluster.hpp b/modules/fastcv/include/opencv2/fastcv/cluster.hpp
index 65f4540862e..46ac7ad103d 100644
--- a/modules/fastcv/include/opencv2/fastcv/cluster.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/cluster.hpp
@@ -16,7 +16,8 @@ namespace fastcv {
 
 /**
  * @brief Clusterizes N input points in D-dimensional space into K clusters
- *
+ *        Accepts 8-bit unsigned integer points
+ *        Provides faster execution time than cv::kmeans on Qualcomm's processors
  * @param points            Points array of type 8u, each row represets a point.
  *                          Size is N rows by D columns, can be non-continuous.
  * @param clusterCenters    Initial cluster centers array of type 32f, each row represents a center.
diff --git a/modules/fastcv/include/opencv2/fastcv/draw.hpp b/modules/fastcv/include/opencv2/fastcv/draw.hpp
index baa2b58c930..1abb5f55080 100644
--- a/modules/fastcv/include/opencv2/fastcv/draw.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/draw.hpp
@@ -17,7 +17,7 @@ namespace fastcv {
 /**
  * @brief Draw convex polygon
           This function fills the interior of a convex polygon with the specified color.
-
+          Requires the width and stride to be multiples of 8.
  * @param img Image to draw on. Should have up to 4 8-bit channels
  * @param pts Array of polygon points coordinates. Should contain N two-channel or 2*N one-channel 32-bit integer elements
  * @param color Color of drawn polygon stored as B,G,R and A(if supported)
diff --git a/modules/fastcv/include/opencv2/fastcv/edges.hpp b/modules/fastcv/include/opencv2/fastcv/edges.hpp
index c8d67e9741e..7a947e80bff 100644
--- a/modules/fastcv/include/opencv2/fastcv/edges.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/edges.hpp
@@ -1,5 +1,11 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
 #ifndef OPENCV_EDGES_HPP
 #define OPENCV_EDGES_HPP
+
 #include "opencv2/core/mat.hpp"
 
 namespace cv {
@@ -12,23 +18,29 @@ namespace fastcv {
 //! @{
 
 /**
- * @brief Sobel filter with return dx and dy separately
+ * @brief Creates a 2D gradient image from source luminance data without normalization.
+ *        Calculates the first-order derivative in the X direction, the Y direction, or both at the same time.
  * @param _src          Input image with type CV_8UC1
- * @param _dx           X direction 1 order derivative with type CV_16SC1.
- * @param _dy           Y direction 1 order derivative with type CV_16SC1 (same size with _dx).
+ * @param _dx           Buffer to store horizontal gradient. Must be (dxyStride)*(height) bytes in size.
+ *                      If NULL, the horizontal gradient will not be calculated.
+ * @param _dy           Buffer to store vertical gradient. Must be (dxyStride)*(height) bytes in size.
+ *                      If NULL, the vertical gradient will not be calculated
  * @param kernel_size   Sobel kernel size, support 3x3, 5x5, 7x7
- * @param borderType    Border type
+ * @param borderType    Border type, support BORDER_CONSTANT, BORDER_REPLICATE
  * @param borderValue   Border value for constant border
 */
 CV_EXPORTS_W void sobel(cv::InputArray _src, cv::OutputArray _dx, cv::OutputArray _dy, int kernel_size, int borderType,
     int borderValue);
 
 /**
- * @brief 3x3 Sobel filter without border
+ * @brief Creates a 2D gradient image from source luminance data without normalization.
+ *        This function computes central differences on 3x3 neighborhood and then convolves the result with Sobel kernel,
+ *        borders up to half-kernel width are ignored.
  * @param _src          Input image with type CV_8UC1
- * @param _dst          If _dsty is not needed, will store 8-bit result of |dx|+|dy|,
- *                      otherwise will store the result of X direction 1 order derivative
- * @param _dsty         If this param is needed, will store the result of Y direction 1 order derivative
+ * @param _dst          If _dsty is given, buffer to store horizontal gradient; otherwise, output 8-bit image of |dx|+|dy|.
+ *                      Size of buffer is (srcwidth)*(srcheight) bytes.
+ * @param _dsty         (Optional) Buffer to store vertical gradient. Must be (srcwidth)*(srcheight) in size.
+ * @param ddepth        The depth of output image: CV_8SC1, CV_16SC1 or CV_32FC1.
  * @param normalization If do normalization for the result
 */
 CV_EXPORTS_W void sobel3x3u8(cv::InputArray _src, cv::OutputArray _dst, cv::OutputArray _dsty = noArray(), int ddepth = CV_8U,
diff --git a/modules/fastcv/include/opencv2/fastcv/fast10.hpp b/modules/fastcv/include/opencv2/fastcv/fast10.hpp
index 1d97e9d0df7..1dd15ac198c 100644
--- a/modules/fastcv/include/opencv2/fastcv/fast10.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/fast10.hpp
@@ -15,9 +15,10 @@ namespace fastcv {
 //! @{
 
 /**
- * @brief Extracts FAST corners and scores from the image based on the mask.
-          The mask specifies pixels to be ignored by the detector
-
+ * @brief Extracts FAST10 corners and scores from the image based on the mask.
+ *        The mask specifies pixels to be ignored by the detector.
+ *        Designed for corner detection on Qualcomm's processors; provides enhanced speed.
+ *
  * @param src 8-bit grayscale image
  * @param mask Optional mask indicating which pixels should be omited from corner dection.
                Its size should be k times image width and height, where k = 1/2, 1/4 , 1/8 , 1, 2, 4 and 8
diff --git a/modules/fastcv/include/opencv2/fastcv/fft.hpp b/modules/fastcv/include/opencv2/fastcv/fft.hpp
index 88901a6a4f8..1aef585035b 100644
--- a/modules/fastcv/include/opencv2/fastcv/fft.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/fft.hpp
@@ -18,7 +18,7 @@ namespace fastcv {
  * @brief Computes the 1D or 2D Fast Fourier Transform of a real valued matrix.
           For the 2D case, the width and height of the input and output matrix must be powers of 2.
           For the 1D case, the height of the matrices must be 1, while the width must be a power of 2.
-
+          Accepts 8-bit unsigned integer array, whereas cv::dft accepts floating-point or complex array.
  * @param src Input array of CV_8UC1. The dimensions of the matrix must be powers of 2 for the 2D case,
               and in the 1D case, the height must be 1, while the width must be a power of 2.
  * @param dst The computed FFT matrix of type CV_32FC2. The FFT Re and Im coefficients are stored in different channels.
diff --git a/modules/fastcv/include/opencv2/fastcv/ipptransform.hpp b/modules/fastcv/include/opencv2/fastcv/ipptransform.hpp
index cba87d69af7..42c8c94ea78 100644
--- a/modules/fastcv/include/opencv2/fastcv/ipptransform.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/ipptransform.hpp
@@ -16,7 +16,8 @@ namespace fastcv {
 
 /**
  * @brief This function performs 8x8 forward discrete Cosine transform on input image
- * 
+ *        Accepts input of type 8-bit unsigned integer and produces output of type 16-bit signed integer.
+ *        Provides faster execution time than cv::dct on Qualcomm's processor.
  * @param src Input image of type CV_8UC1
  * @param dst Output image of type CV_16SC1
  */
@@ -24,7 +25,7 @@ CV_EXPORTS_W void DCT(InputArray src, OutputArray dst);
 
 /**
  * @brief This function performs 8x8 inverse discrete Cosine transform on input image
- *
+ * Provides faster execution time than cv::dct in the inverse case on Qualcomm's processor.
  * @param src Input image of type CV_16SC1
  * @param dst Output image of type CV_8UC1
  */
diff --git a/modules/fastcv/include/opencv2/fastcv/pyramid.hpp b/modules/fastcv/include/opencv2/fastcv/pyramid.hpp
index 2e7a89e98f6..6c20a21ab78 100644
--- a/modules/fastcv/include/opencv2/fastcv/pyramid.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/pyramid.hpp
@@ -27,7 +27,7 @@ CV_EXPORTS_W void sobelPyramid(InputArrayOfArrays pyr, OutputArrayOfArrays dx, O
 /**
  * @brief Builds an image pyramid of float32 arising from a single
     original image - that are successively downscaled w.r.t. the
-    pre-set levels.
+    pre-set levels. This API supports both ORB scaling and downscaling by half.
  *
  * @param src Input single-channel image of type 8U or 32F
  * @param pyr Output array containing nLevels downscaled image copies
@@ -36,7 +36,8 @@ CV_EXPORTS_W void sobelPyramid(InputArrayOfArrays pyr, OutputArrayOfArrays dx, O
  *                 ORB scaling is not supported for float point images
  * @param borderType how to process border, the options are BORDER_REFLECT (maps to FASTCV_BORDER_REFLECT),
  *                   BORDER_REFLECT_101 (maps to FASTCV_BORDER_REFLECT_V2) and BORDER_REPLICATE (maps to FASTCV_BORDER_REPLICATE).
- *                   Other border types are mapped to FASTCV_BORDER_UNDEFINED. Ignored for float point images
+ *                   Other border types are mapped to FASTCV_BORDER_UNDEFINED (border pixels are ignored). Currently, borders are
+ *                   only supported for downscaling by half and are ignored for ORB scaling. Also ignored for float point images
  * @param borderValue what value should be used to fill border, ignored for float point images
  */
 CV_EXPORTS_W void buildPyramid(InputArray src, OutputArrayOfArrays pyr, int nLevels, bool scaleBy2 = true,
diff --git a/modules/fastcv/include/opencv2/fastcv/scale.hpp b/modules/fastcv/include/opencv2/fastcv/scale.hpp
index 8d7d084ac24..276b2304050 100644
--- a/modules/fastcv/include/opencv2/fastcv/scale.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/scale.hpp
@@ -16,6 +16,7 @@ namespace fastcv {
 
 /**
  * @brief Down-scale the image by averaging each 2x2 pixel block.
+ *        This function is not bit-exact with cv::resize but provides faster execution time on Qualcomm processors.
  * @param _src The first input image data, type CV_8UC1, src height must be a multiple of 2
  * @param _dst The output image data, type CV_8UC1
 */
@@ -23,6 +24,7 @@ CV_EXPORTS_W void resizeDownBy2(cv::InputArray _src, cv::OutputArray _dst);
 
 /**
  * @brief Down-scale the image by averaging each 4x4 pixel block.
+ *        This function is not bit-exact with cv::resize but provides faster execution time on Qualcomm processors.
  * @param _src The first input image data, type CV_8UC1, src height must be a multiple of 4
  * @param _dst The output image data, type CV_8UC1
 */
diff --git a/modules/fastcv/include/opencv2/fastcv/shift.hpp b/modules/fastcv/include/opencv2/fastcv/shift.hpp
index a545789f199..3ca2c22f2fc 100644
--- a/modules/fastcv/include/opencv2/fastcv/shift.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/shift.hpp
@@ -18,9 +18,12 @@ namespace fastcv {
  * @brief Applies the meanshift procedure and obtains the final converged position.
           This function applies the meanshift procedure to an original image (usually a probability image)
           and obtains the final converged position. The converged position search will stop either it has reached
-          the required accuracy or the maximum number of iterations.
+          the required accuracy or the maximum number of iterations. Moments used in the algorithm are calculated
+          in floating point.
+          This function isn't bit-exact with cv::meanShift but provides improved latency on Snapdragon processors.
 
- * @param src 8-bit grayscale image which is usually a probability image computed based on object histogram
+ * @param src 8-bit, 32-bit int or 32-bit float grayscale image which is usually a probability image
+ *            computed based on object histogram
  * @param rect Initial search window position which also returns the final converged window position
  * @param termCrit The criteria used to finish the MeanShift which consists of two termination criteria:
  *                 1) epsilon: required accuracy; 2) max_iter: maximum number of iterations
diff --git a/modules/fastcv/include/opencv2/fastcv/smooth.hpp b/modules/fastcv/include/opencv2/fastcv/smooth.hpp
index a3cee45a3ce..2127ae5a23d 100644
--- a/modules/fastcv/include/opencv2/fastcv/smooth.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/smooth.hpp
@@ -20,6 +20,7 @@ namespace fastcv {
 Different from traditional bilateral filtering, here the smoothing is actually performed in gradient domain.
 The algorithm claims that it's more efficient than the original bilateral filtering in both image quality and computation.
 See algorithm description in the paper Recursive Bilateral Filtering, ECCV2012 by Prof Yang Qingxiong
+This function isn't bit-exact with cv::bilateralFilter but provides improved latency on Snapdragon processors.
  * @param src Input image, should have one CV_8U channel
  * @param dst Output array having one CV_8U channel
  * @param sigmaColor Sigma in the color space, the bigger the value the more color difference is smoothed by the algorithm
diff --git a/modules/fastcv/include/opencv2/fastcv/thresh.hpp b/modules/fastcv/include/opencv2/fastcv/thresh.hpp
index 878761d75d5..418f98a012d 100644
--- a/modules/fastcv/include/opencv2/fastcv/thresh.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/thresh.hpp
@@ -17,7 +17,7 @@ namespace fastcv {
 /**
  * @brief Binarizes a grayscale image based on a pair of threshold values. The binarized image will be in the two values
  *        selected by user
-
+ *        This function provides improved latency on Snapdragon processors.
  * @param src 8-bit grayscale image
  * @param dst Output image of the same size and type as input image, can be the same as input image
  * @param lowThresh The lower threshold value for binarization
diff --git a/modules/fastcv/include/opencv2/fastcv/tracking.hpp b/modules/fastcv/include/opencv2/fastcv/tracking.hpp
index 95b9ab2466e..9cca92c1239 100644
--- a/modules/fastcv/include/opencv2/fastcv/tracking.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/tracking.hpp
@@ -16,7 +16,8 @@ namespace fastcv {
 
 /**
  * @brief Calculates sparse optical flow using Lucas-Kanade algorithm
- *
+ *        Accepts 8-bit unsigned integer images.
+ *        Provides faster execution time on Qualcomm processors.
  * @param src Input single-channel image of type 8U, initial motion frame
  * @param dst Input single-channel image of type 8U, final motion frame, should have the same size and stride as initial frame
  * @param srcPyr Pyramid built from intial motion frame
diff --git a/modules/fastcv/include/opencv2/fastcv/warp.hpp b/modules/fastcv/include/opencv2/fastcv/warp.hpp
index 6704751c4cd..26a8be9f303 100644
--- a/modules/fastcv/include/opencv2/fastcv/warp.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/warp.hpp
@@ -1,6 +1,11 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
 #ifndef OPENCV_WARP_HPP
 #define OPENCV_WARP_HPP
-#include "opencv2/core/mat.hpp"
+
 #include <opencv2/imgproc.hpp>
 namespace cv {
 namespace fastcv {
@@ -13,11 +18,12 @@ namespace fastcv {
 //! @{
 
 /**
- * @brief Perspective warp two images using the same transformation. Bi-linear interpolation is used where applicable
- * @param _src1     The first input image data, type CV_8UC1
- * @param _src2     The second input image data, type CV_8UC1
- * @param _dst1     The first output image data, type CV_8UC1
- * @param _dst2     The second output image data, type CV_8UC1
+ * @brief Perspective warp two images using the same transformation. Bi-linear interpolation is used where applicable.
+ *        For example, to warp a grayscale image and an alpha image at the same time, or warp two color channels.
+ * @param _src1     First input 8-bit image. Size of buffer is src1Stride*srcHeight bytes.
+ * @param _src2     Second input 8-bit image. Size of buffer is src2Stride*srcHeight bytes.
+ * @param _dst1     First warped output image (corresponds to src1). Size of buffer is dst1Stride*dstHeight bytes, type CV_8UC1
+ * @param _dst2     Second warped output image (corresponds to src2). Size of buffer is dst2Stride*dstHeight bytes, type CV_8UC1
  * @param _M0       The 3x3 perspective transformation matrix (inversed map)
  * @param dsize     The output image size
 */
diff --git a/modules/fastcv/perf/perf_fft_dct.cpp b/modules/fastcv/perf/perf_fft_dct.cpp
index 829d2aaa766..30e4e68ce62 100644
--- a/modules/fastcv/perf/perf_fft_dct.cpp
+++ b/modules/fastcv/perf/perf_fft_dct.cpp
@@ -17,8 +17,6 @@ PERF_TEST_P_(FFTExtPerfTest, forward)
     Mat src(size, CV_8UC1);
     cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256));
 
-    Mat srcFloat;
-    src.convertTo(srcFloat, CV_32F);
     Mat dst;
 
     while(next())
@@ -39,9 +37,6 @@ PERF_TEST_P_(FFTExtPerfTest, inverse)
     Mat src(size, CV_8UC1);
     cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256));
 
-    Mat srcFloat;
-    src.convertTo(srcFloat, CV_32F);
-
     Mat fwd, back;
     cv::fastcv::FFT(src, fwd);
 
@@ -70,8 +65,7 @@ PERF_TEST_P_(DCTExtPerfTest, forward)
     RNG& rng = cv::theRNG();
     Mat src(size, CV_8UC1);
     cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256));
-    Mat srcFloat;
-    src.convertTo(srcFloat, CV_32F);
+
     Mat dst, ref;
 
     while(next())
@@ -92,9 +86,6 @@ PERF_TEST_P_(DCTExtPerfTest, inverse)
     Mat src(size, CV_8UC1);
     cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256));
 
-    Mat srcFloat;
-    src.convertTo(srcFloat, CV_32F);
-
     Mat fwd, back;
     cv::fastcv::DCT(src, fwd);
 
diff --git a/modules/fastcv/src/edges.cpp b/modules/fastcv/src/edges.cpp
index 76bbc14e889..ad90b9e71ee 100644
--- a/modules/fastcv/src/edges.cpp
+++ b/modules/fastcv/src/edges.cpp
@@ -1,3 +1,8 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
 #include "precomp.hpp"
 
 namespace cv {
@@ -73,7 +78,6 @@ void sobel(cv::InputArray _src, cv::OutputArray _dx, cv::OutputArray _dy, int ke
 
     switch (borderType)
     {
-        // For constant border, there are no border value, OpenCV default value is 0
         case cv::BorderTypes::BORDER_CONSTANT:
         {
             fcvBorder = fcvBorderType::FASTCV_BORDER_CONSTANT;
diff --git a/modules/fastcv/src/pyramid.cpp b/modules/fastcv/src/pyramid.cpp
index 24dd4928899..79a59086e51 100644
--- a/modules/fastcv/src/pyramid.cpp
+++ b/modules/fastcv/src/pyramid.cpp
@@ -1,4 +1,7 @@
-// License text goes here
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
 
 #include "precomp.hpp"
 
diff --git a/modules/fastcv/src/tracking.cpp b/modules/fastcv/src/tracking.cpp
index dee6b17ee55..778c73c323e 100644
--- a/modules/fastcv/src/tracking.cpp
+++ b/modules/fastcv/src/tracking.cpp
@@ -1,4 +1,7 @@
-// License text goes here
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
 
 #include "precomp.hpp"
 
diff --git a/modules/fastcv/src/warp.cpp b/modules/fastcv/src/warp.cpp
index 09cfc09e1aa..01f83bdf510 100644
--- a/modules/fastcv/src/warp.cpp
+++ b/modules/fastcv/src/warp.cpp
@@ -1,3 +1,8 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
 #include "precomp.hpp"
 
 namespace cv {
diff --git a/modules/fastcv/test/test_scale.cpp b/modules/fastcv/test/test_scale.cpp
index f98c9cc4ceb..b8e84218ed8 100644
--- a/modules/fastcv/test/test_scale.cpp
+++ b/modules/fastcv/test/test_scale.cpp
@@ -70,7 +70,7 @@ TEST_P(ResizeBy2Test, ResizeBy2) {
     EXPECT_EQ(resized_image.size().height, size.height * 0.5);
 }
 
-TEST_P(ResizeBy4Test, ResizeBy2) {
+TEST_P(ResizeBy4Test, ResizeBy4) {
 
     //Size size = get<0>(GetParam());
     Size size = GetParam();
@@ -80,7 +80,7 @@ TEST_P(ResizeBy4Test, ResizeBy2) {
     Size dsize;
     cv::Mat resized_image;
 
-    // Resize the image by a factor of 2
+    // Resize the image by a factor of 4
     cv::fastcv::resizeDownBy4(inputImage, resized_image);
 
     // Check if the output size is correct
diff --git a/modules/fastcv/test/test_warp.cpp b/modules/fastcv/test/test_warp.cpp
index 38ce2752f60..240262f93ca 100644
--- a/modules/fastcv/test/test_warp.cpp
+++ b/modules/fastcv/test/test_warp.cpp
@@ -1,3 +1,8 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
 #include "test_precomp.hpp"
 
 namespace opencv_test { namespace {
@@ -54,9 +59,6 @@ TEST_P(WarpPerspective2Plane, accuracy)
     cv::threshold(difference2, mask2, 5, 255, cv::THRESH_BINARY);
     int num_diff_pixels_1 = cv::countNonZero(mask1);
     int num_diff_pixels_2 = cv::countNonZero(mask2);
-    // imwrite(format("/tmp/image/warp_dst_%dx%d.png",dstSize.width,dstSize.height),dst1);
-    // imwrite(format("/tmp/image/warp_ref_%dx%d.png",dstSize.width,dstSize.height),ref1);
-    // imwrite(format("/tmp/image/warp_diff_%dx%d.png",dstSize.width,dstSize.height),difference_1);
 
     EXPECT_LT(num_diff_pixels_1, src.size().area()*0.02);
     EXPECT_LT(num_diff_pixels_2, src.size().area()*0.02);

From d0dc5f5091ba111565a9c2abd6699194f9ecefa1 Mon Sep 17 00:00:00 2001
From: Xue Zhang <quic_xuezha@quicinc.com>
Date: Mon, 16 Dec 2024 13:19:49 +0530
Subject: [PATCH 09/11] remove memcpy

---
 .../fastcv/include/opencv2/fastcv/blur.hpp    |   2 +-
 .../fastcv/include/opencv2/fastcv/edges.hpp   |   5 +-
 .../fastcv/include/opencv2/fastcv/warp.hpp    |   4 +-
 modules/fastcv/src/blur.cpp                   | 275 +++++++++---------
 4 files changed, 141 insertions(+), 145 deletions(-)

diff --git a/modules/fastcv/include/opencv2/fastcv/blur.hpp b/modules/fastcv/include/opencv2/fastcv/blur.hpp
index 9fb59156dc5..99d1cd3d655 100644
--- a/modules/fastcv/include/opencv2/fastcv/blur.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/blur.hpp
@@ -29,7 +29,7 @@ namespace fastcv {
  *
  * @sa GaussianBlur
  */
-CV_EXPORTS_W void gaussianBlur(cv::InputArray _src, cv::OutputArray _dst, int kernel_size = 3, bool blur_border = true);
+CV_EXPORTS_W void gaussianBlur(InputArray _src, OutputArray _dst, int kernel_size = 3, bool blur_border = true);
 
 /**
  * @brief NxN correlation with non-separable kernel. Borders up to half-kernel width are ignored
diff --git a/modules/fastcv/include/opencv2/fastcv/edges.hpp b/modules/fastcv/include/opencv2/fastcv/edges.hpp
index 7a947e80bff..dd2677bf415 100644
--- a/modules/fastcv/include/opencv2/fastcv/edges.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/edges.hpp
@@ -29,8 +29,7 @@ namespace fastcv {
  * @param borderType    Border type, support BORDER_CONSTANT, BORDER_REPLICATE
  * @param borderValue   Border value for constant border
 */
-CV_EXPORTS_W void sobel(cv::InputArray _src, cv::OutputArray _dx, cv::OutputArray _dy, int kernel_size, int borderType,
-    int borderValue);
+CV_EXPORTS_W void sobel(InputArray _src, OutputArray _dx, OutputArray _dy, int kernel_size, int borderType, int borderValue);
 
 /**
  * @brief Creates a 2D gradient image from source luminance data without normalization.
@@ -43,7 +42,7 @@ CV_EXPORTS_W void sobel(cv::InputArray _src, cv::OutputArray _dx, cv::OutputArra
  * @param ddepth        The depth of output image CV_8SC1,CV_16SC1,CV_32FC1,
  * @param normalization If do normalization for the result
 */
-CV_EXPORTS_W void sobel3x3u8(cv::InputArray _src, cv::OutputArray _dst, cv::OutputArray _dsty = noArray(), int ddepth = CV_8U,
+CV_EXPORTS_W void sobel3x3u8(InputArray _src, OutputArray _dst, OutputArray _dsty = noArray(), int ddepth = CV_8U,
     bool normalization = false);
 
 //! @}
diff --git a/modules/fastcv/include/opencv2/fastcv/warp.hpp b/modules/fastcv/include/opencv2/fastcv/warp.hpp
index 26a8be9f303..8f58cd36577 100644
--- a/modules/fastcv/include/opencv2/fastcv/warp.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/warp.hpp
@@ -27,8 +27,8 @@ namespace fastcv {
  * @param _M0       The 3x3 perspective transformation matrix (inversed map)
  * @param dsize     The output image size
 */
-CV_EXPORTS_W void warpPerspective2Plane(cv::InputArray _src1, cv::InputArray _src2, cv::OutputArray _dst1,
-    cv::OutputArray _dst2, InputArray _M0, Size dsize);
+CV_EXPORTS_W void warpPerspective2Plane(InputArray _src1, InputArray _src2, OutputArray _dst1, OutputArray _dst2,
+    InputArray _M0, Size dsize);
 
 //! @}
 
diff --git a/modules/fastcv/src/blur.cpp b/modules/fastcv/src/blur.cpp
index 7c2efc06268..66058a37b5a 100644
--- a/modules/fastcv/src/blur.cpp
+++ b/modules/fastcv/src/blur.cpp
@@ -8,64 +8,65 @@
 namespace cv {
 namespace fastcv {
 
-class FcvGaussianBlurLoop_Invoker : public cv::ParallelLoopBody
+class FcvGaussianBlurLoop_Invoker : public ParallelLoopBody
 {
     public:
 
-    FcvGaussianBlurLoop_Invoker(const uchar* _src_data, size_t _src_step, uchar* _dst_data, size_t _dst_step, int _width,
-        int _height, int _ksize, int _depth, fcvBorderType _fcvBorder, int _fcvBorderValue) :
-        cv::ParallelLoopBody(), src_data(_src_data), src_step(_src_step), dst_data(_dst_data), dst_step(_dst_step), width(_width),
-        height(_height), ksize(_ksize), depth(_depth), fcvBorder(_fcvBorder), fcvBorderValue(_fcvBorderValue)
+    FcvGaussianBlurLoop_Invoker(const Mat& _src, Mat& _dst, int _ksize, fcvBorderType _fcvBorder, int _fcvBorderValue) :
+        ParallelLoopBody(), src(_src),dst(_dst), ksize(_ksize), fcvBorder(_fcvBorder), fcvBorderValue(_fcvBorderValue)
     {
-        half_ksize = ksize/2;
-        fcvFuncType = FCV_MAKETYPE(ksize,depth);
+        width       = src.cols;
+        height      = src.rows;
+        halfKsize   = ksize / 2;
+        fcvFuncType = FCV_MAKETYPE(ksize, src.depth());
     }
 
-    virtual void operator()(const cv::Range& range) const CV_OVERRIDE
+    virtual void operator()(const Range& range) const CV_OVERRIDE
     {
-        int topLines    = 0;
-        int rangeHeight = range.end-range.start;
+        int topLines     = 0;
+        int rangeHeight  = range.end-range.start;
+        int paddedHeight = rangeHeight;
 
-        if(range.start >= half_ksize)
+        if(range.start != 0)
         {
-            topLines  += half_ksize;
-            rangeHeight += half_ksize;
+            topLines     += halfKsize;
+            paddedHeight += halfKsize;
         }
 
-        if(range.end <= height-half_ksize)
+        if(range.end != height)
         {
-            rangeHeight += half_ksize;
+            paddedHeight += halfKsize;
         }
 
-        const uchar* src = src_data + (range.start-topLines)*src_step;
-        std::vector<uchar> dst(dst_step*rangeHeight);
+        const Mat srcPadded = src(Rect(0, range.start - topLines, width, paddedHeight));
+        Mat dstPadded       = Mat(paddedHeight, width, dst.depth());
 
         if (fcvFuncType == FCV_MAKETYPE(3,CV_8U))
-            fcvFilterGaussian3x3u8_v4(src, width, rangeHeight, src_step, dst.data(), dst_step, fcvBorder, 0);
+            fcvFilterGaussian3x3u8_v4(srcPadded.data, width, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step, fcvBorder, 0);
         else if (fcvFuncType == FCV_MAKETYPE(5,CV_8U))
-            fcvFilterGaussian5x5u8_v3(src, width, rangeHeight, src_step, dst.data(), dst_step, fcvBorder, 0);
+            fcvFilterGaussian5x5u8_v3(srcPadded.data, width, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step, fcvBorder, 0);
         else if (fcvFuncType == FCV_MAKETYPE(5,CV_16S))
-            fcvFilterGaussian5x5s16_v3((int16_t*)src, width, rangeHeight, src_step, (int16_t*)dst.data(), dst_step, fcvBorder, 0);
+            fcvFilterGaussian5x5s16_v3((int16_t*)srcPadded.data, width, paddedHeight, srcPadded.step, (int16_t*)dstPadded.data,
+                dstPadded.step, fcvBorder, 0);
         else if (fcvFuncType == FCV_MAKETYPE(5,CV_32S))
-            fcvFilterGaussian5x5s32_v3((int32_t*)src, width, rangeHeight, src_step, (int32_t*)dst.data(), dst_step, fcvBorder, 0);
+            fcvFilterGaussian5x5s32_v3((int32_t*)srcPadded.data, width, paddedHeight, srcPadded.step, (int32_t*)dstPadded.data,
+                dstPadded.step, fcvBorder, 0);
         else if (fcvFuncType == FCV_MAKETYPE(11,CV_8U))
-            fcvFilterGaussian11x11u8_v2(src, width, rangeHeight, src_step, dst.data(), dst_step, fcvBorder);
+            fcvFilterGaussian11x11u8_v2(srcPadded.data, width, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step, fcvBorder);
 
-        uchar *dptr = dst_data + range.start * dst_step;
-        uchar *sptr = dst.data() + topLines * dst_step;
-        memcpy(dptr, sptr, (range.end - range.start) * dst_step);
+        // Only copy center part back to output image and ignore the padded lines
+        Mat temp1 = dstPadded(Rect(0, topLines, width, rangeHeight));
+        Mat temp2 = dst(Rect(0, range.start, width, rangeHeight));
+        temp1.copyTo(temp2);
     }
 
     private:
-    const uchar*    src_data;
-    const size_t    src_step;
-    uchar*          dst_data;
-    const size_t    dst_step;
-    const int       width;
-    const int       height;
+    const Mat&      src;
+    Mat&            dst;
+    int             width;
+    int             height;
     const int       ksize;
-    const int       depth;
-    int             half_ksize;
+    int             halfKsize;
     int             fcvFuncType;
     fcvBorderType   fcvBorder;
     int             fcvBorderValue;
@@ -74,7 +75,7 @@ class FcvGaussianBlurLoop_Invoker : public cv::ParallelLoopBody
     const FcvGaussianBlurLoop_Invoker& operator= (const FcvGaussianBlurLoop_Invoker &);  // = delete;
 };
 
-void gaussianBlur(cv::InputArray _src, cv::OutputArray _dst, int kernel_size, bool blur_border)
+void gaussianBlur(InputArray _src, OutputArray _dst, int kernel_size, bool blur_border)
 {
     INITIALIZATION_CHECK;
 
@@ -87,7 +88,8 @@ void gaussianBlur(cv::InputArray _src, cv::OutputArray _dst, int kernel_size, bo
     Mat src = _src.getMat();
     Mat dst = _dst.getMat();
 
-    int nStripes = src.rows / 80 == 0 ? 1 : src.rows / 80;
+    int nThreads = getNumThreads();
+    int nStripes = (nThreads > 1) ? ((src.rows > 60) ? 3 * nThreads : 1) : 1;
 
     fcvBorderType fcvBorder = blur_border ? FASTCV_BORDER_ZERO_PADDING : FASTCV_BORDER_UNDEFINED;
 
@@ -95,68 +97,69 @@ void gaussianBlur(cv::InputArray _src, cv::OutputArray _dst, int kernel_size, bo
         ((type == CV_16SC1) && (kernel_size == 5)) ||
         ((type == CV_32SC1) && (kernel_size == 5)))
     {
-        cv::parallel_for_(cv::Range(0, src.rows),
-            FcvGaussianBlurLoop_Invoker(src.data, src.step, dst.data, dst.step, src.cols, src.rows, kernel_size,
-            src.depth(), fcvBorder, 0), nStripes);
+        parallel_for_(Range(0, src.rows), FcvGaussianBlurLoop_Invoker(src, dst, kernel_size, fcvBorder, 0), nStripes);
     }
     else
         CV_Error(cv::Error::StsBadArg, cv::format("Src type %d, kernel size %d is not supported", type, kernel_size));
 }
 
-class FcvFilter2DLoop_Invoker : public cv::ParallelLoopBody
+class FcvFilter2DLoop_Invoker : public ParallelLoopBody
 {
     public:
 
-    FcvFilter2DLoop_Invoker(const uchar* _src_data, size_t _src_step, uchar* _dst_data, size_t _dst_step, const int _ddepth,
-        int _width, int _height, uchar* _kernel,int _ksize ) :
-        cv::ParallelLoopBody(), src_data(_src_data), src_step(_src_step), dst_data(_dst_data), dst_step(_dst_step),
-        ddepth(_ddepth), width(_width),height(_height), kernel(_kernel), ksize(_ksize)
+    FcvFilter2DLoop_Invoker(const Mat& _src, Mat& _dst, const Mat& _kernel) :
+        ParallelLoopBody(), src(_src), dst(_dst), kernel(_kernel)
     {
-        half_ksize = ksize/2;
+        width     = src.cols;
+        height    = src.rows;
+        ksize     = kernel.size().width;
+        halfKsize = ksize/2;
     }
 
-    virtual void operator()(const cv::Range& range) const CV_OVERRIDE
+    virtual void operator()(const Range& range) const CV_OVERRIDE
     {
-        int topLines    = 0;
-        int rangeHeight = range.end-range.start;
+        int topLines     = 0;
+        int rangeHeight  = range.end-range.start;
+        int paddedHeight = rangeHeight;
 
-        if(range.start >= half_ksize)
+        if(range.start >= halfKsize)
         {
-            topLines  += half_ksize;
-            rangeHeight += half_ksize;
+            topLines    += halfKsize;
+            paddedHeight += halfKsize;
         }
 
-        if(range.end <= height-half_ksize)
+        if(range.end <= height-halfKsize)
         {
-            rangeHeight += half_ksize;
+            paddedHeight += halfKsize;
         }
 
-        const uchar *src = src_data + (range.start - topLines) * src_step;
-        std::vector<uchar> dst(dst_step*rangeHeight);
-
-        if (ddepth == CV_8U)
-            fcvFilterCorrNxNu8((int8_t*)kernel, ksize, 0, src, width, rangeHeight, src_step, dst.data(), dst_step);
-        else if (ddepth == CV_16S)
-            fcvFilterCorrNxNu8s16((int8_t*)kernel, ksize, 0, src, width, rangeHeight, src_step, (int16_t*)dst.data(), dst_step);
-        else if (ddepth == CV_32F)
-            fcvFilterCorrNxNu8f32((float32_t*)kernel, ksize, src, width, rangeHeight, src_step, (float32_t*)dst.data(), dst_step);
-
-        uchar *dptr = dst_data + range.start * dst_step;
-        uchar *sptr = dst.data() + topLines * dst_step;
-        memcpy(dptr, sptr, (range.end - range.start) * dst_step);
+        const Mat srcPadded = src(Rect(0, range.start - topLines, width, paddedHeight));
+        Mat dstPadded       = Mat(paddedHeight, width, dst.depth());
+
+        if (dst.depth() == CV_8U)
+            fcvFilterCorrNxNu8((int8_t*)kernel.data, ksize, 0, srcPadded.data, width, paddedHeight, srcPadded.step,
+                dstPadded.data, dstPadded.step);
+        else if (dst.depth() == CV_16S)
+            fcvFilterCorrNxNu8s16((int8_t*)kernel.data, ksize, 0, srcPadded.data, width, paddedHeight, srcPadded.step,
+                (int16_t*)dstPadded.data, dstPadded.step);
+        else if (dst.depth() == CV_32F)
+            fcvFilterCorrNxNu8f32((float32_t*)kernel.data, ksize, srcPadded.data, width, paddedHeight, srcPadded.step,
+                (float32_t*)dstPadded.data, dstPadded.step);
+
+        // Only copy center part back to output image and ignore the padded lines
+        Mat temp1 = dstPadded(Rect(0, topLines, width, rangeHeight));
+        Mat temp2 = dst(Rect(0, range.start, width, rangeHeight));
+        temp1.copyTo(temp2);
     }
 
     private:
-    const uchar*    src_data;
-    const size_t    src_step;
-    uchar*          dst_data;
-    const size_t    dst_step;
-    const int       ddepth;
-    const int       width;
-    const int       height;
-    uchar*          kernel;
-    const int       ksize;
-    int             half_ksize;
+    const Mat&  src;
+    Mat&        dst;
+    const Mat&  kernel;
+    int         width;
+    int         height;
+    int         ksize;
+    int         halfKsize;
 
     FcvFilter2DLoop_Invoker(const FcvFilter2DLoop_Invoker &);  // = delete;
     const FcvFilter2DLoop_Invoker& operator= (const FcvFilter2DLoop_Invoker &);  // = delete;
@@ -176,7 +179,8 @@ void filter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernel)
     Mat src = _src.getMat();
     Mat dst = _dst.getMat();
 
-    int nStripes = src.rows / 80 == 0 ? 1 : src.rows / 80;
+    int nThreads = getNumThreads();
+    int nStripes = (nThreads > 1) ? ((src.rows > 60) ? 3 * nThreads : 1) : 1;
 
     switch (ddepth)
     {
@@ -184,19 +188,13 @@ void filter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernel)
         case CV_16S:
         {
             CV_Assert(CV_MAT_DEPTH(kernel.type()) == CV_8S);
-
-            cv::parallel_for_(cv::Range(0, src.rows),
-            FcvFilter2DLoop_Invoker(src.data, src.step, dst.data, dst.step, ddepth, src.cols, src.rows, kernel.data, ksize.width),
-            nStripes);
+            parallel_for_(Range(0, src.rows), FcvFilter2DLoop_Invoker(src, dst, kernel), nStripes);
             break;
         }
         case CV_32F:
         {
             CV_Assert(CV_MAT_DEPTH(kernel.type()) == CV_32F);
-
-            cv::parallel_for_(cv::Range(0, src.rows),
-            FcvFilter2DLoop_Invoker(src.data, src.step, dst.data, dst.step, ddepth, src.cols, src.rows, kernel.data, ksize.width),
-            nStripes);
+            parallel_for_(Range(0, src.rows), FcvFilter2DLoop_Invoker(src, dst, kernel), nStripes);
             break;
         }
         default:
@@ -208,85 +206,88 @@ void filter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernel)
     }
 }
 
-class FcvSepFilter2DLoop_Invoker : public cv::ParallelLoopBody
+class FcvSepFilter2DLoop_Invoker : public ParallelLoopBody
 {
     public:
 
-    FcvSepFilter2DLoop_Invoker(const uchar* _src_data, size_t _src_step, uchar* _dst_data, size_t _dst_step, const int _ddepth,
-        int _width, int _height, uchar* _kernelX, int _kernelXSize, uchar* _kernelY,int _kernelYSize) :
-        cv::ParallelLoopBody(), src_data(_src_data), src_step(_src_step), dst_data(_dst_data), dst_step(_dst_step), ddepth(_ddepth),
-        width(_width), height(_height), kernelX(_kernelX), kernelXSize(_kernelXSize), kernelY(_kernelY), kernelYSize(_kernelYSize)
+    FcvSepFilter2DLoop_Invoker(const Mat& _src, Mat& _dst, const Mat& _kernelX, const Mat& _kernelY) :
+        ParallelLoopBody(), src(_src), dst(_dst), kernelX(_kernelX), kernelY(_kernelY)
     {
-        half_ksize = kernelYSize/2;
+        width       = src.cols;
+        height      = src.rows;
+        kernelXSize = kernelX.size().width;
+        kernelYSize = kernelY.size().width;
+        halfKsize   = kernelXSize/2;
     }
 
-    virtual void operator()(const cv::Range& range) const CV_OVERRIDE
+    virtual void operator()(const Range& range) const CV_OVERRIDE
     {
-        int topLines    = 0;
-        int rangeHeight = range.end-range.start;
+        int topLines     = 0;
+        int rangeHeight  = range.end-range.start;
+        int paddedHeight = rangeHeight;
 
-        if(range.start >= half_ksize)
+        if(range.start >= halfKsize)
         {
-            topLines  += half_ksize;
-            rangeHeight += half_ksize;
+            topLines     += halfKsize;
+            paddedHeight += halfKsize;
         }
 
-        if(range.end <= height-half_ksize)
+        if(range.end <= height-halfKsize)
         {
-            rangeHeight += half_ksize;
+            paddedHeight += halfKsize;
         }
 
-        const uchar *src = src_data + (range.start - topLines) * src_step;
-        std::vector<uchar> dst(dst_step*rangeHeight);
+        const Mat srcPadded = src(Rect(0, range.start - topLines, width, paddedHeight));
+        Mat dstPadded       = Mat(paddedHeight, width, dst.depth());
 
-        switch (ddepth)
+        switch (dst.depth())
         {
             case CV_8U:
             {
-                fcvFilterCorrSepMxNu8((int8_t*)kernelX, kernelXSize, (int8_t*)kernelY, kernelYSize, 0, src, width, rangeHeight,
-                    src_step, dst.data(), dst_step);
+                fcvFilterCorrSepMxNu8((int8_t*)kernelX.data, kernelXSize, (int8_t*)kernelY.data, kernelYSize, 0, srcPadded.data,
+                    width, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step);
                 break;
             }
             case CV_16S:
             {
-                std::vector<int16_t> tmpImage(width*(rangeHeight+kernelXSize-1));
+                std::vector<int16_t> tmpImage(width * (paddedHeight + kernelXSize - 1));
                 switch (kernelXSize)
                 {
                     case 9:
                     {
-                        fcvFilterCorrSep9x9s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
-                            tmpImage.data(), (int16_t*)dst.data(), dst_step);
+                        fcvFilterCorrSep9x9s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight,
+                            srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step);
                         break;
                     }
                     case 11:
                     {
-                        fcvFilterCorrSep11x11s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
-                            tmpImage.data(), (int16_t*)dst.data(), dst_step);
+                        fcvFilterCorrSep11x11s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight,
+                            srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step);
                         break;
                     }
                     case 13:
                     {
-                        fcvFilterCorrSep13x13s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
-                            tmpImage.data(), (int16_t*)dst.data(), dst_step);
+                        fcvFilterCorrSep13x13s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight,
+                            srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step);
                         break;
                     }
                     case 15:
                     {
-                        fcvFilterCorrSep15x15s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
-                            tmpImage.data(), (int16_t*)dst.data(), dst_step);
+                        fcvFilterCorrSep15x15s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight,
+                            srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step);
                         break;
                     }
                     case 17:
                     {
-                        fcvFilterCorrSep17x17s16_v2((int16_t*)kernelX, (int16_t*)src, width, rangeHeight, src_step,
-                            tmpImage.data(), (int16_t*)dst.data(), dst_step);
+                        fcvFilterCorrSep17x17s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight,
+                            srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step);
                         break;
                     }
 
                     default:
                     {
-                        fcvFilterCorrSepNxNs16((int16_t*)kernelX, kernelXSize, (int16_t*)src, width, rangeHeight, src_step,
-                            tmpImage.data(), (int16_t*)dst.data(), dst_step);
+                        fcvFilterCorrSepNxNs16((int16_t*)kernelX.data, kernelXSize, (int16_t*)srcPadded.data, width, paddedHeight,
+                            srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step);
                         break;
                     }
                 }
@@ -294,29 +295,27 @@ class FcvSepFilter2DLoop_Invoker : public cv::ParallelLoopBody
             }
             default:
             {
-                CV_Error(cv::Error::StsBadArg, cv::format("Dst type:%s is not supported", depthToString(ddepth)));
+                CV_Error(cv::Error::StsBadArg, cv::format("Dst type:%s is not supported", depthToString(dst.depth())));
                 break;
             }
         }
 
-        uchar *dptr = dst_data + range.start * dst_step;
-        uchar *sptr = dst.data() + topLines * dst_step;
-        memcpy(dptr, sptr, (range.end - range.start) * dst_step);
+        // Only copy center part back to output image and ignore the padded lines
+        Mat temp1 = dstPadded(Rect(0, topLines, width, rangeHeight));
+        Mat temp2 = dst(Rect(0, range.start, width, rangeHeight));
+        temp1.copyTo(temp2);
     }
 
     private:
-    const uchar*    src_data;
-    const size_t    src_step;
-    uchar*          dst_data;
-    const size_t    dst_step;
-    const int       ddepth;
-    const int       width;
-    const int       height;
-    uchar*          kernelX;
-    const int       kernelXSize;
-    uchar*          kernelY;
-    const int       kernelYSize;
-    int             half_ksize;
+    const Mat&  src;
+    Mat&        dst;
+    int         width;
+    int         height;
+    const Mat&  kernelX;
+    const Mat&  kernelY;
+    int         kernelXSize;
+    int         kernelYSize;
+    int         halfKsize;
 
     FcvSepFilter2DLoop_Invoker(const FcvSepFilter2DLoop_Invoker &);  // = delete;
     const FcvSepFilter2DLoop_Invoker& operator= (const FcvSepFilter2DLoop_Invoker &);  // = delete;
@@ -332,14 +331,14 @@ void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kern
     Mat kernelX = _kernelX.getMat();
     Mat kernelY = _kernelY.getMat();
 
-    int nStripes = src.rows / 80 == 0 ? 1 : src.rows / 80;
+    int nThreads = getNumThreads();
+    int nStripes = (nThreads > 1 && src.rows > 60) ? 3 * nThreads : 1;
+
     switch (ddepth)
     {
         case CV_8U:
         {
-            cv::parallel_for_(cv::Range(0, src.rows),
-            FcvSepFilter2DLoop_Invoker(src.data, src.step, dst.data, dst.step, ddepth, src.cols, src.rows, kernelX.data,
-                kernelX.size().width, kernelY.data, kernelY.size().width),nStripes);
+            cv::parallel_for_(cv::Range(0, src.rows), FcvSepFilter2DLoop_Invoker(src, dst, kernelX, kernelY), nStripes);
             break;
         }
         case CV_16S:
@@ -351,9 +350,7 @@ void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kern
             absdiff(kernelX, kernelY, diff);
             CV_Assert(countNonZero(diff) == 0);
 
-            cv::parallel_for_(cv::Range(0, src.rows),
-            FcvSepFilter2DLoop_Invoker(src.data, src.step, dst.data, dst.step, ddepth, src.cols, src.rows, kernelX.data,
-                kernelX.size().width, kernelY.data, kernelY.size().width),nStripes);
+            cv::parallel_for_(cv::Range(0, src.rows), FcvSepFilter2DLoop_Invoker(src, dst, kernelX, kernelY), nStripes);
             break;
         }
         default:

From a7b9959558cc631b0c4649abcdedec5088e8312e Mon Sep 17 00:00:00 2001
From: xue zhang <xuezha@qti.qualcomm.com>
Date: Thu, 19 Dec 2024 14:46:53 +0530
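Subject: [PATCH 10/11] Fix Python binding issue

Rename cv::fastcv::MSER to cv::fastcv::FCVMSER, replace `unsigned int` with
`uint32_t` in the MSER sources and tests, pyramid.cpp and remap.cpp, and drop
CV_WRAP from the detect() overload that outputs ContourData.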
---
 .../fastcv/include/opencv2/fastcv/mser.hpp    | 28 +++++-----
 modules/fastcv/perf/perf_mser.cpp             | 12 ++---
 modules/fastcv/src/mser.cpp                   | 54 +++++++++----------
 modules/fastcv/src/pyramid.cpp                |  2 +-
 modules/fastcv/src/remap.cpp                  | 18 +++----
 modules/fastcv/test/test_mser.cpp             | 12 ++---
 6 files changed, 63 insertions(+), 63 deletions(-)

diff --git a/modules/fastcv/include/opencv2/fastcv/mser.hpp b/modules/fastcv/include/opencv2/fastcv/mser.hpp
index 08b751fe81d..bfa898544f5 100644
--- a/modules/fastcv/include/opencv2/fastcv/mser.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/mser.hpp
@@ -18,7 +18,7 @@ namespace fastcv {
  * @brief MSER blob detector for grayscale images
  *
  */
-class CV_EXPORTS_W MSER
+class CV_EXPORTS_W FCVMSER
 {
 public:
 
@@ -52,13 +52,13 @@ class CV_EXPORTS_W MSER
                            Typical value range [0.1 1.0], typical value 0.2
      * @return Feature detector object ready for detection
      */
-    CV_WRAP static Ptr<MSER> create(cv::Size     imgSize,
-                                    unsigned int numNeighbors = 4,
-                                    unsigned int delta = 2,
-                                    unsigned int minArea = 30,
-                                    unsigned int maxArea = 14400,
-                                    float        maxVariation = 0.15f,
-                                    float        minDiversity = 0.2f);
+    CV_WRAP static Ptr<FCVMSER> create( cv::Size     imgSize,
+                                        uint32_t numNeighbors = 4,
+                                        uint32_t delta = 2,
+                                        uint32_t minArea = 30,
+                                        uint32_t maxArea = 14400,
+                                        float        maxVariation = 0.15f,
+                                        float        minDiversity = 0.2f);
 
     /**
      * @brief This is an overload for detect() function
@@ -94,18 +94,18 @@ class CV_EXPORTS_W MSER
     * @param boundingBoxes Array containing bounding boxes of found contours
     * @param contourData Array containing additional information about found contours
     */
-    CV_WRAP virtual void detect(InputArray src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes,
+    virtual void detect(InputArray src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes,
                                 std::vector<ContourData>& contourData) = 0;
 
     CV_WRAP virtual cv::Size     getImgSize()      = 0;
-    CV_WRAP virtual unsigned int getNumNeighbors() = 0;
-    CV_WRAP virtual unsigned int getDelta()        = 0;
-    CV_WRAP virtual unsigned int getMinArea()      = 0;
-    CV_WRAP virtual unsigned int getMaxArea()      = 0;
+    CV_WRAP virtual uint32_t getNumNeighbors() = 0;
+    CV_WRAP virtual uint32_t getDelta()        = 0;
+    CV_WRAP virtual uint32_t getMinArea()      = 0;
+    CV_WRAP virtual uint32_t getMaxArea()      = 0;
     CV_WRAP virtual float        getMaxVariation() = 0;
     CV_WRAP virtual float        getMinDiversity() = 0;
 
-    virtual ~MSER() {}
+    virtual ~FCVMSER() {}
 };
 
 //! @}
diff --git a/modules/fastcv/perf/perf_mser.cpp b/modules/fastcv/perf/perf_mser.cpp
index 11787f4c99e..7232cd47cb4 100644
--- a/modules/fastcv/perf/perf_mser.cpp
+++ b/modules/fastcv/perf/perf_mser.cpp
@@ -30,21 +30,21 @@ PERF_TEST_P(MSERPerfTest, run,
 
     cv::Mat src = imread(cvtest::findDataFile(imgPath), cv::IMREAD_GRAYSCALE);
 
-    unsigned int delta = 2;
-    unsigned int minArea = 256;
-    unsigned int maxArea = (int)src.total()/4;
+    uint32_t delta = 2;
+    uint32_t minArea = 256;
+    uint32_t maxArea = (int)src.total()/4;
     float        maxVariation = 0.15f;
     float        minDiversity = 0.2f;
 
-    cv::Ptr<cv::fastcv::MSER> mser;
-    mser = cv::fastcv::MSER::create(src.size(), numNeighbors, delta, minArea, maxArea,
+    cv::Ptr<cv::fastcv::FCVMSER> mser;
+    mser = cv::fastcv::FCVMSER::create(src.size(), numNeighbors, delta, minArea, maxArea,
                                     maxVariation, minDiversity);
 
     while(next())
     {
         std::vector<std::vector<Point>> contours;
         std::vector<cv::Rect> bboxes;
-        std::vector<cv::fastcv::MSER::ContourData> contourData;
+        std::vector<cv::fastcv::FCVMSER::ContourData> contourData;
 
         startTimer();
         if (useBboxes)
diff --git a/modules/fastcv/src/mser.cpp b/modules/fastcv/src/mser.cpp
index a564e007a12..6919099a482 100644
--- a/modules/fastcv/src/mser.cpp
+++ b/modules/fastcv/src/mser.cpp
@@ -8,24 +8,24 @@
 namespace cv {
 namespace fastcv {
 
-class MSER_Impl CV_FINAL : public cv::fastcv::MSER
+class MSER_Impl CV_FINAL : public cv::fastcv::FCVMSER
 {
 public:
     explicit MSER_Impl(cv::Size     imgSize,
-                       unsigned int numNeighbors,
-                       unsigned int delta,
-                       unsigned int minArea,
-                       unsigned int maxArea,
+                       uint32_t numNeighbors,
+                       uint32_t delta,
+                       uint32_t minArea,
+                       uint32_t maxArea,
                        float        maxVariation,
                        float        minDiversity);
 
     ~MSER_Impl() CV_OVERRIDE;
 
     cv::Size     getImgSize()      CV_OVERRIDE { return imgSize;      };
-    unsigned int getNumNeighbors() CV_OVERRIDE { return numNeighbors; };
-    unsigned int getDelta()        CV_OVERRIDE { return delta;        };
-    unsigned int getMinArea()      CV_OVERRIDE { return minArea;      };
-    unsigned int getMaxArea()      CV_OVERRIDE { return maxArea;      };
+    uint32_t getNumNeighbors() CV_OVERRIDE { return numNeighbors; };
+    uint32_t getDelta()        CV_OVERRIDE { return delta;        };
+    uint32_t getMinArea()      CV_OVERRIDE { return minArea;      };
+    uint32_t getMaxArea()      CV_OVERRIDE { return maxArea;      };
     float        getMaxVariation() CV_OVERRIDE { return maxVariation; };
     float        getMinDiversity() CV_OVERRIDE { return minDiversity; };
 
@@ -42,10 +42,10 @@ class MSER_Impl CV_FINAL : public cv::fastcv::MSER
                        bool useContourData = true);
 
     cv::Size imgSize;
-    unsigned int numNeighbors;
-    unsigned int delta;
-    unsigned int minArea;
-    unsigned int maxArea;
+    uint32_t numNeighbors;
+    uint32_t delta;
+    uint32_t minArea;
+    uint32_t maxArea;
     float        maxVariation;
     float        minDiversity;
 
@@ -54,10 +54,10 @@ class MSER_Impl CV_FINAL : public cv::fastcv::MSER
 
 
 MSER_Impl::MSER_Impl(cv::Size     _imgSize,
-                     unsigned int _numNeighbors,
-                     unsigned int _delta,
-                     unsigned int _minArea,
-                     unsigned int _maxArea,
+                     uint32_t _numNeighbors,
+                     uint32_t _delta,
+                     uint32_t _minArea,
+                     uint32_t _maxArea,
                      float        _maxVariation,
                      float        _minDiversity)
 {
@@ -103,14 +103,14 @@ void MSER_Impl::detectRegions(InputArray _src, std::vector<std::vector<Point>>&
     bool usePointsArray = (this->numNeighbors == 8);
 
     //bufSize for pts and bboxes
-    const unsigned int maxContours = 16384;
-    unsigned int numContours;
+    const uint32_t maxContours = 16384;
+    uint32_t numContours;
     std::vector<uint32_t> numPointsInContour(maxContours);
 
     std::vector<uint16_t> rectArray;
     rectArray.resize(4 * maxContours); // xMin, xMax, yMax, yMin
 
-    unsigned int pointsArraySize = src.total() * 30; // Recommended typical size
+    uint32_t pointsArraySize = src.total() * 30; // Recommended typical size
     std::vector<uint16_t> pointsArray;
     std::vector<uint32_t> contourStartingPoints;
     uint32_t pathArraySize = src.total() * 4; // Recommended size
@@ -244,13 +244,13 @@ void MSER_Impl::detect(InputArray src, std::vector<std::vector<Point>>& contours
     this->detectRegions(src, contours, boundingBoxes, contourData, /*useBoundingBoxes*/ true, /*useContourData*/ true);
 }
 
-Ptr<MSER> MSER::create(cv::Size     imgSize,
-                       unsigned int numNeighbors,
-                       unsigned int delta,
-                       unsigned int minArea,
-                       unsigned int maxArea,
-                       float        maxVariation,
-                       float        minDiversity)
+Ptr<FCVMSER> FCVMSER::create(cv::Size     imgSize,
+                             uint32_t numNeighbors,
+                             uint32_t delta,
+                             uint32_t minArea,
+                             uint32_t maxArea,
+                             float        maxVariation,
+                             float        minDiversity)
 {
     return makePtr<MSER_Impl>(imgSize, numNeighbors, delta, minArea, maxArea, maxVariation, minDiversity);
 }
diff --git a/modules/fastcv/src/pyramid.cpp b/modules/fastcv/src/pyramid.cpp
index 79a59086e51..806c8e9970f 100644
--- a/modules/fastcv/src/pyramid.cpp
+++ b/modules/fastcv/src/pyramid.cpp
@@ -169,7 +169,7 @@ void buildPyramid(InputArray _src, OutputArrayOfArrays _pyr, int nLevels, bool s
     _pyr.create(nLevels, 1, src.type(), -1);
     for (int i = 0; i < nLevels; i++)
     {
-        cv::Mat m = cv::Mat((unsigned int)lpyrSrc2[i].height, (unsigned int)lpyrSrc2[i].width,
+        cv::Mat m = cv::Mat((int)lpyrSrc2[i].height, (int)lpyrSrc2[i].width,
                              src.type(), (void*)lpyrSrc2[i].ptr, (size_t)lpyrSrc2[i].stride);
 
         _pyr.create(m.size(), m.type(), i);
diff --git a/modules/fastcv/src/remap.cpp b/modules/fastcv/src/remap.cpp
index 0c86d65c97e..933bfdc4273 100644
--- a/modules/fastcv/src/remap.cpp
+++ b/modules/fastcv/src/remap.cpp
@@ -10,8 +10,8 @@ namespace fastcv {
 
 class RemapParallel : public cv::ParallelLoopBody {
 public:
-    RemapParallel(int src_type, const uint8_t* src, unsigned int srcWidth, unsigned int srcHeight, unsigned int srcStride, uint8_t* dst,
-                unsigned int dstWidth, unsigned int dstHeight, unsigned int dstStride, const float32_t* __restrict  mapX,
+    RemapParallel(int src_type, const uint8_t* src, uint32_t srcWidth, uint32_t srcHeight, uint32_t srcStride, uint8_t* dst,
+                uint32_t dstWidth, uint32_t dstHeight, uint32_t dstStride, const float32_t* __restrict  mapX,
                 const float32_t* __restrict mapY, uint32_t mapStride, fcvInterpolationType interpolation, uint8_t borderValue)
                 : src_type_(src_type), src_(src), srcWidth_(srcWidth), srcHeight_(srcHeight), srcStride_(srcStride), dst_(dst), dstWidth_(dstWidth),
                 dstHeight_(dstHeight), dstStride_(dstStride), mapX_(mapX), mapY_(mapY), mapStride_(mapStride),
@@ -51,16 +51,16 @@ class RemapParallel : public cv::ParallelLoopBody {
 private:
     int src_type_;
     const uint8_t* src_;
-    unsigned int srcWidth_;
-    unsigned int srcHeight_;
-    unsigned int srcStride_;
+    uint32_t srcWidth_;
+    uint32_t srcHeight_;
+    uint32_t srcStride_;
     uint8_t* dst_;
-    unsigned int dstWidth_;
-    unsigned int dstHeight_;
-    unsigned int dstStride_;
+    uint32_t dstWidth_;
+    uint32_t dstHeight_;
+    uint32_t dstStride_;
     const float32_t* __restrict mapX_;
     const float32_t* __restrict mapY_;
-    unsigned int mapStride_;
+    uint32_t mapStride_;
     fcvInterpolationType fcvInterpolation_;
     uint8_t borderValue_;
 };
diff --git a/modules/fastcv/test/test_mser.cpp b/modules/fastcv/test/test_mser.cpp
index 6f2bf78c4cd..29cae5808a7 100644
--- a/modules/fastcv/test/test_mser.cpp
+++ b/modules/fastcv/test/test_mser.cpp
@@ -23,17 +23,17 @@ TEST_P(MSERTest, accuracy)
 
     cv::Mat src = imread(cvtest::findDataFile(imgPath), cv::IMREAD_GRAYSCALE);
 
-    unsigned int delta = 2;
-    unsigned int minArea = 256;
-    unsigned int maxArea = (int)src.total()/4;
+    uint32_t delta = 2;
+    uint32_t minArea = 256;
+    uint32_t maxArea = (int)src.total()/4;
     float        maxVariation = 0.15f;
     float        minDiversity = 0.2f;
 
     std::vector<std::vector<Point>> contours;
     std::vector<cv::Rect> bboxes;
-    std::vector<cv::fastcv::MSER::ContourData> contourData;
-    cv::Ptr<cv::fastcv::MSER> mser;
-    mser = cv::fastcv::MSER::create(src.size(), numNeighbors, delta, minArea, maxArea,
+    std::vector<cv::fastcv::FCVMSER::ContourData> contourData;
+    cv::Ptr<cv::fastcv::FCVMSER> mser;
+    mser = cv::fastcv::FCVMSER::create(src.size(), numNeighbors, delta, minArea, maxArea,
                                     maxVariation, minDiversity);
     if (useBboxes)
     {

From 453b55a37b5ca54ccad07411ef8c57f3fb549ded Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@opencv.ai>
Date: Fri, 20 Dec 2024 14:51:04 +0300
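Subject: [PATCH 11/11] Fixed Java and Python bindings generation.

Switch the FCVMSER public API from uint32_t to int, pass imgSize by const
reference, validate the now-signed arguments in FCVMSER::create(), and add
the missing trailing newlines to mser.cpp and test_mser.cpp.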
---
 .../fastcv/include/opencv2/fastcv/mser.hpp    | 30 ++++----
 modules/fastcv/perf/perf_mser.cpp             |  2 +-
 modules/fastcv/src/mser.cpp                   | 69 ++++++++++---------
 modules/fastcv/test/test_mser.cpp             |  2 +-
 4 files changed, 52 insertions(+), 51 deletions(-)

diff --git a/modules/fastcv/include/opencv2/fastcv/mser.hpp b/modules/fastcv/include/opencv2/fastcv/mser.hpp
index bfa898544f5..249c0e14e2b 100644
--- a/modules/fastcv/include/opencv2/fastcv/mser.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/mser.hpp
@@ -52,13 +52,13 @@ class CV_EXPORTS_W FCVMSER
                            Typical value range [0.1 1.0], typical value 0.2
      * @return Feature detector object ready for detection
      */
-    CV_WRAP static Ptr<FCVMSER> create( cv::Size     imgSize,
-                                        uint32_t numNeighbors = 4,
-                                        uint32_t delta = 2,
-                                        uint32_t minArea = 30,
-                                        uint32_t maxArea = 14400,
-                                        float        maxVariation = 0.15f,
-                                        float        minDiversity = 0.2f);
+    CV_WRAP static Ptr<FCVMSER> create( const cv::Size& imgSize,
+                                        int numNeighbors = 4,
+                                        int delta = 2,
+                                        int minArea = 30,
+                                        int maxArea = 14400,
+                                        float maxVariation = 0.15f,
+                                        float minDiversity = 0.2f);
 
     /**
      * @brief This is an overload for detect() function
@@ -95,15 +95,15 @@ class CV_EXPORTS_W FCVMSER
     * @param contourData Array containing additional information about found contours
     */
     virtual void detect(InputArray src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes,
-                                std::vector<ContourData>& contourData) = 0;
+                        std::vector<ContourData>& contourData) = 0;
 
-    CV_WRAP virtual cv::Size     getImgSize()      = 0;
-    CV_WRAP virtual uint32_t getNumNeighbors() = 0;
-    CV_WRAP virtual uint32_t getDelta()        = 0;
-    CV_WRAP virtual uint32_t getMinArea()      = 0;
-    CV_WRAP virtual uint32_t getMaxArea()      = 0;
-    CV_WRAP virtual float        getMaxVariation() = 0;
-    CV_WRAP virtual float        getMinDiversity() = 0;
+    CV_WRAP virtual cv::Size getImgSize() = 0;
+    CV_WRAP virtual int getNumNeighbors() = 0;
+    CV_WRAP virtual int getDelta()        = 0;
+    CV_WRAP virtual int getMinArea()      = 0;
+    CV_WRAP virtual int getMaxArea()      = 0;
+    CV_WRAP virtual float getMaxVariation() = 0;
+    CV_WRAP virtual float getMinDiversity() = 0;
 
     virtual ~FCVMSER() {}
 };
diff --git a/modules/fastcv/perf/perf_mser.cpp b/modules/fastcv/perf/perf_mser.cpp
index 7232cd47cb4..36f876cd045 100644
--- a/modules/fastcv/perf/perf_mser.cpp
+++ b/modules/fastcv/perf/perf_mser.cpp
@@ -38,7 +38,7 @@ PERF_TEST_P(MSERPerfTest, run,
 
     cv::Ptr<cv::fastcv::FCVMSER> mser;
     mser = cv::fastcv::FCVMSER::create(src.size(), numNeighbors, delta, minArea, maxArea,
-                                    maxVariation, minDiversity);
+                                       maxVariation, minDiversity);
 
     while(next())
     {
diff --git a/modules/fastcv/src/mser.cpp b/modules/fastcv/src/mser.cpp
index 6919099a482..a44cecae073 100644
--- a/modules/fastcv/src/mser.cpp
+++ b/modules/fastcv/src/mser.cpp
@@ -12,22 +12,22 @@ class MSER_Impl CV_FINAL : public cv::fastcv::FCVMSER
 {
 public:
     explicit MSER_Impl(cv::Size     imgSize,
-                       uint32_t numNeighbors,
-                       uint32_t delta,
-                       uint32_t minArea,
-                       uint32_t maxArea,
-                       float        maxVariation,
-                       float        minDiversity);
+                       int numNeighbors,
+                       int delta,
+                       int minArea,
+                       int maxArea,
+                       float maxVariation,
+                       float minDiversity);
 
     ~MSER_Impl() CV_OVERRIDE;
 
-    cv::Size     getImgSize()      CV_OVERRIDE { return imgSize;      };
-    uint32_t getNumNeighbors() CV_OVERRIDE { return numNeighbors; };
-    uint32_t getDelta()        CV_OVERRIDE { return delta;        };
-    uint32_t getMinArea()      CV_OVERRIDE { return minArea;      };
-    uint32_t getMaxArea()      CV_OVERRIDE { return maxArea;      };
-    float        getMaxVariation() CV_OVERRIDE { return maxVariation; };
-    float        getMinDiversity() CV_OVERRIDE { return minDiversity; };
+    cv::Size getImgSize()      CV_OVERRIDE { return imgSize; }
+    int      getNumNeighbors() CV_OVERRIDE { return numNeighbors; }
+    int      getDelta()        CV_OVERRIDE { return delta; }
+    int      getMinArea()      CV_OVERRIDE { return minArea; }
+    int      getMaxArea()      CV_OVERRIDE { return maxArea; }
+    float    getMaxVariation() CV_OVERRIDE { return maxVariation; }
+    float    getMinDiversity() CV_OVERRIDE { return minDiversity; }
 
     void detect(InputArray src, std::vector<std::vector<Point>>& contours) CV_OVERRIDE;
     void detect(InputArray src, std::vector<std::vector<Point>>& contours, std::vector<cv::Rect>& boundingBoxes) CV_OVERRIDE;
@@ -42,24 +42,24 @@ class MSER_Impl CV_FINAL : public cv::fastcv::FCVMSER
                        bool useContourData = true);
 
     cv::Size imgSize;
-    uint32_t numNeighbors;
-    uint32_t delta;
-    uint32_t minArea;
-    uint32_t maxArea;
-    float        maxVariation;
-    float        minDiversity;
+    int numNeighbors;
+    int delta;
+    int minArea;
+    int maxArea;
+    float maxVariation;
+    float minDiversity;
 
     void *mserHandle;
 };
 
 
-MSER_Impl::MSER_Impl(cv::Size     _imgSize,
-                     uint32_t _numNeighbors,
-                     uint32_t _delta,
-                     uint32_t _minArea,
-                     uint32_t _maxArea,
-                     float        _maxVariation,
-                     float        _minDiversity)
+MSER_Impl::MSER_Impl(cv::Size _imgSize,
+                     int _numNeighbors,
+                     int _delta,
+                     int _minArea,
+                     int _maxArea,
+                     float _maxVariation,
+                     float _minDiversity)
 {
     CV_Assert(_imgSize.width > 50);
     CV_Assert(_imgSize.height > 5);
@@ -244,16 +244,17 @@ void MSER_Impl::detect(InputArray src, std::vector<std::vector<Point>>& contours
     this->detectRegions(src, contours, boundingBoxes, contourData, /*useBoundingBoxes*/ true, /*useContourData*/ true);
 }
 
-Ptr<FCVMSER> FCVMSER::create(cv::Size     imgSize,
-                             uint32_t numNeighbors,
-                             uint32_t delta,
-                             uint32_t minArea,
-                             uint32_t maxArea,
-                             float        maxVariation,
-                             float        minDiversity)
+Ptr<FCVMSER> FCVMSER::create(const cv::Size& imgSize,
+                             int numNeighbors,
+                             int delta,
+                             int minArea,
+                             int maxArea,
+                             float maxVariation,
+                             float minDiversity)
 {
+    CV_Assert(numNeighbors > 0 && delta >= 0 && minArea >= 0 && maxArea >= 0); // parameters are signed int now (were uint32_t): reject negative values up front
     return makePtr<MSER_Impl>(imgSize, numNeighbors, delta, minArea, maxArea, maxVariation, minDiversity);
 }
 
 } // fastcv::
-} // cv::
\ No newline at end of file
+} // cv::
diff --git a/modules/fastcv/test/test_mser.cpp b/modules/fastcv/test/test_mser.cpp
index 29cae5808a7..d3cb35bf47e 100644
--- a/modules/fastcv/test/test_mser.cpp
+++ b/modules/fastcv/test/test_mser.cpp
@@ -175,4 +175,4 @@ INSTANTIATE_TEST_CASE_P(FastCV_Extension, MSERTest,
                        ::testing::Values("cv/shared/baboon.png", "cv/mser/puzzle.png")
                       )
     );
-}} // namespaces opencv_test, ::
\ No newline at end of file
+}} // namespaces opencv_test, ::