diff --git a/modules/fastcv/README.md b/modules/fastcv/README.md index 0c7323c086c..076a4108de0 100644 --- a/modules/fastcv/README.md +++ b/modules/fastcv/README.md @@ -3,5 +3,4 @@ FastCV extension for OpenCV This module provides wrappers for several FastCV functions not covered by the corresponding HAL in OpenCV or have implementation incompatible with OpenCV. Please note that: -1. This module supports ARM architecture only. This means that CMake script aborts configuration under x86 platform even if you don't want to build binaries for your machine and just want to build docs or enable code analysis in your IDE. In that case you should fix CMakeLists.txt file as told inside it. -2. Test data is stored in misc folder. Before running tests on a device you should copy the content of `misc/` folder to `$YOUR_TESTDATA_PATH/fastcv/` folder on a device. +1. This module supports ARM architecture only. This means that CMake script will not configure or build under x86 platform. \ No newline at end of file diff --git a/modules/fastcv/include/opencv2/fastcv.hpp b/modules/fastcv/include/opencv2/fastcv.hpp index fcf0bf132fb..6ed8eba4a33 100644 --- a/modules/fastcv/include/opencv2/fastcv.hpp +++ b/modules/fastcv/include/opencv2/fastcv.hpp @@ -10,18 +10,24 @@ #include "opencv2/fastcv/arithm.hpp" #include "opencv2/fastcv/bilateralFilter.hpp" +#include "opencv2/fastcv/blur.hpp" #include "opencv2/fastcv/cluster.hpp" #include "opencv2/fastcv/draw.hpp" +#include "opencv2/fastcv/edges.hpp" #include "opencv2/fastcv/fast10.hpp" #include "opencv2/fastcv/fft.hpp" #include "opencv2/fastcv/hough.hpp" +#include "opencv2/fastcv/ipptransform.hpp" #include "opencv2/fastcv/moments.hpp" #include "opencv2/fastcv/mser.hpp" +#include "opencv2/fastcv/pyramid.hpp" #include "opencv2/fastcv/remap.hpp" #include "opencv2/fastcv/scale.hpp" #include "opencv2/fastcv/shift.hpp" #include "opencv2/fastcv/smooth.hpp" #include "opencv2/fastcv/thresh.hpp" +#include "opencv2/fastcv/tracking.hpp" +#include "opencv2/fastcv/warp.hpp" /** * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions @@ -29,4 +35,4 @@ * @} */ -#endif // OPENCV_FASTCV_ARITHM_HPP +#endif // OPENCV_FASTCV_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/arithm.hpp b/modules/fastcv/include/opencv2/fastcv/arithm.hpp index e479d970b1d..5a0c43b2408 100644 --- a/modules/fastcv/include/opencv2/fastcv/arithm.hpp +++ b/modules/fastcv/include/opencv2/fastcv/arithm.hpp @@ -16,7 +16,8 @@ namespace fastcv { /** * @brief Matrix multiplication of two int8_t type matrices - + * uses signed integer input/output whereas cv::gemm uses floating point input/output + * matmuls8s32 provides enhanced speed on Qualcomm's processors * @param src1 First source matrix of type CV_8S * @param src2 Second source matrix of type CV_8S * @param dst Resulting matrix of type CV_32S diff --git a/modules/fastcv/include/opencv2/fastcv/blur.hpp b/modules/fastcv/include/opencv2/fastcv/blur.hpp new file mode 100644 index 00000000000..99d1cd3d655 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/blur.hpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_BLUR_HPP +#define OPENCV_FASTCV_BLUR_HPP + +#include + +namespace cv { +namespace fastcv { + +/** + * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions + */ + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Gaussian blur with sigma = 0 and square kernel size. 
The way of handling borders is different with cv::GaussianBlur, + * leading to slight variations in the output. + * @param _src Intput image with type CV_8UC1 + * @param _dst Output image with type CV_8UC1 + * @param kernel_size Filer kernel size. One of 3, 5, 11 + * @param blur_border If set to true, border is blurred by 0-padding adjacent values.(A variant of the constant border) + * If set to false, borders up to half-kernel width are ignored (e.g. 1 pixel in the 3x3 case). + * + * @sa GaussianBlur + */ +CV_EXPORTS_W void gaussianBlur(InputArray _src, OutputArray _dst, int kernel_size = 3, bool blur_border = true); + +/** + * @brief NxN correlation with non-separable kernel. Borders up to half-kernel width are ignored + * @param _src Intput image with type CV_8UC1 + * @param _dst Output image with type CV_8UC1, CV_16SC1 or CV_32FC1 + * @param ddepth The depth of output image + * @param _kernel Filer kernel data + * + * @sa Filter2D + */ +CV_EXPORTS_W void filter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernel); + +/** + * @brief NxN correlation with separable kernel. If srcImg and dstImg point to the same address and srcStride equals to dstStride, + * it will do in-place. Borders up to half-kernel width are ignored. + * The way of handling overflow is different with OpenCV, this function will do right shift for + * the intermediate results and final result. + * @param _src Intput image with type CV_8UC1 + * @param _dst Output image with type CV_8UC1, CV_16SC1 + * @param ddepth The depth of output image + * @param _kernelX Filer kernel data in x direction + * @param _kernelY Filer kernel data in Y direction (For CV_16SC1, the kernelX and kernelY should be same) + * + * @sa sepFilter2D + */ +CV_EXPORTS_W void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY); +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_BLUR_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/cluster.hpp b/modules/fastcv/include/opencv2/fastcv/cluster.hpp index f90deeae465..46ac7ad103d 100644 --- a/modules/fastcv/include/opencv2/fastcv/cluster.hpp +++ b/modules/fastcv/include/opencv2/fastcv/cluster.hpp @@ -16,7 +16,8 @@ namespace fastcv { /** * @brief Clusterizes N input points in D-dimensional space into K clusters - * + * Accepts 8-bit unsigned integer points + * Provides faster execution time than cv::kmeans on Qualcomm's processors * @param points Points array of type 8u, each row represets a point. * Size is N rows by D columns, can be non-continuous. * @param clusterCenters Initial cluster centers array of type 32f, each row represents a center. diff --git a/modules/fastcv/include/opencv2/fastcv/draw.hpp b/modules/fastcv/include/opencv2/fastcv/draw.hpp index baa2b58c930..1abb5f55080 100644 --- a/modules/fastcv/include/opencv2/fastcv/draw.hpp +++ b/modules/fastcv/include/opencv2/fastcv/draw.hpp @@ -17,7 +17,7 @@ namespace fastcv { /** * @brief Draw convex polygon This function fills the interior of a convex polygon with the specified color. - + Requires the width and stride to be multple of 8. * @param img Image to draw on. Should have up to 4 8-bit channels * @param pts Array of polygon points coordinates. 
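A minimal usage sketch for the `gaussianBlur` and `filter2D` wrappers declared above; the input path, kernel values and output depth are illustrative assumptions that mirror the types exercised by the perf tests, not fixed requirements of the module.

```cpp
#include <opencv2/fastcv.hpp>
#include <opencv2/imgcodecs.hpp>

void blurExample()
{
    // Any single-channel 8-bit image works; the path is a placeholder.
    cv::Mat src = cv::imread("input.png", cv::IMREAD_GRAYSCALE);

    // 5x5 Gaussian blur, borders blurred with zero padding.
    cv::Mat blurred;
    cv::fastcv::gaussianBlur(src, blurred, 5, /*blur_border=*/true);

    // Non-separable 3x3 correlation: 8-bit signed kernel, 16-bit signed output.
    cv::Mat kernel = (cv::Mat_<int8_t>(3, 3) << 0, -1,  0,
                                               -1,  5, -1,
                                                0, -1,  0);
    cv::Mat filtered;
    cv::fastcv::filter2D(src, filtered, CV_16S, kernel);
}
```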
Should contain N two-channel or 2*N one-channel 32-bit integer elements * @param color Color of drawn polygon stored as B,G,R and A(if supported) diff --git a/modules/fastcv/include/opencv2/fastcv/edges.hpp b/modules/fastcv/include/opencv2/fastcv/edges.hpp new file mode 100644 index 00000000000..dd2677bf415 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/edges.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_EDGES_HPP +#define OPENCV_EDGES_HPP + +#include "opencv2/core/mat.hpp" + +namespace cv { +namespace fastcv { +/** + * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions + */ + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Creates a 2D gradient image from source luminance data without normalization. + * Calculate X direction 1 order derivative or Y direction 1 order derivative or both at the same time, . + * @param _src Input image with type CV_8UC1 + * @param _dx Buffer to store horizontal gradient. Must be (dxyStride)*(height) bytes in size. + * If NULL, the horizontal gradient will not be calculated. + * @param _dy Buffer to store vertical gradient. Must be (dxyStride)*(height) bytes in size. + * If NULL, the vertical gradient will not be calculated + * @param kernel_size Sobel kernel size, support 3x3, 5x5, 7x7 + * @param borderType Border type, support BORDER_CONSTANT, BORDER_REPLICATE + * @param borderValue Border value for constant border +*/ +CV_EXPORTS_W void sobel(InputArray _src, OutputArray _dx, OutputArray _dy, int kernel_size, int borderType, int borderValue); + +/** + * @brief Creates a 2D gradient image from source luminance data without normalization. + * This function computes central differences on 3x3 neighborhood and then convolves the result with Sobel kernel, + * borders up to half-kernel width are ignored. + * @param _src Input image with type CV_8UC1 + * @param _dst If _dsty is given, buffer to store horizontal gradient, otherwise, output 8-bit image of |dx|+|dy|. + * Size of buffer is (srcwidth)*(srcheight) bytes + * @param _dsty (Optional)Buffer to store vertical gradient. Must be (srcwidth)*(srcheight) in size. + * @param ddepth The depth of output image CV_8SC1,CV_16SC1,CV_32FC1, + * @param normalization If do normalization for the result +*/ +CV_EXPORTS_W void sobel3x3u8(InputArray _src, OutputArray _dst, OutputArray _dsty = noArray(), int ddepth = CV_8U, + bool normalization = false); + +//! @} + +} +} + +#endif diff --git a/modules/fastcv/include/opencv2/fastcv/fast10.hpp b/modules/fastcv/include/opencv2/fastcv/fast10.hpp index 1d97e9d0df7..1dd15ac198c 100644 --- a/modules/fastcv/include/opencv2/fastcv/fast10.hpp +++ b/modules/fastcv/include/opencv2/fastcv/fast10.hpp @@ -15,9 +15,10 @@ namespace fastcv { //! @{ /** - * @brief Extracts FAST corners and scores from the image based on the mask. - The mask specifies pixels to be ignored by the detector - + * @brief Extracts FAST10 corners and scores from the image based on the mask. + * The mask specifies pixels to be ignored by the detector + * designed for corner detection on Qualcomm's processors, provides enhanced speed. + * * @param src 8-bit grayscale image * @param mask Optional mask indicating which pixels should be omited from corner dection. 
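A short sketch of the `sobel` and `sobel3x3u8` wrappers from `edges.hpp` above; the kernel size, border mode and output depth are illustrative choices within the documented supported values.

```cpp
#include <opencv2/fastcv.hpp>

void gradientExample(const cv::Mat& gray) // gray: CV_8UC1
{
    // Horizontal and vertical gradients with a 5x5 Sobel kernel,
    // replicating border pixels.
    cv::Mat dx, dy;
    cv::fastcv::sobel(gray, dx, dy, 5, cv::BORDER_REPLICATE, 0);

    // 3x3 variant producing signed 16-bit gradients without normalization.
    cv::Mat gx, gy;
    cv::fastcv::sobel3x3u8(gray, gx, gy, CV_16S, /*normalization=*/false);
}
```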
Its size should be k times image width and height, where k = 1/2, 1/4 , 1/8 , 1, 2, 4 and 8 diff --git a/modules/fastcv/include/opencv2/fastcv/fft.hpp b/modules/fastcv/include/opencv2/fastcv/fft.hpp index 88901a6a4f8..1aef585035b 100644 --- a/modules/fastcv/include/opencv2/fastcv/fft.hpp +++ b/modules/fastcv/include/opencv2/fastcv/fft.hpp @@ -18,7 +18,7 @@ namespace fastcv { * @brief Computes the 1D or 2D Fast Fourier Transform of a real valued matrix. For the 2D case, the width and height of the input and output matrix must be powers of 2. For the 1D case, the height of the matrices must be 1, while the width must be a power of 2. - + Accepts 8-bit unsigned integer array, whereas cv::dft accepts floating-point or complex array. * @param src Input array of CV_8UC1. The dimensions of the matrix must be powers of 2 for the 2D case, and in the 1D case, the height must be 1, while the width must be a power of 2. * @param dst The computed FFT matrix of type CV_32FC2. The FFT Re and Im coefficients are stored in different channels. diff --git a/modules/fastcv/include/opencv2/fastcv/hough.hpp b/modules/fastcv/include/opencv2/fastcv/hough.hpp index 74f78a10841..e43323903cb 100644 --- a/modules/fastcv/include/opencv2/fastcv/hough.hpp +++ b/modules/fastcv/include/opencv2/fastcv/hough.hpp @@ -16,7 +16,7 @@ namespace fastcv { /** * @brief Performs Hough Line detection - * + * * @param src Input 8-bit image containing binary contour. Width and step should be divisible by 8 * @param lines Output array containing detected lines in a form of (x1, y1, x2, y2) where all numbers are 32-bit floats * @param threshold Controls the minimal length of a detected line. Value must be between 0.0 and 1.0 @@ -25,6 +25,27 @@ namespace fastcv { */ CV_EXPORTS_W void houghLines(InputArray src, OutputArray lines, double threshold = 0.25); + +/** + * @brief Finds circles in a grayscale image using Hough transform. + * The radius of circle varies from 0 to max(srcWidth, srcHeight). + * + * @param src Input 8-bit image containing binary contour. Step should be divisible by 8, data start should be 128-bit aligned + * @param circles Output array containing detected circles in a form (x, y, r) where all numbers are 32-bit integers + * @param minDist Minimum distance between the centers of the detected circles + * @param cannyThreshold The higher threshold of the two passed to the Canny() edge detector + * (the lower one is twice smaller). Default is 100. + * @param accThreshold The accumulator threshold for the circle centers at the detection + * stage. The smaller it is, the more false circles may be detected. + * Circles, corresponding to the larger accumulator values, will be + * returned first. Default is 100. + * @param minRadius Minimum circle radius, default is 0 + * @param maxRadius Maximum circle radius, default is 0 + */ +CV_EXPORTS_W void houghCircles(InputArray src, OutputArray circles, uint32_t minDist, + uint32_t cannyThreshold = 100, uint32_t accThreshold = 100, + uint32_t minRadius = 0, uint32_t maxRadius = 0); + //! @} } // fastcv:: diff --git a/modules/fastcv/include/opencv2/fastcv/ipptransform.hpp b/modules/fastcv/include/opencv2/fastcv/ipptransform.hpp new file mode 100644 index 00000000000..42c8c94ea78 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/ipptransform.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
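A sketch of how `houghLines` and the new `houghCircles` might be called on a binary contour image; the output element types follow the doc comments above (four 32-bit floats per line, three 32-bit integers per circle), and the Canny preprocessing plus `minDist` value are assumptions, not part of the module.

```cpp
#include <opencv2/fastcv.hpp>
#include <opencv2/imgproc.hpp>

void houghExample(const cv::Mat& gray) // CV_8UC1, width and step divisible by 8
{
    cv::Mat edges;
    cv::Canny(gray, edges, 100, 200);  // produce a binary contour image

    std::vector<cv::Vec4f> lines;      // (x1, y1, x2, y2) per detected line
    cv::fastcv::houghLines(edges, lines, 0.25);

    std::vector<cv::Vec3i> circles;    // (x, y, r) per detected circle
    cv::fastcv::houghCircles(edges, circles, /*minDist=*/20);
}
```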
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_IPPTRANSFORM_HPP +#define OPENCV_FASTCV_IPPTRANSFORM_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief This function performs 8x8 forward discrete Cosine transform on input image + * accepts input of type 8-bit unsigned integer and produces output of type 16-bit signed integer + * provides faster execution time than cv::dct on Qualcomm's processor + * @param src Input image of type CV_8UC1 + * @param dst Output image of type CV_16SC1 + */ +CV_EXPORTS_W void DCT(InputArray src, OutputArray dst); + +/** + * @brief This function performs 8x8 inverse discrete Cosine transform on input image + * provides faster execution time than cv::dct in inverse case on Qualcomm's processor + * @param src Input image of type CV_16SC1 + * @param dst Output image of type CV_8UC1 + */ +CV_EXPORTS_W void IDCT(InputArray src, OutputArray dst); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_IPPTRANSFORM_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/moments.hpp b/modules/fastcv/include/opencv2/fastcv/moments.hpp index 3cffa62f767..90034548571 100644 --- a/modules/fastcv/include/opencv2/fastcv/moments.hpp +++ b/modules/fastcv/include/opencv2/fastcv/moments.hpp @@ -17,8 +17,9 @@ namespace fastcv { /** * @brief Calculates all of the moments up to the third order of the image pixels' intensities The results are returned in the structure cv::Moments. - * @param _src Input image with type CV_8UC1, CV_32SC1, CV_32FC1 - * @param binary If 1, binary image (0x00-black, oxff-white); if 0, grayscale image + * @param _src Input image with type CV_8UC1, CV_32SC1, CV_32FC1 + * @param binary If true, assumes the image to be binary (0x00 for black, 0xff for white), otherwise assumes the image to be + * grayscale. */ CV_EXPORTS cv::Moments moments(InputArray _src, bool binary); diff --git a/modules/fastcv/include/opencv2/fastcv/mser.hpp b/modules/fastcv/include/opencv2/fastcv/mser.hpp index 78282b66fdd..249c0e14e2b 100644 --- a/modules/fastcv/include/opencv2/fastcv/mser.hpp +++ b/modules/fastcv/include/opencv2/fastcv/mser.hpp @@ -15,107 +15,98 @@ namespace fastcv { //! @{ /** - * @brief Structure containing additional information about found contour + * @brief MSER blob detector for grayscale images * */ -struct ContourData +class CV_EXPORTS_W FCVMSER { - uint32_t variation; //!< Variation of a contour from previous grey level - int32_t polarity; //!< Polarity for a contour. This value is 1 if this is a MSER+ region, -1 if this is a MSER- region. - uint32_t nodeId; //!< Node ID for a contour - uint32_t nodeCounter; //!< Node counter for a contour -}; +public: -/** - * @brief This is an overload for MSER() function - * - * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. - Pixels at the image boundary are not processed. If boundary pixels are important - for a particular application, please consider padding the input image with dummy - pixels of one pixel wide. - * @param contours Array containing found contours - * @param numNeighbors Number of neighbors in contours, can be 4 or 8 - * @param delta Delta to be used in MSER algorithm (the difference in grayscale values - within which the region is stable ). - Typical value range [0.8 8], typical value 2 - * @param minArea Minimum area (number of pixels) of a mser contour. 
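A round-trip sketch for the `DCT`/`IDCT` wrappers added above, together with the `moments` call; the assumption that the input dimensions suit the 8x8 block transform comes from the doc comments.

```cpp
#include <opencv2/fastcv.hpp>

void dctExample(const cv::Mat& gray) // CV_8UC1, dimensions suited to 8x8 blocks
{
    cv::Mat freq, restored;
    cv::fastcv::DCT(gray, freq);       // CV_8UC1 -> CV_16SC1
    cv::fastcv::IDCT(freq, restored);  // CV_16SC1 -> CV_8UC1

    // Image moments, treating the input as grayscale rather than binary.
    cv::Moments m = cv::fastcv::moments(gray, /*binary=*/false);
}
```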
- Typical value range [10 50], typical value 30 - * @param maxArea Maximum area (number of pixels) of a mser contour. - Typical value 14400 or 0.25*width*height - * @param maxVariation Maximum variation in grayscale between 2 levels allowed. - Typical value range [0.1 1.0], typical value 0.15 - * @param minDiversity Minimum diversity in grayscale between 2 levels allowed. - Typical value range [0.1 1.0], typical value 0.2 - */ -CV_EXPORTS void MSER(InputArray src, std::vector>& contours, - unsigned int numNeighbors = 4, - unsigned int delta = 2, - unsigned int minArea = 30, - unsigned int maxArea = 14400, - float maxVariation = 0.15f, - float minDiversity = 0.2f); + /** + * @brief Structure containing additional information about found contour + * + */ + struct ContourData + { + uint32_t variation; //!< Variation of a contour from previous grey level + int32_t polarity; //!< Polarity for a contour. This value is 1 if this is a MSER+ region, -1 if this is a MSER- region. + uint32_t nodeId; //!< Node ID for a contour + uint32_t nodeCounter; //!< Node counter for a contour + }; -/** - * @brief This is an overload for MSER() function - * - * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. - Pixels at the image boundary are not processed. If boundary pixels are important - for a particular application, please consider padding the input image with dummy - pixels of one pixel wide. - * @param contours Array containing found contours - * @param boundingBoxes Array containing bounding boxes of found contours - * @param numNeighbors Number of neighbors in contours, can be 4 or 8 - * @param delta Delta to be used in MSER algorithm (the difference in grayscale values - within which the region is stable ). - Typical value range [0.8 8], typical value 2 - * @param minArea Minimum area (number of pixels) of a mser contour. - Typical value range [10 50], typical value 30 - * @param maxArea Maximum area (number of pixels) of a mser contour. - Typical value 14400 or 0.25*width*height - * @param maxVariation Maximum variation in grayscale between 2 levels allowed. - Typical value range [0.1 1.0], typical value 0.15 - * @param minDiversity Minimum diversity in grayscale between 2 levels allowed. - Typical value range [0.1 1.0], typical value 0.2 - */ -CV_EXPORTS void MSER(InputArray src, std::vector>& contours, std::vector& boundingBoxes, - unsigned int numNeighbors = 4, - unsigned int delta = 2, - unsigned int minArea = 30, - unsigned int maxArea = 14400, - float maxVariation = 0.15f, - float minDiversity = 0.2f); + /** + * @brief Creates MSER detector + * + * @param imgSize Image size. Image width has to be greater than 50, and image height has to be greater than 5. + * @param numNeighbors Number of neighbors in contours, can be 4 or 8 + * @param delta Delta to be used in MSER algorithm (the difference in grayscale values + within which the region is stable ). + Typical value range [0.8 8], typical value 2 + * @param minArea Minimum area (number of pixels) of a mser contour. + Typical value range [10 50], typical value 30 + * @param maxArea Maximum area (number of pixels) of a mser contour. + Typical value 14400 or 0.25*width*height + * @param maxVariation Maximum variation in grayscale between 2 levels allowed. + Typical value range [0.1 1.0], typical value 0.15 + * @param minDiversity Minimum diversity in grayscale between 2 levels allowed. 
+ Typical value range [0.1 1.0], typical value 0.2 + * @return Feature detector object ready for detection + */ + CV_WRAP static Ptr create( const cv::Size& imgSize, + int numNeighbors = 4, + int delta = 2, + int minArea = 30, + int maxArea = 14400, + float maxVariation = 0.15f, + float minDiversity = 0.2f); -/** - * @brief Runs MSER blob detector on the grayscale image - * - * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. - Pixels at the image boundary are not processed. If boundary pixels are important - for a particular application, please consider padding the input image with dummy - pixels of one pixel wide. - * @param contours Array containing found contours - * @param boundingBoxes Array containing bounding boxes of found contours - * @param contourData Array containing additional information about found contours - * @param numNeighbors Number of neighbors in contours, can be 4 or 8 - * @param delta Delta to be used in MSER algorithm (the difference in grayscale values - within which the region is stable ). - Typical value range [0.8 8], typical value 2 - * @param minArea Minimum area (number of pixels) of a mser contour. - Typical value range [10 50], typical value 30 - * @param maxArea Maximum area (number of pixels) of a mser contour. - Typical value 14400 or 0.25*width*height - * @param maxVariation Maximum variation in grayscale between 2 levels allowed. - Typical value range [0.1 1.0], typical value 0.15 - * @param minDiversity Minimum diversity in grayscale between 2 levels allowed. - Typical value range [0.1 1.0], typical value 0.2 - */ -CV_EXPORTS void MSER(InputArray src, std::vector>& contours, std::vector& boundingBoxes, - std::vector& contourData, - unsigned int numNeighbors = 4, - unsigned int delta = 2, - unsigned int minArea = 30, - unsigned int maxArea = 14400, - float maxVariation = 0.15f, - float minDiversity = 0.2f); + /** + * @brief This is an overload for detect() function + * + * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. + Pixels at the image boundary are not processed. If boundary pixels are important + for a particular application, please consider padding the input image with dummy + pixels of one pixel wide. + * @param contours Array containing found contours + */ + CV_WRAP virtual void detect(InputArray src, std::vector>& contours) = 0; + + /** + * @brief This is an overload for detect() function + * + * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. + Pixels at the image boundary are not processed. If boundary pixels are important + for a particular application, please consider padding the input image with dummy + pixels of one pixel wide. + * @param contours Array containing found contours + * @param boundingBoxes Array containing bounding boxes of found contours + */ + CV_WRAP virtual void detect(InputArray src, std::vector>& contours, std::vector& boundingBoxes) = 0; + + /** + * @brief Runs MSER blob detector on the grayscale image + * + * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. + Pixels at the image boundary are not processed. If boundary pixels are important + for a particular application, please consider padding the input image with dummy + pixels of one pixel wide. 
+ * @param contours Array containing found contours + * @param boundingBoxes Array containing bounding boxes of found contours + * @param contourData Array containing additional information about found contours + */ + virtual void detect(InputArray src, std::vector>& contours, std::vector& boundingBoxes, + std::vector& contourData) = 0; + + CV_WRAP virtual cv::Size getImgSize() = 0; + CV_WRAP virtual int getNumNeighbors() = 0; + CV_WRAP virtual int getDelta() = 0; + CV_WRAP virtual int getMinArea() = 0; + CV_WRAP virtual int getMaxArea() = 0; + CV_WRAP virtual float getMaxVariation() = 0; + CV_WRAP virtual float getMinDiversity() = 0; + + virtual ~FCVMSER() {} +}; //! @} diff --git a/modules/fastcv/include/opencv2/fastcv/pyramid.hpp b/modules/fastcv/include/opencv2/fastcv/pyramid.hpp new file mode 100644 index 00000000000..6c20a21ab78 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/pyramid.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_PYRAMID_HPP +#define OPENCV_FASTCV_PYRAMID_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Creates a gradient pyramid from an image pyramid + * + * @param pyr Input pyramid of 1-channel 8-bit images. Only continuous images are supported. + * @param dx Horizontal Sobel gradient pyramid of the same size as pyr + * @param dy Verical Sobel gradient pyramid of the same size as pyr + * @param outType Type of output data, can be CV_8S, CV_16S or CV_32F + */ +CV_EXPORTS_W void sobelPyramid(InputArrayOfArrays pyr, OutputArrayOfArrays dx, OutputArrayOfArrays dy, int outType = CV_8S); + +/** + * @brief Builds an image pyramid of float32 arising from a single + original image - that are successively downscaled w.r.t. the + pre-set levels. This API supports both ORB scaling and scale down by half. + * + * @param src Input single-channel image of type 8U or 32F + * @param pyr Output array containing nLevels downscaled image copies + * @param nLevels Number of pyramid levels to produce + * @param scaleBy2 to scale images 2x down or by a factor of 1/(2)^(1/4) which is approximated as 0.8408964 (ORB downscaling), + * ORB scaling is not supported for float point images + * @param borderType how to process border, the options are BORDER_REFLECT (maps to FASTCV_BORDER_REFLECT), + * BORDER_REFLECT_101 (maps to FASTCV_BORDER_REFLECT_V2) and BORDER_REPLICATE (maps to FASTCV_BORDER_REPLICATE). + * Other border types are mapped to FASTCV_BORDER_UNDEFINED(border pixels are ignored). Currently, borders only + * supported for downscaling by half, ignored for ORB scaling. Also ignored for float point images + * @param borderValue what value should be used to fill border, ignored for float point images + */ +CV_EXPORTS_W void buildPyramid(InputArray src, OutputArrayOfArrays pyr, int nLevels, bool scaleBy2 = true, + int borderType = cv::BORDER_REFLECT, uint8_t borderValue = 0); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_PYRAMID_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/scale.hpp b/modules/fastcv/include/opencv2/fastcv/scale.hpp index 8d7d084ac24..276b2304050 100644 --- a/modules/fastcv/include/opencv2/fastcv/scale.hpp +++ b/modules/fastcv/include/opencv2/fastcv/scale.hpp @@ -16,6 +16,7 @@ namespace fastcv { /** * @brief Down-scale the image by averaging each 2x2 pixel block. 
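A usage sketch for the new `FCVMSER` class, mirroring the updated perf test: create a detector sized for the input image, then call one of the `detect()` overloads. The parameter values are the documented typical defaults.

```cpp
#include <opencv2/fastcv.hpp>

void mserExample(const cv::Mat& gray) // CV_8UC1, width > 50, height > 5
{
    cv::Ptr<cv::fastcv::FCVMSER> mser =
        cv::fastcv::FCVMSER::create(gray.size(), /*numNeighbors=*/4, /*delta=*/2,
                                    /*minArea=*/30, /*maxArea=*/14400,
                                    /*maxVariation=*/0.15f, /*minDiversity=*/0.2f);

    std::vector<std::vector<cv::Point>> contours;
    std::vector<cv::Rect> boundingBoxes;
    mser->detect(gray, contours, boundingBoxes);
}
```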
+ * This function is not bit-exact with cv::resize but provides faster execution time on Qualcomm's processor. * @param _src The first input image data, type CV_8UC1, src height must be a multiple of 2 * @param _dst The output image data, type CV_8UC1 */ @@ -23,6 +24,7 @@ CV_EXPORTS_W void resizeDownBy2(cv::InputArray _src, cv::OutputArray _dst); /** * @brief Down-scale the image by averaging each 4x4 pixel block. + * This function is not bit-exact with cv::resize but provides faster execution time on Qualcomm's processor. * @param _src The first input image data, type CV_8UC1, src height must be a multiple of 4 * @param _dst The output image data, type CV_8UC1 */ diff --git a/modules/fastcv/include/opencv2/fastcv/shift.hpp b/modules/fastcv/include/opencv2/fastcv/shift.hpp index a545789f199..3ca2c22f2fc 100644 --- a/modules/fastcv/include/opencv2/fastcv/shift.hpp +++ b/modules/fastcv/include/opencv2/fastcv/shift.hpp @@ -18,9 +18,12 @@ namespace fastcv { * @brief Applies the meanshift procedure and obtains the final converged position. This function applies the meanshift procedure to an original image (usually a probability image) and obtains the final converged position. The converged position search will stop either it has reached - the required accuracy or the maximum number of iterations. + the required accuracy or the maximum number of iterations. Moments used in the algorithm are calculated + in floating point. + This function isn't bit-exact with cv::meanShift but provides improved latency on Snapdragon processors. - * @param src 8-bit grayscale image which is usually a probability image computed based on object histogram + * @param src 8-bit, 32-bit int or 32-bit float grayscale image which is usually a probability image + * computed based on object histogram * @param rect Initial search window position which also returns the final converged window position * @param termCrit The criteria used to finish the MeanShift which consists of two termination criteria: * 1) epsilon: required accuracy; 2) max_iter: maximum number of iterations diff --git a/modules/fastcv/include/opencv2/fastcv/smooth.hpp b/modules/fastcv/include/opencv2/fastcv/smooth.hpp index a3cee45a3ce..2127ae5a23d 100644 --- a/modules/fastcv/include/opencv2/fastcv/smooth.hpp +++ b/modules/fastcv/include/opencv2/fastcv/smooth.hpp @@ -20,6 +20,7 @@ namespace fastcv { Different from traditional bilateral filtering, here the smoothing is actually performed in gradient domain. The algorithm claims that it's more efficient than the original bilateral filtering in both image quality and computation. See algorithm description in the paper Recursive Bilateral Filtering, ECCV2012 by Prof Yang Qingxiong +This function isn't bit-exact with cv::bilateralFilter but provides improved latency on Snapdragon processors. * @param src Input image, should have one CV_8U channel * @param dst Output array having one CV_8U channel * @param sigmaColor Sigma in the color space, the bigger the value the more color difference is smoothed by the algorithm diff --git a/modules/fastcv/include/opencv2/fastcv/thresh.hpp b/modules/fastcv/include/opencv2/fastcv/thresh.hpp index 878761d75d5..418f98a012d 100644 --- a/modules/fastcv/include/opencv2/fastcv/thresh.hpp +++ b/modules/fastcv/include/opencv2/fastcv/thresh.hpp @@ -17,7 +17,7 @@ namespace fastcv { /** * @brief Binarizes a grayscale image based on a pair of threshold values. 
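A sketch of the down-scaling wrappers documented above; `resizeDownBy4` is an assumed name matching the 4x4 averaging variant described in the doc comment, since only `resizeDownBy2` is declared in the hunk shown.

```cpp
#include <opencv2/fastcv.hpp>

void downscaleExample(const cv::Mat& src) // CV_8UC1
{
    cv::Mat half;
    cv::fastcv::resizeDownBy2(src, half);     // src height must be a multiple of 2

    cv::Mat quarter;
    cv::fastcv::resizeDownBy4(src, quarter);  // assumed 4x4 variant; height must be a multiple of 4
}
```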
The binarized image will be in the two values * selected by user - + * this function provides improved latency on Snapdragon processor. * @param src 8-bit grayscale image * @param dst Output image of the same size and type as input image, can be the same as input image * @param lowThresh The lower threshold value for binarization diff --git a/modules/fastcv/include/opencv2/fastcv/tracking.hpp b/modules/fastcv/include/opencv2/fastcv/tracking.hpp new file mode 100644 index 00000000000..9cca92c1239 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/tracking.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_TRACKING_HPP +#define OPENCV_FASTCV_TRACKING_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Calculates sparse optical flow using Lucas-Kanade algorithm + * accepts 8-bit unsigned integer image + * Provides faster execution time on Qualcomm's processor + * @param src Input single-channel image of type 8U, initial motion frame + * @param dst Input single-channel image of type 8U, final motion frame, should have the same size and stride as initial frame + * @param srcPyr Pyramid built from intial motion frame + * @param dstPyr Pyramid built from final motion frame + * @param ptsIn Array of initial subpixel coordinates of starting points, should contain 32F 2D elements + * @param ptsOut Output array of calculated final points, should contain 32F 2D elements + * @param ptsEst Input array of estimations for final points, should contain 32F 2D elements, can be empty + * @param statusVec Output array of int32 values indicating status of each feature, can be empty + * @param winSize Size of window for optical flow searching. Width and height ust be odd numbers. Suggested values are 5, 7 or 9 + * @param termCriteria Termination criteria containing max number of iterations, max epsilon and stop condition + */ +void trackOpticalFlowLK(InputArray src, InputArray dst, + InputArrayOfArrays srcPyr, InputArrayOfArrays dstPyr, + InputArray ptsIn, OutputArray ptsOut, InputArray ptsEst, + OutputArray statusVec, cv::Size winSize = {7, 7}, + cv::TermCriteria termCriteria = {cv::TermCriteria::MAX_ITER | cv::TermCriteria::EPS, + /* maxIterations */ 7, + /* maxEpsilon */ 0.03f * 0.03f}); + +/** + * @brief Overload for v1 of the LK tracking function + * + * @param src Input single-channel image of type 8U, initial motion frame + * @param dst Input single-channel image of type 8U, final motion frame, should have the same size and stride as initial frame + * @param srcPyr Pyramid built from intial motion frame + * @param dstPyr Pyramid built from final motion frame + * @param srcDxPyr Pyramid of Sobel derivative by X of srcPyr + * @param srcDyPyr Pyramid of Sobel derivative by Y of srcPyr + * @param ptsIn Array of initial subpixel coordinates of starting points, should contain 32F 2D elements + * @param ptsOut Output array of calculated final points, should contain 32F 2D elements + * @param statusVec Output array of int32 values indicating status of each feature, can be empty + * @param winSize Size of window for optical flow searching. Width and height ust be odd numbers. 
Suggested values are 5, 7 or 9 + * @param maxIterations Maximum number of iterations to try + */ +void trackOpticalFlowLK(InputArray src, InputArray dst, + InputArrayOfArrays srcPyr, InputArrayOfArrays dstPyr, + InputArrayOfArrays srcDxPyr, InputArrayOfArrays srcDyPyr, + InputArray ptsIn, OutputArray ptsOut, + OutputArray statusVec, cv::Size winSize = {7, 7}, int maxIterations = 7); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_TRACKING_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/warp.hpp b/modules/fastcv/include/opencv2/fastcv/warp.hpp new file mode 100644 index 00000000000..8f58cd36577 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/warp.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_WARP_HPP +#define OPENCV_WARP_HPP + +#include +namespace cv { +namespace fastcv { + +/** + * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions +*/ + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Perspective warp two images using the same transformation. Bi-linear interpolation is used where applicable. + * For example, to warp a grayscale image and an alpha image at the same time, or warp two color channels. + * @param _src1 First input 8-bit image. Size of buffer is src1Stride*srcHeight bytes. + * @param _src2 Second input 8-bit image. Size of buffer is src2Stride*srcHeight bytes. + * @param _dst1 First warped output image (correspond to src1). Size of buffer is dst1Stride*dstHeight bytes, type CV_8UC1 + * @param _dst2 Second warped output image (correspond to src2). Size of buffer is dst2Stride*dstHeight bytes, type CV_8UC1 + * @param _M0 The 3x3 perspective transformation matrix (inversed map) + * @param dsize The output image size +*/ +CV_EXPORTS_W void warpPerspective2Plane(InputArray _src1, InputArray _src2, OutputArray _dst1, OutputArray _dst2, + InputArray _M0, Size dsize); + +//! 
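A sketch of the `trackOpticalFlowLK` wrapper declared above, following the pattern used by the module's perf tests: pyramids for both frames plus pre-sized output vectors. The pyramid depth, window size and use of `cv::buildPyramid` are illustrative assumptions.

```cpp
#include <opencv2/fastcv.hpp>
#include <opencv2/imgproc.hpp>

void lkExample(const cv::Mat& prevFrame, const cv::Mat& nextFrame,
               const std::vector<cv::Point2f>& ptsIn) // both frames CV_8UC1
{
    std::vector<cv::Mat> prevPyr, nextPyr;
    cv::buildPyramid(prevFrame, prevPyr, /*maxlevel=*/3);
    cv::buildPyramid(nextFrame, nextPyr, /*maxlevel=*/3);

    // Pre-sized outputs, as in the perf tests.
    std::vector<cv::Point2f> ptsOut(ptsIn.size());
    std::vector<int32_t> statusVec(ptsIn.size());

    // No initial estimates, default termination criteria, 7x7 search window.
    cv::fastcv::trackOpticalFlowLK(prevFrame, nextFrame, prevPyr, nextPyr,
                                   ptsIn, ptsOut, cv::noArray(), statusVec, {7, 7});
}
```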
@} + +} +} + +#endif \ No newline at end of file diff --git a/modules/fastcv/perf/perf_bilateral.cpp b/modules/fastcv/perf/perf_bilateral.cpp index bb985da391d..63323d459cc 100644 --- a/modules/fastcv/perf/perf_bilateral.cpp +++ b/modules/fastcv/perf/perf_bilateral.cpp @@ -7,10 +7,10 @@ namespace opencv_test { -typedef std::tuple BilateralPerfParams; -typedef perf::TestBaseWithParam BilateralPerfTest; +typedef std::tuple BilateralRecursivePerfParams; +typedef perf::TestBaseWithParam BilateralRecursivePerfTest; -PERF_TEST_P(BilateralPerfTest, run, +PERF_TEST_P(BilateralRecursivePerfTest, run, ::testing::Combine(::testing::Values(0.01f, 0.03f, 0.1f, 1.f, 5.f), ::testing::Values(0.01f, 0.05f, 0.1f, 1.f, 5.f)) ) @@ -32,14 +32,15 @@ PERF_TEST_P(BilateralPerfTest, run, SANITY_CHECK_NOTHING(); } -typedef std::tuple BilateralPerfParams2; -typedef perf::TestBaseWithParam BilateralPerfTest2; +typedef std::tuple BilateralPerfParams; +typedef perf::TestBaseWithParam BilateralPerfTest; -PERF_TEST_P(BilateralPerfTest2, run, + +PERF_TEST_P(BilateralPerfTest, run, ::testing::Combine(::testing::Values(0.01f, 0.03f, 0.1f, 1.f, 5.f), ::testing::Values(0.01f, 0.05f, 0.1f, 1.f, 5.f), - ::testing::Values(Size(8, 8), Size(640, 480), Size(800, 600)), + ::testing::Values(Size(8, 8), Size(640, 480), Size(800, 600)), ::testing::Values(5, 7, 9)) ) { @@ -47,17 +48,17 @@ PERF_TEST_P(BilateralPerfTest2, run, float sigmaColor = std::get<0>(p); float sigmaSpace = std::get<1>(p); cv::Size size = std::get<2>(p); - int d = get<3>(p); + int d = get<3>(p); RNG& rng = cv::theRNG(); Mat src(size, CV_8UC1); cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); Mat dst; - for (; next(); ) + while (next()) { startTimer(); - cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace); + cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace); stopTimer(); } diff --git a/modules/fastcv/perf/perf_blur.cpp b/modules/fastcv/perf/perf_blur.cpp new file mode 100644 index 00000000000..bca8f80974a --- /dev/null +++ b/modules/fastcv/perf/perf_blur.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
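A sketch of `warpPerspective2Plane` usage: warping two 8-bit planes with a single inverse perspective map. The corner correspondences and the CV_32F inverse matrix follow the pattern of the perf test and are illustrative.

```cpp
#include <opencv2/fastcv.hpp>
#include <opencv2/imgproc.hpp>

void warpExample(const cv::Mat& planeA, const cv::Mat& planeB, cv::Size dstSize) // both CV_8UC1
{
    // Hypothetical correspondence: map the four source corners to the output corners.
    cv::Point2f s[4] = { {0, 0}, {planeA.cols - 1.f, 0},
                         {planeA.cols - 1.f, planeA.rows - 1.f}, {0, planeA.rows - 1.f} };
    cv::Point2f d[4] = { {0, 0}, {dstSize.width - 1.f, 0},
                         {dstSize.width - 1.f, dstSize.height - 1.f}, {0, dstSize.height - 1.f} };

    cv::Mat M;
    cv::getPerspectiveTransform(s, d).convertTo(M, CV_32F);
    cv::invert(M, M); // the function expects the inverse map

    cv::Mat dstA, dstB;
    cv::fastcv::warpPerspective2Plane(planeA, planeB, dstA, dstB, M, dstSize);
}
```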
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef perf::TestBaseWithParam> GaussianBlurPerfTest; + +PERF_TEST_P(GaussianBlurPerfTest, run, + ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size + ::testing::Values(CV_8U,CV_16S,CV_32S), // image depth + ::testing::Values(3, 5), // kernel size + ::testing::Values(true,false) // blur border + ) + ) +{ + cv::Size srcSize = get<0>(GetParam()); + int depth = get<1>(GetParam()); + int ksize = get<2>(GetParam()); + bool border = get<3>(GetParam()); + + // For some cases FastCV not support, so skip them + if((ksize!=5) && (depth!=CV_8U)) + throw ::perf::TestBase::PerfSkipTestException(); + + cv::Mat src(srcSize, depth); + cv::Mat dst; + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + + while (next()) + { + startTimer(); + cv::fastcv::gaussianBlur(src, dst, ksize, border); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +typedef perf::TestBaseWithParam> Filter2DPerfTest; + +PERF_TEST_P(Filter2DPerfTest, run, + ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size + ::testing::Values(CV_8U,CV_16S,CV_32F), // dst image depth + ::testing::Values(3, 5, 7, 9, 11) // kernel size + ) + ) +{ + cv::Size srcSize = get<0>(GetParam()); + int ddepth = get<1>(GetParam()); + int ksize = get<2>(GetParam()); + + cv::Mat src(srcSize, CV_8U); + cv::Mat kernel; + cv::Mat dst; + + switch (ddepth) + { + case CV_8U: + case CV_16S: + { + kernel.create(ksize,ksize,CV_8S); + break; + } + case CV_32F: + { + kernel.create(ksize,ksize,CV_32F); + break; + } + default: + break; + } + + cv::randu(src, 0, 256); + cv::randu(kernel, INT8_MIN, INT8_MAX); + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + + while (next()) + { + startTimer(); + cv::fastcv::filter2D(src, dst, ddepth, kernel); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +typedef perf::TestBaseWithParam> SepFilter2DPerfTest; + +PERF_TEST_P(SepFilter2DPerfTest, run, + ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size + ::testing::Values(CV_8U,CV_16S), // dst image depth + ::testing::Values(3, 5, 7, 9, 11, 13, 15, 17) // kernel size + ) + ) +{ + cv::Size srcSize = get<0>(GetParam()); + int ddepth = get<1>(GetParam()); + int ksize = get<2>(GetParam()); + + cv::Mat src(srcSize, ddepth); + cv::Mat kernel(1, ksize, ddepth); + cv::Mat dst; + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + cvtest::randUni(rng, kernel, Scalar::all(INT8_MIN), Scalar::all(INT8_MAX)); + + while (next()) + { + startTimer(); + cv::fastcv::sepFilter2D(src, dst, ddepth, kernel, kernel); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace \ No newline at end of file diff --git a/modules/fastcv/perf/perf_edges.cpp b/modules/fastcv/perf/perf_edges.cpp new file mode 100644 index 00000000000..74ffa552124 --- /dev/null +++ b/modules/fastcv/perf/perf_edges.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef perf::TestBaseWithParam> SobelPerfTest; + +PERF_TEST_P(SobelPerfTest, run, + ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size + ::testing::Values(3,5,7), // kernel size + ::testing::Values(BORDER_CONSTANT, BORDER_REPLICATE), // border type + ::testing::Values(0) // border value + ) + ) +{ + Size srcSize = get<0>(GetParam()); + int ksize = get<1>(GetParam()); + int border = get<2>(GetParam()); + int borderValue = get<3>(GetParam()); + + cv::Mat dx, dy, src(srcSize, CV_8U); + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + + while (next()) + { + startTimer(); + cv::fastcv::sobel(src,dx,dy,ksize,border,borderValue); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +typedef perf::TestBaseWithParam> Sobel3x3u8PerfTest; + +PERF_TEST_P(Sobel3x3u8PerfTest, run, + ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size + ::testing::Values(CV_8S, CV_16S, CV_32F), // image depth + ::testing::Values(0, 1) // normalization + ) + ) +{ + Size srcSize = get<0>(GetParam()); + int ddepth = get<1>(GetParam()); + int normalization = get<2>(GetParam()); + + cv::Mat dx, dy, src(srcSize, CV_8U); + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + + if((normalization ==0) && (ddepth == CV_8S)) + throw ::perf::TestBase::PerfSkipTestException(); + + while (next()) + { + startTimer(); + cv::fastcv::sobel3x3u8(src, dx, dy, ddepth, normalization); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} +} //namespace \ No newline at end of file diff --git a/modules/fastcv/perf/perf_fft_dct.cpp b/modules/fastcv/perf/perf_fft_dct.cpp new file mode 100644 index 00000000000..30e4e68ce62 --- /dev/null +++ b/modules/fastcv/perf/perf_fft_dct.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef perf::TestBaseWithParam FFTExtPerfTest; + +PERF_TEST_P_(FFTExtPerfTest, forward) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + Mat dst; + + while(next()) + { + startTimer(); + cv::fastcv::FFT(src, dst); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P_(FFTExtPerfTest, inverse) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + Mat fwd, back; + cv::fastcv::FFT(src, fwd); + + while(next()) + { + startTimer(); + cv::fastcv::IFFT(fwd, back); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, FFTExtPerfTest, + ::testing::Values(Size(8, 8), Size(128, 128), Size(32, 256), Size(512, 512), + Size(32, 1), Size(512, 1))); + +/// DCT /// + +typedef perf::TestBaseWithParam DCTExtPerfTest; + +PERF_TEST_P_(DCTExtPerfTest, forward) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + Mat dst, ref; + + while(next()) + { + startTimer(); + cv::fastcv::DCT(src, dst); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P_(DCTExtPerfTest, inverse) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + Mat fwd, back; + cv::fastcv::DCT(src, fwd); + + while(next()) + { + startTimer(); + cv::fastcv::IDCT(fwd, back); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, DCTExtPerfTest, + ::testing::Values(Size(8, 8), Size(128, 128), Size(32, 256), Size(512, 512))); +} // namespace diff --git a/modules/fastcv/perf/perf_mser.cpp b/modules/fastcv/perf/perf_mser.cpp index 4e1a6ce80af..36f876cd045 100644 --- a/modules/fastcv/perf/perf_mser.cpp +++ b/modules/fastcv/perf/perf_mser.cpp @@ -30,36 +30,37 @@ PERF_TEST_P(MSERPerfTest, run, cv::Mat src = imread(cvtest::findDataFile(imgPath), cv::IMREAD_GRAYSCALE); - unsigned int delta = 2; - unsigned int minArea = 256; - unsigned int maxArea = (int)src.total()/4; + uint32_t delta = 2; + uint32_t minArea = 256; + uint32_t maxArea = (int)src.total()/4; float maxVariation = 0.15f; float minDiversity = 0.2f; + cv::Ptr mser; + mser = cv::fastcv::FCVMSER::create(src.size(), numNeighbors, delta, minArea, maxArea, + maxVariation, minDiversity); + while(next()) { std::vector> contours; std::vector bboxes; - std::vector contourData; + std::vector contourData; startTimer(); if (useBboxes) { if (useContourData) { - cv::fastcv::MSER(src, contours, bboxes, contourData, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + mser->detect(src, contours, bboxes, contourData); } else { - cv::fastcv::MSER(src, contours, bboxes, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + mser->detect(src, contours, bboxes); } } else { - cv::fastcv::MSER(src, contours, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + mser->detect(src, contours); } stopTimer(); } diff --git a/modules/fastcv/perf/perf_pyramid.cpp b/modules/fastcv/perf/perf_pyramid.cpp new file mode 100644 index 00000000000..27c0fae8d59 --- /dev/null +++ b/modules/fastcv/perf/perf_pyramid.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. 
All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef std::tuple PyramidTestParams; +class PyramidTest : public ::perf::TestBaseWithParam { }; + +PERF_TEST_P(PyramidTest, checkAllVersions, // version, useFloat, nLevels + ::testing::Values( + PyramidTestParams { true, 2, true}, PyramidTestParams { true, 3, true}, PyramidTestParams { true, 4, true}, + PyramidTestParams {false, 2, true}, PyramidTestParams {false, 3, true}, PyramidTestParams {false, 4, true}, + PyramidTestParams {false, 2, false}, PyramidTestParams {false, 3, false}, PyramidTestParams {false, 4, false} + )) +{ + auto par = GetParam(); + + bool useFloat = std::get<0>(par); + int nLevels = std::get<1>(par); + bool scaleBy2 = std::get<2>(par); + + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + if (useFloat) + { + cv::Mat f; + src.convertTo(f, CV_32F); + src = f; + } + + while(next()) + { + std::vector pyr; + startTimer(); + cv::fastcv::buildPyramid(src, pyr, nLevels, scaleBy2); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + + +typedef std::tuple SobelPyramidTestParams; +class SobelPyramidTest : public ::perf::TestBaseWithParam {}; + +PERF_TEST_P(SobelPyramidTest, checkAllTypes, + ::testing::Combine(::testing::Values(CV_8S, CV_16S, CV_32F), + ::testing::Values(3, 6))) +{ + auto p = GetParam(); + int type = std::get<0>(p); + size_t nLevels = std::get<1>(p); + + // NOTE: test files should be manually loaded to folder on a device, for example like this: + // adb push fastcv/misc/bilateral_recursive/ /sdcard/testdata/fastcv/bilateral/ + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + std::vector pyr; + cv::fastcv::buildPyramid(src, pyr, nLevels); + + while(next()) + { + std::vector pyrDx, pyrDy; + startTimer(); + cv::fastcv::sobelPyramid(pyr, pyrDx, pyrDy, type); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace diff --git a/modules/fastcv/perf/perf_tracking.cpp b/modules/fastcv/perf/perf_tracking.cpp new file mode 100644 index 00000000000..fc5d10eccdf --- /dev/null +++ b/modules/fastcv/perf/perf_tracking.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef std::tuple TrackingTestParams; +class TrackingTest : public ::perf::TestBaseWithParam {}; + +PERF_TEST_P(TrackingTest, checkAllVersions, + ::testing::Combine(::testing::Values(5, 7, 9), // window size + ::testing::Bool(), // useSobelPyramid + ::testing::Bool() // useInitialEstimate + )) +{ + auto par = GetParam(); + + int winSz = std::get<0>(par); + bool useSobelPyramid = std::get<1>(par); + bool useInitialEstimate = std::get<2>(par); + + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + double ang = 5.0 * CV_PI / 180.0; + cv::Matx33d tr = { + cos(ang), -sin(ang), 1, + sin(ang), cos(ang), 2, + 0, 0, 1 + }; + cv::Matx33d orig { + 1, 0, -(double)src.cols / 2, + 0, 1, -(double)src.rows / 2, + 0, 0, 1 + }; + cv::Matx33d back { + 1, 0, (double)src.cols / 2, + 0, 1, (double)src.rows / 2, + 0, 0, 1 + }; + cv::Matx23d trans = (back * tr * orig).get_minor<2, 3>(0, 0); + + cv::Mat dst; + cv::warpAffine(src, dst, trans, src.size()); + + int nLevels = 4; + std::vector srcPyr, dstPyr; + + cv::buildPyramid(src, srcPyr, nLevels - 1); + cv::buildPyramid(dst, dstPyr, nLevels - 1); + + cv::Matx23f transf = trans; + int nPts = 32; + std::vector ptsIn, ptsEst, ptsExpected; + for (int i = 0; i < nPts; i++) + { + cv::Point2f p { (((float)cv::theRNG())*0.5f + 0.25f) * src.cols, + (((float)cv::theRNG())*0.5f + 0.25f) * src.rows }; + ptsIn.push_back(p); + ptsExpected.push_back(transf * cv::Vec3f(p.x, p.y, 1.0)); + ptsEst.push_back(p); + } + + cv::TermCriteria termCrit; + termCrit.type = cv::TermCriteria::COUNT | cv::TermCriteria::EPS; + termCrit.maxCount = 7; + termCrit.epsilon = 0.03f * 0.03f; + + std::vector srcDxPyr, srcDyPyr; + if (useSobelPyramid) + { + cv::fastcv::sobelPyramid(srcPyr, srcDxPyr, srcDyPyr, CV_8S); + } + + while(next()) + { + std::vector statusVec(nPts); + std::vector ptsOut(nPts); + startTimer(); + if (useSobelPyramid) + { + cv::fastcv::trackOpticalFlowLK(src, dst, srcPyr, dstPyr, srcDxPyr, srcDyPyr, + ptsIn, ptsOut, statusVec, {winSz, winSz}); + } + else + { + cv::fastcv::trackOpticalFlowLK(src, dst, srcPyr, dstPyr, ptsIn, ptsOut, (useInitialEstimate ? ptsEst : noArray()), + statusVec, {winSz, winSz}, termCrit); + } + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace diff --git a/modules/fastcv/perf/perf_warp.cpp b/modules/fastcv/perf/perf_warp.cpp new file mode 100644 index 00000000000..231056aef56 --- /dev/null +++ b/modules/fastcv/perf/perf_warp.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef perf::TestBaseWithParam WarpPerspective2PlanePerfTest; + +PERF_TEST_P(WarpPerspective2PlanePerfTest, run, + ::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p)) +{ + cv::Size dstSize = GetParam(); + cv::Mat img = imread(cvtest::findDataFile("cv/shared/baboon.png")); + Mat src(img.rows, img.cols, CV_8UC1); + cvtColor(img,src,cv::COLOR_BGR2GRAY); + cv::Mat dst1, dst2, mat; + mat.create(3,3,CV_32FC1); + dst1.create(dstSize,CV_8UC1); + dst2.create(dstSize,CV_8UC1); + + RNG& rng = cv::theRNG(); + Point2f s[4], d[4]; + + s[0] = Point2f(0,0); + d[0] = Point2f(0,0); + s[1] = Point2f(src.cols-1.f,0); + d[1] = Point2f(dst1.cols-1.f,0); + s[2] = Point2f(src.cols-1.f,src.rows-1.f); + d[2] = Point2f(dst1.cols-1.f,dst1.rows-1.f); + s[3] = Point2f(0,src.rows-1.f); + d[3] = Point2f(0,dst1.rows-1.f); + + float buffer[16]; + Mat tmp( 1, 16, CV_32FC1, buffer ); + rng.fill( tmp, 1, Scalar::all(0.), Scalar::all(0.1) ); + + for(int i = 0; i < 4; i++ ) + { + s[i].x += buffer[i*4]*src.cols/2; + s[i].y += buffer[i*4+1]*src.rows/2; + d[i].x += buffer[i*4+2]*dst1.cols/2; + d[i].y += buffer[i*4+3]*dst1.rows/2; + } + + cv::getPerspectiveTransform( s, d ).convertTo( mat, mat.depth() ); + // Invert the perspective matrix + invert(mat,mat); + + while (next()) + { + startTimer(); + cv::fastcv::warpPerspective2Plane(src, src, dst1, dst2, mat, dstSize); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} //namespace \ No newline at end of file diff --git a/modules/fastcv/src/bilateralFilter.cpp b/modules/fastcv/src/bilateralFilter.cpp index 1cd0ece6b14..a0995347b24 100644 --- a/modules/fastcv/src/bilateralFilter.cpp +++ b/modules/fastcv/src/bilateralFilter.cpp @@ -12,54 +12,45 @@ class FcvFilterLoop_Invoker : public cv::ParallelLoopBody { public: - FcvFilterLoop_Invoker(cv::Mat src_, size_t src_step_, cv::Mat dst_, size_t dst_step_, int width_, int height_, int bdr_, int knl_, float32_t sigma_color_, float32_t sigma_space_) : + FcvFilterLoop_Invoker(cv::Mat src_, size_t src_step_, cv::Mat dst_, size_t dst_step_, int width_, int height_, + int bdr_, int knl_, float32_t sigma_color_, float32_t sigma_space_) : cv::ParallelLoopBody(), src_step(src_step_), dst_step(dst_step_), width(width_), height(height_), bdr(bdr_), knl(knl_), sigma_color(sigma_color_), sigma_space(sigma_space_), src(src_), dst(dst_) - { - } + { } virtual void operator()(const cv::Range& range) const CV_OVERRIDE { - - fcvStatus status = FASTCV_SUCCESS; - int height_ = range.end - range.start; + int height_ = range.end - range.start; int width_ = width; cv::Mat src_; int n = knl/2; - if(range.start == 0 && range.end == height) - { - src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); - cv::copyMakeBorder(src, src_, n, n, n, n, bdr); - } - else if(range.start == 0) - { - src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); - cv::copyMakeBorder(src(cv::Rect(0, 0, width_, height_ + n)), src_, n, 0, n, n, bdr); - } - else if(range.end == (height)) + src_ = cv::Mat(height_ + 2 * n, width_ + 2 * n, CV_8U); + if (range.start == 0 && range.end == height) { - src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); - cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + n)), src_, 0, n, n, n, bdr); - } - else - { - src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); - cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + 2*n)), src_, 0, 0, n, n, bdr); - } - + cv::copyMakeBorder(src, src_, n, n, n, n, bdr); + } + else if 
(range.start == 0) + { + cv::copyMakeBorder(src(cv::Rect(0, 0, width_, height_ + n)), src_, n, 0, n, n, bdr); + } + else if (range.end == (height)) + { + cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + n)), src_, 0, n, n, n, bdr); + } + else + { + cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + 2 * n)), src_, 0, 0, n, n, bdr); + } cv::Mat dst_padded = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); - if(knl == 5) - status = fcvBilateralFilter5x5u8_v3(src_.data, width_ + 2*n, height_ + 2*n, width_ + 2*n, - dst_padded.data, width_ + 2*n, sigma_color, sigma_space, 0); - else if(knl == 7) - status = fcvBilateralFilter7x7u8_v3(src_.data, width_ + 2*n, height_ + 2*n, width_ + 2*n, - dst_padded.data, width_ + 2*n, sigma_color, sigma_space, 0); - else if(knl == 9) - status = fcvBilateralFilter9x9u8_v3(src_.data, width_ + 2*n, height_ + 2*n, width_ + 2*n, - dst_padded.data, width_ + 2*n, sigma_color, sigma_space, 0); + auto func = (knl == 5) ? fcvBilateralFilter5x5u8_v3 : + (knl == 7) ? fcvBilateralFilter7x7u8_v3 : + (knl == 9) ? fcvBilateralFilter9x9u8_v3 : + nullptr; + func(src_.data, width_ + 2 * n, height_ + 2 * n, width_ + 2 * n, + dst_padded.data, width_ + 2 * n, sigma_color, sigma_space, 0); cv::Mat dst_temp1 = dst_padded(cv::Rect(n, n, width_, height_)); cv::Mat dst_temp2 = dst(cv::Rect(0, range.start, width_, height_)); @@ -97,20 +88,21 @@ void bilateralFilter( InputArray _src, OutputArray _dst, int d, Size size = _src.size(); _dst.create( size, type ); Mat src = _src.getMat(); - Mat dst = _dst.getMat(); + Mat dst = _dst.getMat(); + + CV_Assert(src.data != dst.data); if( sigmaColor <= 0 ) + { sigmaColor = 1; + } if( sigmaSpace <= 0 ) + { sigmaSpace = 1; + } - int nStripes = 1; - if(src.rows/20 == 0) - nStripes = 1; - else - nStripes = (src.rows/20); - - cv::parallel_for_(cv::Range(0, src.rows), + int nStripes = (src.rows / 20 == 0) ? 1 : (src.rows / 20); + cv::parallel_for_(cv::Range(0, src.rows), FcvFilterLoop_Invoker(src, src.step, dst, dst.step, src.cols, src.rows, borderType, d, sigmaColor, sigmaSpace), nStripes); } diff --git a/modules/fastcv/src/blur.cpp b/modules/fastcv/src/blur.cpp new file mode 100644 index 00000000000..66058a37b5a --- /dev/null +++ b/modules/fastcv/src/blur.cpp @@ -0,0 +1,365 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
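For reference, a call into the parallelized bilateral filter above might look like the following; the window size and sigma values are taken from the ranges exercised by the perf tests, and src and dst must not share data (the implementation asserts this).

```cpp
#include <opencv2/fastcv.hpp>

void bilateralExample(const cv::Mat& src) // CV_8UC1
{
    cv::Mat dst;
    cv::fastcv::bilateralFilter(src, dst, /*d=*/9, /*sigmaColor=*/1.0f, /*sigmaSpace=*/1.0f);
}
```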
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +class FcvGaussianBlurLoop_Invoker : public ParallelLoopBody +{ + public: + + FcvGaussianBlurLoop_Invoker(const Mat& _src, Mat& _dst, int _ksize, fcvBorderType _fcvBorder, int _fcvBorderValue) : + ParallelLoopBody(), src(_src),dst(_dst), ksize(_ksize), fcvBorder(_fcvBorder), fcvBorderValue(_fcvBorderValue) + { + width = src.cols; + height = src.rows; + halfKsize = ksize / 2; + fcvFuncType = FCV_MAKETYPE(ksize, src.depth()); + } + + virtual void operator()(const Range& range) const CV_OVERRIDE + { + int topLines = 0; + int rangeHeight = range.end-range.start; + int paddedHeight = rangeHeight; + + if(range.start != 0) + { + topLines += halfKsize; + paddedHeight += halfKsize; + } + + if(range.end != height) + { + paddedHeight += halfKsize; + } + + const Mat srcPadded = src(Rect(0, range.start - topLines, width, paddedHeight)); + Mat dstPadded = Mat(paddedHeight, width, dst.depth()); + + if (fcvFuncType == FCV_MAKETYPE(3,CV_8U)) + fcvFilterGaussian3x3u8_v4(srcPadded.data, width, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step, fcvBorder, 0); + else if (fcvFuncType == FCV_MAKETYPE(5,CV_8U)) + fcvFilterGaussian5x5u8_v3(srcPadded.data, width, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step, fcvBorder, 0); + else if (fcvFuncType == FCV_MAKETYPE(5,CV_16S)) + fcvFilterGaussian5x5s16_v3((int16_t*)srcPadded.data, width, paddedHeight, srcPadded.step, (int16_t*)dstPadded.data, + dstPadded.step, fcvBorder, 0); + else if (fcvFuncType == FCV_MAKETYPE(5,CV_32S)) + fcvFilterGaussian5x5s32_v3((int32_t*)srcPadded.data, width, paddedHeight, srcPadded.step, (int32_t*)dstPadded.data, + dstPadded.step, fcvBorder, 0); + else if (fcvFuncType == FCV_MAKETYPE(11,CV_8U)) + fcvFilterGaussian11x11u8_v2(srcPadded.data, width, rangeHeight, srcPadded.step, dstPadded.data, dstPadded.step, fcvBorder); + + // Only copy center part back to output image and ignore the padded lines + Mat temp1 = dstPadded(Rect(0, topLines, width, rangeHeight)); + Mat temp2 = dst(Rect(0, range.start, width, rangeHeight)); + temp1.copyTo(temp2); + } + + private: + const Mat& src; + Mat& dst; + int width; + int height; + const int ksize; + int halfKsize; + int fcvFuncType; + fcvBorderType fcvBorder; + int fcvBorderValue; + + FcvGaussianBlurLoop_Invoker(const FcvGaussianBlurLoop_Invoker &); // = delete; + const FcvGaussianBlurLoop_Invoker& operator= (const FcvGaussianBlurLoop_Invoker &); // = delete; +}; + +void gaussianBlur(InputArray _src, OutputArray _dst, int kernel_size, bool blur_border) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && CV_MAT_CN(_src.type()) == 1); + + Size size = _src.size(); + int type = _src.type(); + _dst.create( size, type ); + + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + + int nThreads = getNumThreads(); + int nStripes = (nThreads > 1) ? ((src.rows > 60) ? 3 * nThreads : 1) : 1; + + fcvBorderType fcvBorder = blur_border ? 
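/* blur_border == true: zero-pad the values adjacent to the border;
   false: leave up to half the kernel width of border pixels undefined. */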
FASTCV_BORDER_ZERO_PADDING : FASTCV_BORDER_UNDEFINED; + + if (((type == CV_8UC1) && ((kernel_size == 3) || (kernel_size == 5) || (kernel_size == 11))) || + ((type == CV_16SC1) && (kernel_size == 5)) || + ((type == CV_32SC1) && (kernel_size == 5))) + { + parallel_for_(Range(0, src.rows), FcvGaussianBlurLoop_Invoker(src, dst, kernel_size, fcvBorder, 0), nStripes); + } + else + CV_Error(cv::Error::StsBadArg, cv::format("Src type %d, kernel size %d is not supported", type, kernel_size)); +} + +class FcvFilter2DLoop_Invoker : public ParallelLoopBody +{ + public: + + FcvFilter2DLoop_Invoker(const Mat& _src, Mat& _dst, const Mat& _kernel) : + ParallelLoopBody(), src(_src), dst(_dst), kernel(_kernel) + { + width = src.cols; + height = src.rows; + ksize = kernel.size().width; + halfKsize = ksize/2; + } + + virtual void operator()(const Range& range) const CV_OVERRIDE + { + int topLines = 0; + int rangeHeight = range.end-range.start; + int paddedHeight = rangeHeight; + + if(range.start >= halfKsize) + { + topLines += halfKsize; + paddedHeight += halfKsize; + } + + if(range.end <= height-halfKsize) + { + paddedHeight += halfKsize; + } + + const Mat srcPadded = src(Rect(0, range.start - topLines, width, paddedHeight)); + Mat dstPadded = Mat(paddedHeight, width, dst.depth()); + + if (dst.depth() == CV_8U) + fcvFilterCorrNxNu8((int8_t*)kernel.data, ksize, 0, srcPadded.data, width, paddedHeight, srcPadded.step, + dstPadded.data, dstPadded.step); + else if (dst.depth() == CV_16S) + fcvFilterCorrNxNu8s16((int8_t*)kernel.data, ksize, 0, srcPadded.data, width, paddedHeight, srcPadded.step, + (int16_t*)dstPadded.data, dstPadded.step); + else if (dst.depth() == CV_32F) + fcvFilterCorrNxNu8f32((float32_t*)kernel.data, ksize, srcPadded.data, width, paddedHeight, srcPadded.step, + (float32_t*)dstPadded.data, dstPadded.step); + + // Only copy center part back to output image and ignore the padded lines + Mat temp1 = dstPadded(Rect(0, topLines, width, rangeHeight)); + Mat temp2 = dst(Rect(0, range.start, width, rangeHeight)); + temp1.copyTo(temp2); + } + + private: + const Mat& src; + Mat& dst; + const Mat& kernel; + int width; + int height; + int ksize; + int halfKsize; + + FcvFilter2DLoop_Invoker(const FcvFilter2DLoop_Invoker &); // = delete; + const FcvFilter2DLoop_Invoker& operator= (const FcvFilter2DLoop_Invoker &); // = delete; +}; + +void filter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernel) +{ + INITIALIZATION_CHECK; + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + + Mat kernel = _kernel.getMat(); + Size ksize = kernel.size(); + CV_Assert(ksize.width == ksize.height); + CV_Assert(ksize.width % 2 == 1); + + _dst.create(_src.size(), ddepth); + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + + int nThreads = getNumThreads(); + int nStripes = (nThreads > 1) ? ((src.rows > 60) ? 
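/* Images taller than 60 rows are split into 3 stripes per worker thread;
   smaller images (or a single thread) run as one stripe. */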
3 * nThreads : 1) : 1; + + switch (ddepth) + { + case CV_8U: + case CV_16S: + { + CV_Assert(CV_MAT_DEPTH(kernel.type()) == CV_8S); + parallel_for_(Range(0, src.rows), FcvFilter2DLoop_Invoker(src, dst, kernel), nStripes); + break; + } + case CV_32F: + { + CV_Assert(CV_MAT_DEPTH(kernel.type()) == CV_32F); + parallel_for_(Range(0, src.rows), FcvFilter2DLoop_Invoker(src, dst, kernel), nStripes); + break; + } + default: + { + CV_Error(cv::Error::StsBadArg, cv::format("Kernel Size:%d, Dst type:%s is not supported", ksize.width, + depthToString(ddepth))); + break; + } + } +} + +class FcvSepFilter2DLoop_Invoker : public ParallelLoopBody +{ + public: + + FcvSepFilter2DLoop_Invoker(const Mat& _src, Mat& _dst, const Mat& _kernelX, const Mat& _kernelY) : + ParallelLoopBody(), src(_src), dst(_dst), kernelX(_kernelX), kernelY(_kernelY) + { + width = src.cols; + height = src.rows; + kernelXSize = kernelX.size().width; + kernelYSize = kernelY.size().width; + halfKsize = kernelXSize/2; + } + + virtual void operator()(const Range& range) const CV_OVERRIDE + { + int topLines = 0; + int rangeHeight = range.end-range.start; + int paddedHeight = rangeHeight; + + if(range.start >= halfKsize) + { + topLines += halfKsize; + paddedHeight += halfKsize; + } + + if(range.end <= height-halfKsize) + { + paddedHeight += halfKsize; + } + + const Mat srcPadded = src(Rect(0, range.start - topLines, width, paddedHeight)); + Mat dstPadded = Mat(paddedHeight, width, dst.depth()); + + switch (dst.depth()) + { + case CV_8U: + { + fcvFilterCorrSepMxNu8((int8_t*)kernelX.data, kernelXSize, (int8_t*)kernelY.data, kernelYSize, 0, srcPadded.data, + width, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step); + break; + } + case CV_16S: + { + std::vector tmpImage(width * (paddedHeight + kernelXSize - 1)); + switch (kernelXSize) + { + case 9: + { + fcvFilterCorrSep9x9s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight, + srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step); + break; + } + case 11: + { + fcvFilterCorrSep11x11s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight, + srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step); + break; + } + case 13: + { + fcvFilterCorrSep13x13s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight, + srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step); + break; + } + case 15: + { + fcvFilterCorrSep15x15s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight, + srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step); + break; + } + case 17: + { + fcvFilterCorrSep17x17s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight, + srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step); + break; + } + + default: + { + fcvFilterCorrSepNxNs16((int16_t*)kernelX.data, kernelXSize, (int16_t*)srcPadded.data, width, paddedHeight, + srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step); + break; + } + } + break; + } + default: + { + CV_Error(cv::Error::StsBadArg, cv::format("Dst type:%s is not supported", depthToString(dst.depth()))); + break; + } + } + + // Only copy center part back to output image and ignore the padded lines + Mat temp1 = dstPadded(Rect(0, topLines, width, rangeHeight)); + Mat temp2 = dst(Rect(0, range.start, width, rangeHeight)); + temp1.copyTo(temp2); + } + + private: + const Mat& src; + Mat& dst; + int width; + int height; + const Mat& kernelX; + 
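/* For CV_16S data the X and Y kernels must be identical (verified in sepFilter2D). */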
const Mat& kernelY; + int kernelXSize; + int kernelYSize; + int halfKsize; + + FcvSepFilter2DLoop_Invoker(const FcvSepFilter2DLoop_Invoker &); // = delete; + const FcvSepFilter2DLoop_Invoker& operator= (const FcvSepFilter2DLoop_Invoker &); // = delete; +}; + +void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY) +{ + INITIALIZATION_CHECK; + CV_Assert(!_src.empty() && (_src.type() == CV_8UC1 || _src.type() == CV_16SC1)); + _dst.create(_src.size(), ddepth); + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + Mat kernelX = _kernelX.getMat(); + Mat kernelY = _kernelY.getMat(); + + int nThreads = getNumThreads(); + int nStripes = (nThreads > 1) ? ((src.rows > 60) ? 3 * nThreads : 1) : 1; + + switch (ddepth) + { + case CV_8U: + { + cv::parallel_for_(cv::Range(0, src.rows), FcvSepFilter2DLoop_Invoker(src, dst, kernelX, kernelY), nStripes); + break; + } + case CV_16S: + { + CV_Assert(CV_MAT_DEPTH(src.type()) == CV_16S); + CV_Assert(kernelX.size() == kernelY.size()); + // kernalX and kernelY shhould be same. + Mat diff; + absdiff(kernelX, kernelY, diff); + CV_Assert(countNonZero(diff) == 0); + + cv::parallel_for_(cv::Range(0, src.rows), FcvSepFilter2DLoop_Invoker(src, dst, kernelX, kernelY), nStripes); + break; + } + default: + { + CV_Error(cv::Error::StsBadArg, cv::format("Dst type:%s is not supported", depthToString(ddepth))); + break; + } + } +} + +} // fastcv:: +} // cv:: \ No newline at end of file diff --git a/modules/fastcv/src/edges.cpp b/modules/fastcv/src/edges.cpp new file mode 100644 index 00000000000..ad90b9e71ee --- /dev/null +++ b/modules/fastcv/src/edges.cpp @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void sobel3x3u8(cv::InputArray _src, cv::OutputArray _dst, cv::OutputArray _dsty, int ddepth, bool normalization) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + + Size size = _src.size(); + _dst.create(size, ddepth); + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + if (_dsty.needed()) + { + _dsty.create(size, ddepth); + Mat dsty = _dsty.getMat(); + + switch(ddepth) + { + case CV_8S: + if (normalization) + fcvImageGradientSobelPlanars8_v2(src.data, src.cols, src.rows, src.step, (int8_t*)dst.data, + (int8_t*)dsty.data, dst.step); + else + CV_Error(cv::Error::StsBadArg, + cv::format("Depth: %d should do normalization, make sure the normalization parameter is true", ddepth)); + break; + case CV_16S: + if (normalization) + fcvImageGradientSobelPlanars16_v2(src.data, src.cols, src.rows, src.step, (int16_t*)dst.data, + (int16_t*)dsty.data, dst.step); + else + fcvImageGradientSobelPlanars16_v3(src.data, src.cols, src.rows, src.step, (int16_t*)dst.data, + (int16_t*)dsty.data, dst.step); + break; + case CV_32F: + if (normalization) + fcvImageGradientSobelPlanarf32_v2(src.data, src.cols, src.rows, src.step, (float32_t*)dst.data, + (float32_t*)dsty.data, dst.step); + else + fcvImageGradientSobelPlanarf32_v3(src.data, src.cols, src.rows, src.step, (float32_t*)dst.data, + (float32_t*)dsty.data, dst.step); + break; + default: + CV_Error(cv::Error::StsBadArg, cv::format("depth: %d is not supported", ddepth)); + break; + } + } + else + { + fcvFilterSobel3x3u8_v2(src.data, src.cols, src.rows, src.step, dst.data, dst.step); + } +} + +void sobel(cv::InputArray _src, cv::OutputArray _dx, cv::OutputArray _dy, int kernel_size, int borderType, int 
borderValue) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + Size size = _src.size(); + _dx.create( size, CV_16SC1); + _dy.create( size, CV_16SC1); + + Mat src = _src.getMat(); + Mat dx = _dx.getMat(); + Mat dy = _dy.getMat(); + fcvStatus status = FASTCV_SUCCESS; + + fcvBorderType fcvBorder; + + switch (borderType) + { + case cv::BorderTypes::BORDER_CONSTANT: + { + fcvBorder = fcvBorderType::FASTCV_BORDER_CONSTANT; + break; + } + case cv::BorderTypes::BORDER_REPLICATE: + { + fcvBorder = fcvBorderType::FASTCV_BORDER_REPLICATE; + break; + } + default: + { + CV_Error(cv::Error::StsBadArg, cv::format("Border type: %d is not supported", borderType)); + break; + } + } + + switch (kernel_size) + { + case 3: + status = fcvFilterSobel3x3u8s16(src.data, src.cols, src.rows, src.step, (int16_t*)dx.data, (int16_t*)dy.data, + dx.step, fcvBorder, borderValue); + break; + case 5: + status = fcvFilterSobel5x5u8s16(src.data, src.cols, src.rows, src.step, (int16_t*)dx.data, (int16_t*)dy.data, + dx.step, fcvBorder, borderValue); + break; + case 7: + status = fcvFilterSobel7x7u8s16(src.data, src.cols, src.rows, src.step, (int16_t*)dx.data, (int16_t*)dy.data, + dx.step, fcvBorder, borderValue); + break; + default: + CV_Error(cv::Error::StsBadArg, cv::format("Kernel size %d is not supported", kernel_size)); + break; + } + + if (status != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; + CV_Error( cv::Error::StsInternal, "FastCV error: " + s); + } +} + +} // fastcv:: +} // cv:: \ No newline at end of file diff --git a/modules/fastcv/src/ipptransform.cpp b/modules/fastcv/src/ipptransform.cpp new file mode 100644 index 00000000000..d5bfb259074 --- /dev/null +++ b/modules/fastcv/src/ipptransform.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void DCT(InputArray _src, OutputArray _dst) +{ + INITIALIZATION_CHECK; + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + CV_Assert(_src.cols() % 8 == 0); + CV_Assert(_src.step() % 8 == 0); + + Mat src = _src.getMat(); + + _dst.create(_src.rows(), _src.cols(), CV_16SC1); + // in case of fixed layout array we cannot fix this on our side, can only fail if false + CV_Assert(_dst.step() % 8 == 0); + + Mat dst = _dst.getMat(); + + fcvDCTu8(src.data, src.cols, src.rows, src.step, (short*)dst.data, dst.step); +} + +void IDCT(InputArray _src, OutputArray _dst) +{ + INITIALIZATION_CHECK; + CV_Assert(!_src.empty() && _src.type() == CV_16SC1); + CV_Assert(_src.cols() % 8 == 0); + CV_Assert(_src.step() % 8 == 0); + + Mat src = _src.getMat(); + + _dst.create(_src.rows(), _src.cols(), CV_8UC1); + // in case of fixed layout array we cannot fix this on our side, can only fail if false + CV_Assert(_dst.step() % 8 == 0); + + Mat dst = _dst.getMat(); + + fcvIDCTs16((const short*)src.data, src.cols, src.rows, src.step, dst.data, dst.step); +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/mser.cpp b/modules/fastcv/src/mser.cpp index ae8519313be..a44cecae073 100644 --- a/modules/fastcv/src/mser.cpp +++ b/modules/fastcv/src/mser.cpp @@ -8,56 +8,109 @@ namespace cv { namespace fastcv { -static void runMSER(InputArray _src, std::vector>& contours, std::vector& boundingBoxes, - std::vector& contourData, - bool useBoundingBoxes = true, - bool useContourData = true, - unsigned int numNeighbors = 4, - unsigned int delta = 2, - unsigned int minArea = 30, - unsigned int maxArea = 14400, - float maxVariation = 0.15f, - float minDiversity = 0.2f) +class MSER_Impl CV_FINAL : public cv::fastcv::FCVMSER { - INITIALIZATION_CHECK; +public: + explicit MSER_Impl(cv::Size imgSize, + int numNeighbors, + int delta, + int minArea, + int maxArea, + float maxVariation, + float minDiversity); - CV_Assert(!_src.empty() && _src.type() == CV_8UC1); - CV_Assert(_src.cols() > 50); - CV_Assert(_src.rows() > 5); + ~MSER_Impl() CV_OVERRIDE; - Mat src = _src.getMat(); + cv::Size getImgSize() CV_OVERRIDE { return imgSize; }; + int getNumNeighbors() CV_OVERRIDE { return numNeighbors; }; + int getDelta() CV_OVERRIDE { return delta; }; + int getMinArea() CV_OVERRIDE { return minArea; }; + int getMaxArea() CV_OVERRIDE { return maxArea; }; + float getMaxVariation() CV_OVERRIDE { return maxVariation; }; + float getMinDiversity() CV_OVERRIDE { return minDiversity; }; - CV_Assert(numNeighbors == 4 || numNeighbors == 8); - bool useNN4 = (numNeighbors == 4); + void detect(InputArray src, std::vector>& contours) CV_OVERRIDE; + void detect(InputArray src, std::vector>& contours, std::vector& boundingBoxes) CV_OVERRIDE; + void detect(InputArray src, std::vector>& contours, std::vector& boundingBoxes, + std::vector& contourData) CV_OVERRIDE; - bool usePointsArray = !useNN4; + void detectRegions(InputArray src, + std::vector>& contours, + std::vector& boundingBoxes, + std::vector& contourData, + bool useBoundingBoxes = true, + bool useContourData = true); + + cv::Size imgSize; + int numNeighbors; + int delta; + int minArea; + int maxArea; + float maxVariation; + float minDiversity; void *mserHandle; +}; - bool isInitOk = false; - if (useNN4) - { - isInitOk = fcvMserInit(src.cols, src.rows, delta, minArea, maxArea, maxVariation, minDiversity, &mserHandle); - } - else - { - isInitOk = fcvMserNN8Init(src.cols, src.rows, delta, minArea, 
maxArea, maxVariation, minDiversity, &mserHandle); - } - if (!isInitOk) +MSER_Impl::MSER_Impl(cv::Size _imgSize, + int _numNeighbors, + int _delta, + int _minArea, + int _maxArea, + float _maxVariation, + float _minDiversity) +{ + CV_Assert(_imgSize.width > 50); + CV_Assert(_imgSize.height > 5); + + CV_Assert(_numNeighbors == 4 || _numNeighbors == 8); + + INITIALIZATION_CHECK; + + this->imgSize = _imgSize; + this->numNeighbors = _numNeighbors; + this->delta = _delta; + this->minArea = _minArea; + this->maxArea = _maxArea; + this->maxVariation = _maxVariation; + this->minDiversity = _minDiversity; + + auto initFunc = (this->numNeighbors == 4) ? fcvMserInit : fcvMserNN8Init; + + if (!initFunc(this->imgSize.width, this->imgSize.height, this->delta, this->minArea, this->maxArea, + this->maxVariation, this->minDiversity, &this->mserHandle)) { CV_Error(cv::Error::StsInternal, "Failed to initialize MSER"); } +} + + +MSER_Impl::~MSER_Impl() +{ + fcvMserRelease(mserHandle); +} + + +void MSER_Impl::detectRegions(InputArray _src, std::vector>& contours, std::vector& boundingBoxes, + std::vector& contourData, bool useBoundingBoxes, bool useContourData) +{ + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + CV_Assert(_src.size() == this->imgSize); + + Mat src = _src.getMat(); + + bool usePointsArray = (this->numNeighbors == 8); //bufSize for pts and bboxes - const unsigned int maxContours = 16384; - unsigned int numContours; + const uint32_t maxContours = 16384; + uint32_t numContours; std::vector numPointsInContour(maxContours); std::vector rectArray; rectArray.resize(4 * maxContours); // xMin, xMax, yMax, yMin - unsigned int pointsArraySize = src.total() * 30; // Recommended typical size + uint32_t pointsArraySize = src.total() * 30; // Recommended typical size std::vector pointsArray; std::vector contourStartingPoints; uint32_t pathArraySize = src.total() * 4; // Recommended size @@ -76,7 +129,7 @@ static void runMSER(InputArray _src, std::vector>& contours, std::vector contourPolarity(maxContours); int mserRetcode = -1; - if (useNN4) + if (this->numNeighbors == 4) { mserRetcode = fcvMserExtu8_v3(mserHandle, src.data, src.cols, src.rows, src.step, maxContours, &numContours, @@ -170,32 +223,37 @@ static void runMSER(InputArray _src, std::vector>& contours, contourData.push_back(data); } } - - fcvMserRelease(mserHandle); } -void MSER(InputArray _src, std::vector> &contours, - unsigned int numNeighbors, unsigned int delta, unsigned int minArea, unsigned int maxArea, float maxVariation, float minDiversity) +void MSER_Impl::detect(InputArray src, std::vector> &contours) { std::vector boundingBoxes; std::vector contourData; - runMSER(_src, contours, boundingBoxes, contourData, false, false, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + this->detectRegions(src, contours, boundingBoxes, contourData, /*useBoundingBoxes*/ false, /*useContourData*/ false); } -void MSER(InputArray _src, std::vector>& contours, std::vector& boundingBoxes, - unsigned int numNeighbors, unsigned int delta, unsigned int minArea, unsigned int maxArea, float maxVariation, float minDiversity) +void MSER_Impl::detect(InputArray src, std::vector>& contours, std::vector& boundingBoxes) { std::vector contourData; - runMSER(_src, contours, boundingBoxes, contourData, true, false, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + this->detectRegions(src, contours, boundingBoxes, contourData, /*useBoundingBoxes*/ true, /*useContourData*/ false); +} + +void MSER_Impl::detect(InputArray src, 
std::vector>& contours, std::vector& boundingBoxes, + std::vector& contourData) +{ + this->detectRegions(src, contours, boundingBoxes, contourData, /*useBoundingBoxes*/ true, /*useContourData*/ true); } -void MSER(InputArray _src, std::vector>& contours, std::vector& boundingBoxes, std::vector& contourData, - unsigned int numNeighbors, unsigned int delta, unsigned int minArea, unsigned int maxArea, float maxVariation, float minDiversity) +Ptr FCVMSER::create(const cv::Size& imgSize, + int numNeighbors, + int delta, + int minArea, + int maxArea, + float maxVariation, + float minDiversity) { - runMSER(_src, contours, boundingBoxes, contourData, true, true, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + CV_Assert(numNeighbors > 0 && delta >= 0 && minArea >= 0 && maxArea >= 0); + return makePtr(imgSize, numNeighbors, delta, minArea, maxArea, maxVariation, minDiversity); } } // fastcv:: diff --git a/modules/fastcv/src/precomp.hpp b/modules/fastcv/src/precomp.hpp index d33cb25bafb..c2929d76cc1 100644 --- a/modules/fastcv/src/precomp.hpp +++ b/modules/fastcv/src/precomp.hpp @@ -28,6 +28,9 @@ namespace fastcv { CV_INSTRUMENT_REGION(); \ } +#define FCV_KernelSize_SHIFT 3 +#define FCV_MAKETYPE(ksize,depth) ((ksize< fcvStatusStrings = { { FASTCV_SUCCESS, "Success"}, diff --git a/modules/fastcv/src/pyramid.cpp b/modules/fastcv/src/pyramid.cpp new file mode 100644 index 00000000000..806c8e9970f --- /dev/null +++ b/modules/fastcv/src/pyramid.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void sobelPyramid(InputArrayOfArrays _pyr, OutputArrayOfArrays _dx, OutputArrayOfArrays _dy, int outType) +{ + INITIALIZATION_CHECK; + + CV_Assert(_pyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _pyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _pyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + CV_Assert(_dx.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _dx.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _dx.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + CV_Assert(_dy.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _dy.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _dy.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + + std::vector pyr; + _pyr.getMatVector(pyr); + size_t nLevels = pyr.size(); + + CV_Assert(!pyr.empty()); + + // this should be smaller I guess + CV_Assert(nLevels > 0 && nLevels < 16); + + for (size_t i = 0; i < nLevels; i++) + { + // fcvPyramidLeved does not support other cases + CV_Assert(pyr[i].isContinuous()); + CV_Assert(pyr[i].type() == CV_8UC1); + } + + CV_Assert(outType == CV_8S || outType == CV_16S || outType == CV_32F); + + std::vector lpyr; + for (size_t i = 0; i < nLevels; i++) + { + fcvPyramidLevel lev; + lev.width = pyr[i].cols; + lev.height = pyr[i].rows; + lev.ptr = pyr[i].data; + lpyr.push_back(lev); + } + + std::vector ldx(nLevels), ldy(nLevels); + int pyrElemSz = (outType == CV_8S ) ? 1 : + (outType == CV_16S) ? 2 : + (outType == CV_32F) ? 
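/* Bytes per output element; the 0 fallback is unreachable after the outType assertion above. */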
4 : 0; + int retCodex = fcvPyramidAllocate(ldx.data(), pyr[0].cols, pyr[0].rows, pyrElemSz, nLevels, 1); + if (retCodex != 0) + { + CV_Error(cv::Error::StsInternal, cv::format("fcvPyramidAllocate returned code %d", retCodex)); + } + int retCodey = fcvPyramidAllocate(ldy.data(), pyr[0].cols, pyr[0].rows, pyrElemSz, nLevels, 1); + if (retCodey != 0) + { + CV_Error(cv::Error::StsInternal, cv::format("fcvPyramidAllocate returned code %d", retCodey)); + } + + int returnCode = -1; + switch (outType) + { + case CV_8S: returnCode = fcvPyramidSobelGradientCreatei8 (lpyr.data(), ldx.data(), ldy.data(), nLevels); + break; + case CV_16S: returnCode = fcvPyramidSobelGradientCreatei16(lpyr.data(), ldx.data(), ldy.data(), nLevels); + break; + case CV_32F: returnCode = fcvPyramidSobelGradientCreatef32(lpyr.data(), ldx.data(), ldy.data(), nLevels); + break; + default: + break; + } + + if (returnCode != 0) + { + CV_Error(cv::Error::StsInternal, cv::format("FastCV returned code %d", returnCode)); + } + + // resize arrays of Mats + _dx.create(1, nLevels, /* type does not matter here */ -1, -1); + _dy.create(1, nLevels, /* type does not matter here */ -1, -1); + + for (size_t i = 0; i < nLevels; i++) + { + cv::Mat dx((int)ldx[i].height, (int)ldx[i].width, outType, (uchar*)ldx[i].ptr); + _dx.create(pyr[i].size(), outType, i); + dx.copyTo(_dx.getMat(i)); + + cv::Mat dy((int)ldy[i].height, (int)ldy[i].width, outType, (uchar*)ldy[i].ptr); + _dy.create(pyr[i].size(), outType, i); + dy.copyTo(_dy.getMat(i)); + } + + fcvPyramidDelete(ldx.data(), nLevels, 0); + fcvPyramidDelete(ldy.data(), nLevels, 0); +} + + +void buildPyramid(InputArray _src, OutputArrayOfArrays _pyr, int nLevels, bool scaleBy2, int borderType, uint8_t borderValue) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && (_src.type() == CV_8UC1 || _src.type() == CV_32FC1)); + CV_Assert(_src.step() % 8 == 0); + + cv::Mat src = _src.getMat(); + bool useFloat = src.depth() == CV_32F; + int bytesPerPixel = useFloat ? 4 : 1; + + CV_Assert(_pyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _pyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _pyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + + // this should be smaller I guess + CV_Assert(nLevels > 0 && nLevels < 16); + + if (useFloat && !scaleBy2) + { + CV_Error( cv::Error::StsBadArg, "ORB scale is not supported for float images (fcvPyramidCreatef32_v2)"); + } + + fcvPyramidScale scaleOption = scaleBy2 ? FASTCV_PYRAMID_SCALE_HALF : FASTCV_PYRAMID_SCALE_ORB; + fcvBorderType borderOption; + switch (borderType) + { + case cv::BORDER_REFLECT: borderOption = FASTCV_BORDER_REFLECT; break; + case cv::BORDER_REFLECT_101: borderOption = FASTCV_BORDER_REFLECT_V2; break; + case cv::BORDER_REPLICATE: borderOption = FASTCV_BORDER_REPLICATE; break; + default: borderOption = FASTCV_BORDER_UNDEFINED; break; + } + + std::vector lpyrSrc2(nLevels); + + int alignment = 8; + if (useFloat) + { + // use version 2 + CV_Assert(fcvPyramidAllocate_v2(lpyrSrc2.data(), src.cols, src.rows, src.step, bytesPerPixel, nLevels, 0) == 0); + CV_Assert(fcvPyramidCreatef32_v2((const float*)src.data, src.cols, src.rows, src.step, nLevels, lpyrSrc2.data()) == 0); + } + else + { + // use version 4 + fcvStatus statusAlloc = fcvPyramidAllocate_v3(lpyrSrc2.data(), src.cols, src.rows, src.step, + bytesPerPixel, alignment, nLevels, scaleOption, 0); + if (statusAlloc != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(statusAlloc) ? 
fcvStatusStrings.at(statusAlloc) : "unknown"; + CV_Error( cv::Error::StsInternal, "fcvPyramidAllocate_v3 error: " + s); + } + + fcvStatus statusPyr = fcvPyramidCreateu8_v4(src.data, src.cols, src.rows, src.step, nLevels, scaleOption, + lpyrSrc2.data(), borderOption, borderValue); + if (statusPyr != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(statusPyr) ? fcvStatusStrings.at(statusPyr) : "unknown"; + CV_Error( cv::Error::StsInternal, "fcvPyramidCreateu8_v4 error: " + s); + } + } + + // create vector + _pyr.create(nLevels, 1, src.type(), -1); + for (int i = 0; i < nLevels; i++) + { + cv::Mat m = cv::Mat((uint32_t)lpyrSrc2[i].height, (uint32_t)lpyrSrc2[i].width, + src.type(), (void*)lpyrSrc2[i].ptr, (size_t)lpyrSrc2[i].stride); + + _pyr.create(m.size(), m.type(), i); + m.copyTo(_pyr.getMat(i)); + } + + fcvPyramidDelete_v2(lpyrSrc2.data(), nLevels, 1); +} + +} // namespace fastcv +} // namespace cv diff --git a/modules/fastcv/src/remap.cpp b/modules/fastcv/src/remap.cpp index a0b4849ac72..933bfdc4273 100644 --- a/modules/fastcv/src/remap.cpp +++ b/modules/fastcv/src/remap.cpp @@ -10,8 +10,8 @@ namespace fastcv { class RemapParallel : public cv::ParallelLoopBody { public: - RemapParallel(int src_type, const uint8_t* src, unsigned int srcWidth, unsigned int srcHeight, unsigned int srcStride, uint8_t* dst, - unsigned int dstWidth, unsigned int dstHeight, unsigned int dstStride, const float32_t* __restrict mapX, + RemapParallel(int src_type, const uint8_t* src, uint32_t srcWidth, uint32_t srcHeight, uint32_t srcStride, uint8_t* dst, + uint32_t dstWidth, uint32_t dstHeight, uint32_t dstStride, const float32_t* __restrict mapX, const float32_t* __restrict mapY, uint32_t mapStride, fcvInterpolationType interpolation, uint8_t borderValue) : src_type_(src_type), src_(src), srcWidth_(srcWidth), srcHeight_(srcHeight), srcStride_(srcStride), dst_(dst), dstWidth_(dstWidth), dstHeight_(dstHeight), dstStride_(dstStride), mapX_(mapX), mapY_(mapY), mapStride_(mapStride), @@ -43,7 +43,7 @@ class RemapParallel : public cv::ParallelLoopBody { if(status!=FASTCV_SUCCESS) { - std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; + std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; CV_Error( cv::Error::StsInternal, "FastCV error: " + s); } } @@ -51,16 +51,16 @@ class RemapParallel : public cv::ParallelLoopBody { private: int src_type_; const uint8_t* src_; - unsigned int srcWidth_; - unsigned int srcHeight_; - unsigned int srcStride_; + uint32_t srcWidth_; + uint32_t srcHeight_; + uint32_t srcStride_; uint8_t* dst_; - unsigned int dstWidth_; - unsigned int dstHeight_; - unsigned int dstStride_; + uint32_t dstWidth_; + uint32_t dstHeight_; + uint32_t dstStride_; const float32_t* __restrict mapX_; const float32_t* __restrict mapY_; - unsigned int mapStride_; + uint32_t mapStride_; fcvInterpolationType fcvInterpolation_; uint8_t borderValue_; }; diff --git a/modules/fastcv/src/tracking.cpp b/modules/fastcv/src/tracking.cpp new file mode 100644 index 00000000000..778c73c323e --- /dev/null +++ b/modules/fastcv/src/tracking.cpp @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +static void trackOpticalFlowLKInternal(InputArray _src, InputArray _dst, + InputArrayOfArrays _srcPyr, InputArrayOfArrays _dstPyr, + InputArrayOfArrays _srcDxPyr, InputArrayOfArrays _srcDyPyr, + InputArray _ptsIn, OutputArray _ptsOut, InputArray _ptsEst, + OutputArray _statusVec, cv::Size winSize, + cv::TermCriteria termCriteria) +{ + INITIALIZATION_CHECK; + + CV_Assert(winSize.width % 2 == 1 && winSize.height % 2 == 1); + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + CV_Assert(!_dst.empty() && _dst.type() == CV_8UC1); + CV_Assert(_src.size() == _dst.size()); + CV_Assert(_src.step() % 8 == 0); + CV_Assert(_dst.step() == _src.step()); + + cv::Mat src = _src.getMat(), dst = _dst.getMat(); + + CV_Assert(_srcPyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _srcPyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _srcPyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + CV_Assert(_dstPyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _dstPyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _dstPyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + CV_Assert(_srcPyr.size() == _dstPyr.size()); + + int nLevels = _srcPyr.size().area(); + + std::vector srcPyr, dstPyr; + _srcPyr.getMatVector(srcPyr); + _dstPyr.getMatVector(dstPyr); + + cv::Size imSz = src.size(); + for (int i = 0; i < nLevels; i++) + { + const cv::Mat& s = srcPyr[i]; + const cv::Mat& d = dstPyr[i]; + + CV_Assert(!s.empty() && s.type() == CV_8UC1); + CV_Assert(!d.empty() && d.type() == CV_8UC1); + CV_Assert(s.size() == imSz); + CV_Assert(d.size() == imSz); + + imSz.width /= 2; imSz.height /= 2; + } + + bool useDxDy = !_srcDxPyr.empty() && !_srcDyPyr.empty(); + int version = useDxDy ? 
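/* Version 1 consumes caller-supplied Sobel gradient pyramids;
   version 3 computes the gradients internally. */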
1 : 3; + + std::vector srcDxPyr, srcDyPyr; + if (version == 1) + { + CV_Assert(_srcDxPyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _srcDxPyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _srcDxPyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + CV_Assert(_srcDyPyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _srcDyPyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _srcDyPyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + + CV_Assert(_srcDxPyr.size() == _srcDyPyr.size()); + _srcDxPyr.getMatVector(srcDxPyr); + _srcDyPyr.getMatVector(srcDyPyr); + + imSz = src.size(); + for (int i = 0; i < nLevels; i++) + { + const cv::Mat& dx = srcDxPyr[i]; + const cv::Mat& dy = srcDyPyr[i]; + + CV_Assert(!dx.empty() && dx.type() == CV_8SC1); + CV_Assert(!dy.empty() && dy.type() == CV_8SC1); + CV_Assert(dx.size() == imSz); + CV_Assert(dy.size() == imSz); + + imSz.width /= 2; imSz.height /= 2; + } + } + + std::vector lpyrSrc1, lpyrDst1, lpyrDxSrc, lpyrDySrc; + std::vector lpyrSrc2, lpyrDst2; + for (int i = 0; i < nLevels; i++) + { + fcvPyramidLevel lsrc1, ldst1; + fcvPyramidLevel_v2 lsrc2, ldst2; + lsrc1.width = srcPyr[i].cols; + lsrc1.height = srcPyr[i].rows; + lsrc1.ptr = srcPyr[i].data; + + lsrc2.width = srcPyr[i].cols; + lsrc2.height = srcPyr[i].rows; + lsrc2.stride = srcPyr[i].step; + lsrc2.ptr = srcPyr[i].data; + + ldst1.width = dstPyr[i].cols; + ldst1.height = dstPyr[i].rows; + ldst1.ptr = dstPyr[i].data; + ldst2.width = dstPyr[i].cols; + ldst2.height = dstPyr[i].rows; + ldst2.stride = dstPyr[i].step; + ldst2.ptr = dstPyr[i].data; + lpyrSrc1.push_back(lsrc1); lpyrDst1.push_back(ldst1); + lpyrSrc2.push_back(lsrc2); lpyrDst2.push_back(ldst2); + + if (version == 1) + { + fcvPyramidLevel ldx, ldy; + CV_Assert(srcDxPyr[i].isContinuous()); + ldx.width = srcDxPyr[i].cols; + ldx.height = srcDxPyr[i].rows; + ldx.ptr = srcDxPyr[i].data; + CV_Assert(srcDyPyr[i].isContinuous()); + ldy.width = srcDyPyr[i].cols; + ldy.height = srcDyPyr[i].rows; + ldy.ptr = srcDyPyr[i].data; + lpyrDxSrc.push_back(ldx); lpyrDySrc.push_back(ldy); + } + } + + CV_Assert(!_ptsIn.empty() && (_ptsIn.type() == CV_32FC1 || _ptsIn.type() == CV_32FC2)); + CV_Assert(_ptsIn.isContinuous()); + CV_Assert(_ptsIn.total() * _ptsIn.channels() % 2 == 0); + + cv::Mat ptsIn = _ptsIn.getMat(); + int nPts = ptsIn.total() * ptsIn.channels() / 2; + + bool useInitialEstimate; + cv::Mat ptsEst; + const float32_t* ptsEstData; + if (!_ptsEst.empty()) + { + CV_Assert(_ptsEst.type() == CV_32FC1 || _ptsEst.type() == CV_32FC2); + CV_Assert(_ptsEst.isContinuous()); + int estElems = _ptsEst.total() * _ptsEst.channels(); + CV_Assert(estElems % 2 == 0); + CV_Assert(estElems / 2 == nPts); + + ptsEst = _ptsEst.getMat(); + ptsEstData = (const float32_t*)ptsEst.data; + useInitialEstimate = true; + } + else + { + useInitialEstimate = false; + ptsEstData = (const float32_t*)ptsIn.data; + } + + CV_Assert(_ptsOut.needed()); + _ptsOut.create(1, nPts, CV_32FC2); + cv::Mat ptsOut = _ptsOut.getMat(); + + cv::Mat statusVec; + if (!_statusVec.empty()) + { + _statusVec.create(1, nPts, CV_32SC1); + statusVec = _statusVec.getMat(); + } + else + { + statusVec = cv::Mat(1, nPts, CV_32SC1); + } + + fcvTerminationCriteria termCrit; + if (termCriteria.type & cv::TermCriteria::COUNT) + { + if (termCriteria.type & cv::TermCriteria::EPS) + { + termCrit = FASTCV_TERM_CRITERIA_BOTH; + } + else + { + termCrit = FASTCV_TERM_CRITERIA_ITERATIONS; + } + } + else + { + if (termCriteria.type & cv::TermCriteria::EPS) + { + termCrit = FASTCV_TERM_CRITERIA_EPSILON; + } 
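/* Neither COUNT nor EPS was requested, so there is no usable termination criterion. */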
+ else + { + CV_Error(cv::Error::StsBadArg, "Incorrect termination criteria"); + } + } + int maxIterations = termCriteria.maxCount; + double maxEpsilon = termCriteria.epsilon; + + fcvStatus status = FASTCV_SUCCESS; + + if (version == 3) + { + status = fcvTrackLKOpticalFlowu8_v3(src.data, dst.data, src.cols, src.rows, src.step, + lpyrSrc2.data(), lpyrDst2.data(), + (const float32_t*)ptsIn.data, + ptsEstData, + (float32_t*)ptsOut.data, + (int32_t*)statusVec.data, + nPts, + winSize.width, winSize.height, + nLevels, + termCrit, maxIterations, maxEpsilon, + useInitialEstimate); + } + else // if (version == 1) + { + CV_Assert(src.isContinuous() && dst.isContinuous()); + // Obsolete parameters, set to 0 + float maxResidue = 0, minDisplacement = 0, minEigenvalue = 0; + int lightingNormalized = 0; + fcvTrackLKOpticalFlowu8(src.data, dst.data, src.cols, src.rows, + lpyrSrc1.data(), lpyrDst1.data(), + lpyrDxSrc.data(), lpyrDySrc.data(), + (const float32_t*)ptsIn.data, + (float32_t*)ptsOut.data, + (int32_t*)statusVec.data, + nPts, + winSize.width, winSize.height, + maxIterations, + nLevels, + maxResidue, minDisplacement, minEigenvalue, lightingNormalized); + } + + if (status != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; + CV_Error( cv::Error::StsInternal, "FastCV error: " + s); + } +} + + +void trackOpticalFlowLK(InputArray _src, InputArray _dst, + InputArrayOfArrays _srcPyr, InputArrayOfArrays _dstPyr, + InputArray _ptsIn, OutputArray _ptsOut, InputArray _ptsEst, + OutputArray _statusVec, cv::Size winSize, + cv::TermCriteria termCriteria) +{ + trackOpticalFlowLKInternal(_src, _dst, _srcPyr, _dstPyr, noArray(), noArray(), + _ptsIn, _ptsOut, _ptsEst, + _statusVec, winSize, + termCriteria); +} + +void trackOpticalFlowLK(InputArray _src, InputArray _dst, + InputArrayOfArrays _srcPyr, InputArrayOfArrays _dstPyr, + InputArrayOfArrays _srcDxPyr, InputArrayOfArrays _srcDyPyr, + InputArray _ptsIn, OutputArray _ptsOut, + OutputArray _statusVec, cv::Size winSize, int maxIterations) +{ + trackOpticalFlowLKInternal(_src, _dst, _srcPyr, _dstPyr, + _srcDxPyr, _srcDyPyr, + _ptsIn, _ptsOut, cv::noArray(), + _statusVec, winSize, + {cv::TermCriteria::MAX_ITER | cv::TermCriteria::EPS, + maxIterations, /* maxEpsilon */ 0.03f * 0.03f}); +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/warp.cpp b/modules/fastcv/src/warp.cpp new file mode 100644 index 00000000000..01f83bdf510 --- /dev/null +++ b/modules/fastcv/src/warp.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +class FcvWarpPerspectiveLoop_Invoker : public cv::ParallelLoopBody +{ + public: + + FcvWarpPerspectiveLoop_Invoker(InputArray _src1, InputArray _src2, OutputArray _dst1, OutputArray _dst2, InputArray _M0, + Size _dsize) : cv::ParallelLoopBody() + { + src1 = _src1.getMat(); + src2 = _src2.getMat(); + dsize = _dsize; + + _dst1.create(dsize, src1.type()); + _dst2.create(dsize, src2.type()); + dst1 = _dst1.getMat(); + dst2 = _dst2.getMat(); + + M = _M0.getMat(); + } + + virtual void operator()(const cv::Range& range) const CV_OVERRIDE + { + uchar* dst1_ptr = dst1.data + range.start*dst1.step; + uchar* dst2_ptr = dst2.data + range.start*dst2.step; + int rangeHeight = range.end - range.start; + + float rangeMatrix[9]; + rangeMatrix[0] = M.at(0,0); + rangeMatrix[1] = M.at(0,1); + rangeMatrix[2] = M.at(0,2)+range.start*M.at(0,1); + rangeMatrix[3] = M.at(1,0); + rangeMatrix[4] = M.at(1,1); + rangeMatrix[5] = M.at(1,2)+range.start*M.at(1,1); + rangeMatrix[6] = M.at(2,0); + rangeMatrix[7] = M.at(2,1); + rangeMatrix[8] = M.at(2,2)+range.start*M.at(2,1); + + fcv2PlaneWarpPerspectiveu8(src1.data, src2.data, src1.cols, src1.rows, src1.step, src2.step, dst1_ptr, dst2_ptr, + dsize.width, rangeHeight, dst1.step, dst2.step, rangeMatrix); + } + + private: + Mat src1; + Mat src2; + Mat dst1; + Mat dst2; + Mat M; + Size dsize; + + FcvWarpPerspectiveLoop_Invoker(const FcvWarpPerspectiveLoop_Invoker &); // = delete; + const FcvWarpPerspectiveLoop_Invoker& operator= (const FcvWarpPerspectiveLoop_Invoker &); // = delete; +}; + +void warpPerspective2Plane(InputArray _src1, InputArray _src2, OutputArray _dst1, OutputArray _dst2, InputArray _M0, + Size dsize) +{ + INITIALIZATION_CHECK; + CV_Assert(!_src1.empty() && _src1.type() == CV_8UC1); + CV_Assert(!_src2.empty() && _src2.type() == CV_8UC1); + CV_Assert(!_M0.empty()); + + cv::parallel_for_(cv::Range(0, dsize.height), + FcvWarpPerspectiveLoop_Invoker(_src1, _src2, _dst1, _dst2, _M0, dsize), 1); +} + +} // fastcv:: +} // cv:: \ No newline at end of file diff --git a/modules/fastcv/test/test_bilateral.cpp b/modules/fastcv/test/test_bilateral.cpp index 4f582c2ed37..5c883801a92 100644 --- a/modules/fastcv/test/test_bilateral.cpp +++ b/modules/fastcv/test/test_bilateral.cpp @@ -10,20 +10,20 @@ namespace opencv_test { namespace { typedef testing::TestWithParam> fcv_bilateralFilterTest; TEST_P(fcv_bilateralFilterTest, accuracy) -{ +{ cv::Size size = get<0>(GetParam()); int d = get<1>(GetParam()); double sigmaColor = get<2>(GetParam()); - double sigmaSpace = sigmaColor; - - RNG& rng = cv::theRNG(); + double sigmaSpace = sigmaColor; + + RNG& rng = cv::theRNG(); Mat src(size, CV_8UC1); cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); cv::Mat dst; - cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace); - + cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace); + EXPECT_FALSE(dst.empty()); } diff --git a/modules/fastcv/test/test_blur.cpp b/modules/fastcv/test/test_blur.cpp new file mode 100644 index 00000000000..1dde0261f28 --- /dev/null +++ b/modules/fastcv/test/test_blur.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef testing::TestWithParam> GaussianBlurTest; + +TEST_P(GaussianBlurTest, accuracy) +{ + cv::Size srcSize = get<0>(GetParam()); + int depth = get<1>(GetParam()); + int ksize = get<2>(GetParam()); + bool border = get<3>(GetParam()); + + // For some cases FastCV not support, so skip them + if((ksize!=5) && (depth!=CV_8U)) + return; + + cv::Mat src(srcSize, depth); + cv::Mat dst,ref; + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + + cv::fastcv::gaussianBlur(src, dst, ksize, border); + + if(depth == CV_32S) + src.convertTo(src, CV_32F); + cv::GaussianBlur(src,ref,Size(ksize,ksize),0,0,border); + ref.convertTo(ref,depth); + + cv::Mat difference; + cv::absdiff(dst, ref, difference); + + int num_diff_pixels = cv::countNonZero(difference); + + EXPECT_LT(num_diff_pixels, (src.rows+src.cols)*ksize); +} + +typedef testing::TestWithParam> Filter2DTest; + +TEST_P(Filter2DTest, accuracy) +{ + Size srcSize = get<0>(GetParam()); + int ddepth = get<1>(GetParam()); + int ksize = get<2>(GetParam()); + + cv::Mat src(srcSize, CV_8U); + cv::Mat kernel; + cv::Mat dst, ref; + + switch (ddepth) + { + case CV_8U: + case CV_16S: + { + kernel.create(ksize,ksize,CV_8S); + break; + } + case CV_32F: + { + kernel.create(ksize,ksize,CV_32F); + break; + } + default: + return; + } + + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + cvtest::randUni(rng, kernel, Scalar::all(INT8_MIN), Scalar::all(INT8_MAX)); + + cv::fastcv::filter2D(src, dst, ddepth, kernel); + cv::filter2D(src, ref, ddepth, kernel); + + cv::Mat difference; + dst.convertTo(dst, CV_8U); + ref.convertTo(ref, CV_8U); + cv::absdiff(dst, ref, difference); + + int num_diff_pixels = cv::countNonZero(difference); + EXPECT_LT(num_diff_pixels, (src.rows+src.cols)*ksize); +} + +typedef testing::TestWithParam> SepFilter2DTest; + +TEST_P(SepFilter2DTest, accuracy) +{ + Size srcSize = get<0>(GetParam()); + int ksize = get<1>(GetParam()); + + cv::Mat src(srcSize, CV_8U); + cv::Mat kernel(1,ksize,CV_8S); + cv::Mat dst,ref; + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + cvtest::randUni(rng, kernel, Scalar::all(INT8_MIN), Scalar::all(INT8_MAX)); + + cv::fastcv::sepFilter2D(src, dst, CV_8U, kernel, kernel); + cv::sepFilter2D(src,ref,CV_8U,kernel,kernel); + + cv::Mat difference; + cv::absdiff(dst, ref, difference); + int num_diff_pixels = cv::countNonZero(difference); + EXPECT_LT(num_diff_pixels, (src.rows+src.cols)*ksize); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, GaussianBlurTest, Combine( +/*image size*/ ::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), +/*image depth*/ ::testing::Values(CV_8U,CV_16S,CV_32S), +/*kernel size*/ ::testing::Values(3, 5), +/*blur border*/ ::testing::Values(true,false) +)); + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, Filter2DTest, Combine( +/*image sie*/ Values(perf::szVGA, perf::sz720p, perf::sz1080p), +/*dst depth*/ Values(CV_8U,CV_16S,CV_32F), +/*kernel size*/ Values(3, 5, 7, 9, 11) +)); + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, SepFilter2DTest, Combine( +/*image size*/ Values(perf::szVGA, perf::sz720p, perf::sz1080p), +/*kernel size*/ Values(3, 5, 7, 9, 11) +)); + +}} // namespaces opencv_test, :: \ No newline at end of file diff --git a/modules/fastcv/test/test_edges.cpp b/modules/fastcv/test/test_edges.cpp new file mode 100644 index 00000000000..e1e1576ef15 --- /dev/null +++ 
b/modules/fastcv/test/test_edges.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef testing::TestWithParam> Sobel; +typedef testing::TestWithParam> Sobel3x3u8; + +TEST_P(Sobel,accuracy) +{ + Size srcSize = get<0>(GetParam()); + int ksize = get<1>(GetParam()); + int border = get<2>(GetParam()); + int borderValue = get<3>(GetParam()); + + cv::Mat dx, dy, src(srcSize, CV_8U), refx, refy; + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + cv::fastcv::sobel(src, dx, dy, ksize, border, borderValue); + + cv::Sobel(src, refx, CV_16S, 1, 0, ksize, 1.0, 0.0, border); + cv::Sobel(src, refy, CV_16S, 0, 1, ksize, 1.0, 0.0, border); + + cv::Mat difference_x, difference_y; + cv::absdiff(dx, refx, difference_x); + cv::absdiff(dy, refy, difference_y); + + int num_diff_pixels_x = cv::countNonZero(difference_x); + int num_diff_pixels_y = cv::countNonZero(difference_y); + EXPECT_LT(num_diff_pixels_x, src.size().area()*0.1); + EXPECT_LT(num_diff_pixels_y, src.size().area()*0.1); +} + +TEST_P(Sobel3x3u8,accuracy) +{ + Size srcSize = get<0>(GetParam()); + int ddepth = get<1>(GetParam()); + + cv::Mat dx, dy, src(srcSize, CV_8U), refx, refy; + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + + cv::fastcv::sobel3x3u8(src, dx, dy, ddepth, 0); + cv::Sobel(src, refx, ddepth, 1, 0); + cv::Sobel(src, refy, ddepth, 0, 1); + + cv::Mat difference_x, difference_y; + cv::absdiff(dx, refx, difference_x); + cv::absdiff(dy, refy, difference_y); + + int num_diff_pixels_x = cv::countNonZero(difference_x); + int num_diff_pixels_y = cv::countNonZero(difference_y); + EXPECT_LT(num_diff_pixels_x, src.size().area()*0.1); + EXPECT_LT(num_diff_pixels_y, src.size().area()*0.1); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, Sobel, Combine( +/*image size*/ Values(perf::szVGA, perf::sz720p, perf::sz1080p), +/*kernel size*/ Values(3,5,7), +/*border*/ Values(BORDER_CONSTANT, BORDER_REPLICATE), +/*border value*/ Values(0) +)); + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, Sobel3x3u8, Combine( +/*image size*/ Values(perf::szVGA, perf::sz720p, perf::sz1080p), +/*dst depth*/ Values(CV_16S, CV_32F) +)); + +} +} diff --git a/modules/fastcv/test/test_fft.cpp b/modules/fastcv/test/test_fft.cpp index 18b53d88ba0..ef70f8e12f5 100644 --- a/modules/fastcv/test/test_fft.cpp +++ b/modules/fastcv/test/test_fft.cpp @@ -39,7 +39,6 @@ TEST_P(FFTExtTest, inverse) RNG& rng = cv::theRNG(); Mat src(size, CV_8UC1); cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); - //cv::Mat src = imread(cvtest::findDataFile("cv/shared/lena.png"), IMREAD_GRAYSCALE); Mat srcFloat; src.convertTo(srcFloat, CV_32F); diff --git a/modules/fastcv/test/test_ipptransform.cpp b/modules/fastcv/test/test_ipptransform.cpp new file mode 100644 index 00000000000..66ff8cbd59d --- /dev/null +++ b/modules/fastcv/test/test_ipptransform.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +class DCTExtTest : public ::testing::TestWithParam {}; + +TEST_P(DCTExtTest, forward) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + Mat srcFloat; + src.convertTo(srcFloat, CV_32F); + + Mat dst, ref; + cv::fastcv::DCT(src, dst); + + cv::dct(srcFloat, ref); + + Mat dstFloat; + ref.convertTo(dstFloat, CV_32F); + + double normInf = cvtest::norm(dstFloat, ref, cv::NORM_INF); + double normL2 = cvtest::norm(dstFloat, ref, cv::NORM_L2) / dst.size().area(); + + if (cvtest::debugLevel > 0) + { + std::cout << "dst:" << std::endl << dst << std::endl; + std::cout << "ref:" << std::endl << ref << std::endl; + } + + EXPECT_EQ(normInf, 0); + EXPECT_EQ(normL2, 0); +} + +TEST_P(DCTExtTest, inverse) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + Mat srcFloat; + src.convertTo(srcFloat, CV_32F); + + Mat fwd, back; + cv::fastcv::DCT(src, fwd); + cv::fastcv::IDCT(fwd, back); + Mat backFloat; + back.convertTo(backFloat, CV_32F); + + Mat fwdRef, backRef; + cv::dct(srcFloat, fwdRef); + cv::idct(fwdRef, backRef); + + double normInf = cvtest::norm(backFloat, backRef, cv::NORM_INF); + double normL2 = cvtest::norm(backFloat, backRef, cv::NORM_L2) / src.size().area(); + + if (cvtest::debugLevel > 0) + { + std::cout << "src:" << std::endl << src << std::endl; + std::cout << "back:" << std::endl << back << std::endl; + std::cout << "backRef:" << std::endl << backRef << std::endl; + } + + EXPECT_LE(normInf, 7.00005); + EXPECT_LT(normL2, 0.13); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, DCTExtTest, ::testing::Values(Size(8, 8), Size(128, 128), Size(32, 256), Size(512, 512))); + +}} // namespaces opencv_test, :: diff --git a/modules/fastcv/test/test_moments.cpp b/modules/fastcv/test/test_moments.cpp index 1d23156dcf2..d4ef89f98db 100644 --- a/modules/fastcv/test/test_moments.cpp +++ b/modules/fastcv/test/test_moments.cpp @@ -3,8 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "opencv2/ts.hpp" -#include "opencv2/fastcv/moments.hpp" +#include "test_precomp.hpp" namespace opencv_test { namespace { @@ -37,7 +36,7 @@ TEST_P(fcv_momentsTest, accuracy) INSTANTIATE_TEST_CASE_P(/*nothing*/, fcv_momentsTest, Combine( Values(false, true), Values(TYPICAL_MAT_SIZES), - Values(CV_8UC1, CV_32SC1, CV_32FC1) + Values(CV_8UC1, CV_32SC1, CV_32FC1) )); } diff --git a/modules/fastcv/test/test_mser.cpp b/modules/fastcv/test/test_mser.cpp index ebacbad32f3..d3cb35bf47e 100644 --- a/modules/fastcv/test/test_mser.cpp +++ b/modules/fastcv/test/test_mser.cpp @@ -23,32 +23,32 @@ TEST_P(MSERTest, accuracy) cv::Mat src = imread(cvtest::findDataFile(imgPath), cv::IMREAD_GRAYSCALE); - unsigned int delta = 2; - unsigned int minArea = 256; - unsigned int maxArea = (int)src.total()/4; + uint32_t delta = 2; + uint32_t minArea = 256; + uint32_t maxArea = (int)src.total()/4; float maxVariation = 0.15f; float minDiversity = 0.2f; std::vector> contours; std::vector bboxes; - std::vector contourData; + std::vector contourData; + cv::Ptr mser; + mser = cv::fastcv::FCVMSER::create(src.size(), numNeighbors, delta, minArea, maxArea, + maxVariation, minDiversity); if (useBboxes) { if (useContourData) { - cv::fastcv::MSER(src, contours, bboxes, contourData, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + 
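/* Three-output overload: contours, bounding boxes and per-contour data. */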
mser->detect(src, contours, bboxes, contourData); } else { - cv::fastcv::MSER(src, contours, bboxes, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + mser->detect(src, contours, bboxes); } } else { - cv::fastcv::MSER(src, contours, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + mser->detect(src, contours); } Rect imgRect(0, 0, src.cols, src.rows); @@ -175,4 +175,4 @@ INSTANTIATE_TEST_CASE_P(FastCV_Extension, MSERTest, ::testing::Values("cv/shared/baboon.png", "cv/mser/puzzle.png") ) ); -}} // namespaces opencv_test, :: \ No newline at end of file +}} // namespaces opencv_test, :: diff --git a/modules/fastcv/test/test_precomp.hpp b/modules/fastcv/test/test_precomp.hpp index 1b4c23eca30..7ff8ed78049 100644 --- a/modules/fastcv/test/test_precomp.hpp +++ b/modules/fastcv/test/test_precomp.hpp @@ -4,6 +4,7 @@ */ #include +#include #include #include diff --git a/modules/fastcv/test/test_pyramid.cpp b/modules/fastcv/test/test_pyramid.cpp new file mode 100644 index 00000000000..29acf9ab9a7 --- /dev/null +++ b/modules/fastcv/test/test_pyramid.cpp @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef std::tuple PyramidTestParams; +class PyramidTest : public ::testing::TestWithParam { }; + +TEST_P(PyramidTest, accuracy) +{ + auto par = GetParam(); + + bool useFloat = std::get<0>(par); + int nLevels = std::get<1>(par); + bool scaleBy2 = std::get<2>(par); + + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + if (useFloat) + { + cv::Mat f; + src.convertTo(f, CV_32F); + src = f; + } + + std::vector pyr; + cv::fastcv::buildPyramid(src, pyr, nLevels, scaleBy2); + + ASSERT_EQ(pyr.size(), (size_t)nLevels); + + std::vector refPyr; + if (scaleBy2) + { + cv::buildPyramid(src, refPyr, nLevels - 1); + } + else // ORB downscaling + { + for (int i = 0; i < nLevels; i++) + { + // we don't know how exactly the bit-accurate size is calculated + cv::Mat level; + cv::resize(src, level, pyr[i].size(), 0, 0, cv::INTER_AREA); + refPyr.push_back(level); + } + } + + for (int i = 0; i < nLevels; i++) + { + cv::Mat ref = refPyr[i]; + cv::Mat m = pyr[i]; + ASSERT_EQ(m.size(), ref.size()); + double l2diff = cv::norm(m, ref, cv::NORM_L2); + double linfdiff = cv::norm(m, ref, cv::NORM_INF); + + double l2Thresh = scaleBy2 ? 178.0 : 5216.0; + double linfThresh = scaleBy2 ? 16.0 : 116.0; + EXPECT_LE(l2diff, l2Thresh); + EXPECT_LE(linfdiff, linfThresh); + } + + if (cvtest::debugLevel > 0) + { + for (int i = 0; i < nLevels; i++) + { + char tchar = useFloat ? 'f' : 'i'; + std::string scaleStr = scaleBy2 ? 
"x2" : "xORB"; + cv::imwrite(cv::format("pyr_diff_%c_%d_%s_l%d.png", tchar, nLevels, scaleStr.c_str(), i), cv::abs(pyr[i] - refPyr[i])); + } + } +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, PyramidTest, + // useFloat, nLevels, scaleBy2 + ::testing::Values( + PyramidTestParams { true, 2, true}, PyramidTestParams { true, 3, true}, PyramidTestParams { true, 4, true}, + PyramidTestParams {false, 2, true}, PyramidTestParams {false, 3, true}, PyramidTestParams {false, 4, true}, + PyramidTestParams {false, 2, false}, PyramidTestParams {false, 3, false}, PyramidTestParams {false, 4, false} + )); + +typedef std::tuple SobelPyramidTestParams; +class SobelPyramidTest : public ::testing::TestWithParam {}; + +TEST_P(SobelPyramidTest, accuracy) +{ + auto p = GetParam(); + int type = std::get<0>(p); + size_t nLevels = std::get<1>(p); + + // NOTE: test files should be manually loaded to folder on a device, for example like this: + // adb push fastcv/misc/bilateral_recursive/ /sdcard/testdata/fastcv/bilateral/ + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + std::vector pyr; + cv::fastcv::buildPyramid(src, pyr, nLevels); + + std::vector pyrDx, pyrDy; + cv::fastcv::sobelPyramid(pyr, pyrDx, pyrDy, type); + + ASSERT_EQ(pyrDx.size(), nLevels); + ASSERT_EQ(pyrDy.size(), nLevels); + + for (size_t i = 0; i < nLevels; i++) + { + ASSERT_EQ(pyrDx[i].type(), type); + ASSERT_EQ(pyrDx[i].size(), pyr[i].size()); + ASSERT_EQ(pyrDy[i].type(), type); + ASSERT_EQ(pyrDy[i].size(), pyr[i].size()); + } + + std::vector refPyrDx(nLevels), refPyrDy(nLevels); + for (size_t i = 0; i < nLevels; i++) + { + int stype = (type == CV_8S) ? CV_16S : type; + cv::Mat dx, dy; + cv::Sobel(pyr[i], dx, stype, 1, 0); + cv::Sobel(pyr[i], dy, stype, 0, 1); + dx.convertTo(refPyrDx[i], type, 1.0/8.0, 0.0); + dy.convertTo(refPyrDy[i], type, 1.0/8.0, 0.0); + } + + for (size_t i = 0; i < nLevels; i++) + { + cv::Mat ref, dst; + double normInf, normL2; + ref = refPyrDx[i]; + dst = pyrDx[i]; + normInf = cvtest::norm(dst, ref, cv::NORM_INF); + normL2 = cvtest::norm(dst, ref, cv::NORM_L2) / dst.total(); + + EXPECT_LE(normInf, 76.1); + EXPECT_LT(normL2, 0.4); + + ref = refPyrDy[i]; + dst = pyrDy[i]; + normInf = cvtest::norm(dst, ref, cv::NORM_INF); + normL2 = cvtest::norm(dst, ref, cv::NORM_L2) / dst.total(); + + EXPECT_LE(normInf, 66.6); + EXPECT_LT(normL2, 0.4); + } + + if (cvtest::debugLevel > 0) + { + std::map typeToString = + { + {CV_8U, "8u"}, {CV_8S, "8s"}, {CV_16U, "16u"}, {CV_16S, "16s"}, + {CV_32S, "32s"}, {CV_32F, "32f"}, {CV_64F, "64f"}, {CV_16F, "16f"}, + }; + + for (size_t i = 0; i < nLevels; i++) + { + cv::imwrite(cv::format("pyr_l%zu.png", i), pyr[i]); + cv::imwrite(cv::format("pyr_sobel_x_t%s_l%zu.png", typeToString.at(type).c_str(), i), pyrDx[i] + 128); + cv::imwrite(cv::format("pyr_sobel_y_t%s_l%zu.png", typeToString.at(type).c_str(), i), pyrDy[i] + 128); + + cv::imwrite(cv::format("ref_pyr_sobel_x_t%s_l%zu.png", typeToString.at(type).c_str(), i), refPyrDx[i] + 128); + cv::imwrite(cv::format("ref_pyr_sobel_y_t%s_l%zu.png", typeToString.at(type).c_str(), i), refPyrDy[i] + 128); + } + } +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, SobelPyramidTest, ::testing::Combine( + ::testing::Values(CV_8S, CV_16S, CV_32F), // depth + ::testing::Values(3, 6))); // nLevels + + +}} // namespaces opencv_test, :: diff --git a/modules/fastcv/test/test_remap.cpp b/modules/fastcv/test/test_remap.cpp index 6fa5ccdabfd..28501534a5d 100644 --- a/modules/fastcv/test/test_remap.cpp +++ 
@@ -3,8 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
 */
 
-#include "opencv2/ts.hpp"
-#include "opencv2/fastcv/remap.hpp"
+#include "test_precomp.hpp"
 
 namespace opencv_test { namespace {
 
@@ -77,12 +76,8 @@ TEST_P(RemapTest, accuracy)
     cv::Mat remapOpenCV;
     cv::remap(src_converted, remapOpenCV, map_x, map_y, interpolation);
 
-    cv::Mat diffImage;
-    cv::absdiff(dst, remapOpenCV, diffImage);
-    // Calculate the maximum difference
-    double maxVal=0.0;
-    cv::minMaxLoc(diffImage, nullptr, &maxVal);
+    double maxVal = cv::norm(dst, remapOpenCV, cv::NORM_INF);
 
     // Assert if the difference is acceptable (max difference should be less than 10)
     CV_Assert(maxVal < 10 && "Difference between images is too high!");
 
@@ -105,12 +100,8 @@ TEST_P(RemapTestRGBA, accuracy)
     cv::Mat remapOpenCV;
     cv::remap(src_converted, remapOpenCV, map_x, map_y, interpolation);
 
-    cv::Mat diffImage;
-    cv::absdiff(dst, remapOpenCV, diffImage);
-    // Calculate the maximum difference
-    double maxVal=0.0;
-    cv::minMaxLoc(diffImage, nullptr, &maxVal);
+    double maxVal = cv::norm(dst, remapOpenCV, cv::NORM_INF);
 
     // Assert if the difference is acceptable (max difference should be less than 10)
     CV_Assert(maxVal < 10 && "Difference between images is too high!");
 
diff --git a/modules/fastcv/test/test_scale.cpp b/modules/fastcv/test/test_scale.cpp
index 394fd907cc9..b8e84218ed8 100644
--- a/modules/fastcv/test/test_scale.cpp
+++ b/modules/fastcv/test/test_scale.cpp
@@ -3,8 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
 */
 
-#include "opencv2/ts.hpp"
-#include "opencv2/fastcv/scale.hpp"
+#include "test_precomp.hpp"
 
 namespace opencv_test { namespace {
 
@@ -25,12 +24,8 @@ TEST(resizeDownBy2, accuracy)
     cv::Mat resizedImageOpenCV;
     cv::resize(inputImage, resizedImageOpenCV, cv::Size(inputImage.cols / 2, inputImage.rows / 2), 0, 0, INTER_AREA);
 
-    cv::Mat diffImage;
-    cv::absdiff(resized_image, resizedImageOpenCV, diffImage);
-    // Calculate the maximum difference
-    double maxVal=0.0;
-    cv::minMaxLoc(diffImage, nullptr, &maxVal);
+    double maxVal = cv::norm(resized_image, resizedImageOpenCV, cv::NORM_INF);
 
     // Assert if the difference is acceptable (max difference should be less than 10)
     CV_Assert(maxVal < 10 && "Difference between images is too high!");
 
@@ -50,12 +45,8 @@ TEST(resizeDownBy4, accuracy)
     cv::Mat resizedImageOpenCV;
     cv::resize(inputImage, resizedImageOpenCV, cv::Size(inputImage.cols / 4, inputImage.rows / 4), 0, 0, INTER_AREA);
 
-    cv::Mat diffImage;
-    cv::absdiff(resized_image, resizedImageOpenCV, diffImage);
-    // Calculate the maximum difference
-    double maxVal=0.0;
-    cv::minMaxLoc(diffImage, nullptr, &maxVal);
+    double maxVal = cv::norm(resized_image, resizedImageOpenCV, cv::NORM_INF);
 
     // Assert if the difference is acceptable (max difference should be less than 10)
     CV_Assert(maxVal < 10 && "Difference between images is too high!");
 
@@ -79,7 +70,7 @@ TEST_P(ResizeBy2Test, ResizeBy2) {
     EXPECT_EQ(resized_image.size().height, size.height * 0.5);
 }
 
-TEST_P(ResizeBy4Test, ResizeBy2) {
+TEST_P(ResizeBy4Test, ResizeBy4) {
     //Size size = get<0>(GetParam());
     Size size = GetParam();
 
@@ -89,7 +80,7 @@ TEST_P(ResizeBy4Test, ResizeBy2) {
     Size dsize;
     cv::Mat resized_image;
 
-    // Resize the image by a factor of 2
+    // Resize the image by a factor of 4
    cv::fastcv::resizeDownBy4(inputImage, resized_image);
 
     // Check if the output size is correct
@@ -98,14 +89,14 @@ TEST_P(ResizeBy4Test, ResizeBy2) {
 }
 
 INSTANTIATE_TEST_CASE_P(
-    ResizeTests,
-    ResizeBy2Test,
+    ResizeTests,
+    ResizeBy2Test,
     ::testing::Values(cv::Size(640, 480),
                       cv::Size(1280, 720),
                       cv::Size(1920, 1080)
                      ));
 
 INSTANTIATE_TEST_CASE_P(
-    ResizeTests,
-    ResizeBy4Test,
+    ResizeTests,
+    ResizeBy4Test,
     ::testing::Values(cv::Size(640, 480),
                       cv::Size(1280, 720),
                       cv::Size(1920, 1080)
                      ));
diff --git a/modules/fastcv/test/test_smooth.cpp b/modules/fastcv/test/test_smooth.cpp
index 0b73baa5cd5..47c85152ebf 100644
--- a/modules/fastcv/test/test_smooth.cpp
+++ b/modules/fastcv/test/test_smooth.cpp
@@ -39,7 +39,14 @@ TEST_P(BilateralRecursiveTest, accuracy)
 }
 
 INSTANTIATE_TEST_CASE_P(FastCV_Extension, BilateralRecursiveTest,
-                        ::testing::Combine(::testing::Values(0.01f, 0.03f, 0.1f, 1.f, 5.f),
-                                           ::testing::Values(0.01f, 0.05f, 0.1f, 1.f, 5.f)));
+                        ::testing::Values(
+                            BilateralTestParams {0.01f, 1.00f},
+                            BilateralTestParams {0.10f, 0.01f},
+                            BilateralTestParams {1.00f, 0.01f},
+                            BilateralTestParams {1.00f, 1.00f},
+                            BilateralTestParams {5.00f, 0.01f},
+                            BilateralTestParams {5.00f, 0.10f},
+                            BilateralTestParams {5.00f, 5.00f}
+                        ));
 
 }} // namespaces opencv_test, ::
diff --git a/modules/fastcv/test/test_tracking.cpp b/modules/fastcv/test/test_tracking.cpp
new file mode 100644
index 00000000000..7833c71b1ec
--- /dev/null
+++ b/modules/fastcv/test/test_tracking.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+typedef std::tuple<int, bool, bool, bool> TrackingTestParams;
+class TrackingTest : public ::testing::TestWithParam<TrackingTestParams> {};
+
+TEST_P(TrackingTest, accuracy)
+{
+    auto par = GetParam();
+
+    int winSz = std::get<0>(par);
+    bool useSobelPyramid = std::get<1>(par);
+    bool useFastCvPyramids = std::get<2>(par);
+    bool useInitialEstimate = std::get<3>(par);
+
+    cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE);
+
+    double ang = 5.0 * CV_PI / 180.0;
+    cv::Matx33d tr = {
+        cos(ang), -sin(ang), 1,
+        sin(ang),  cos(ang), 2,
+        0,         0,        1
+    };
+    cv::Matx33d orig {
+        1, 0, -(double)src.cols / 2,
+        0, 1, -(double)src.rows / 2,
+        0, 0, 1
+    };
+    cv::Matx33d back {
+        1, 0, (double)src.cols / 2,
+        0, 1, (double)src.rows / 2,
+        0, 0, 1
+    };
+    cv::Matx23d trans = (back * tr * orig).get_minor<2, 3>(0, 0);
+
+    cv::Mat dst;
+    cv::warpAffine(src, dst, trans, src.size());
+
+    int nLevels = 4;
+    std::vector<cv::Mat> srcPyr, dstPyr;
+
+    if (useFastCvPyramids)
+    {
+        cv::fastcv::buildPyramid(src, srcPyr, nLevels);
+        cv::fastcv::buildPyramid(dst, dstPyr, nLevels);
+    }
+    else
+    {
+        cv::buildPyramid(src, srcPyr, nLevels - 1);
+        cv::buildPyramid(dst, dstPyr, nLevels - 1);
+    }
+
+    cv::Matx23f transf = trans;
+    int nPts = 32;
+    std::vector<cv::Point2f> ptsIn, ptsOut, ptsEst, ptsExpected;
+    for (int i = 0; i < nPts; i++)
+    {
+        cv::Point2f p { (((float)cv::theRNG())*0.5f + 0.25f) * src.cols,
+                        (((float)cv::theRNG())*0.5f + 0.25f) * src.rows };
+        ptsIn.push_back(p);
+        ptsExpected.push_back(transf * cv::Vec3f(p.x, p.y, 1.0));
+        ptsOut.push_back({ });
+        ptsEst.push_back(p);
+    }
+
+    std::vector<int32_t> statusVec(nPts);
+
+    cv::TermCriteria termCrit;
+    termCrit.type = cv::TermCriteria::COUNT | cv::TermCriteria::EPS;
+    termCrit.maxCount = 7;
+    termCrit.epsilon = 0.03f * 0.03f;
+
+    if (useSobelPyramid)
+    {
+        std::vector<cv::Mat> srcDxPyr, srcDyPyr;
+        cv::fastcv::sobelPyramid(srcPyr, srcDxPyr, srcDyPyr, CV_8S);
+        cv::fastcv::trackOpticalFlowLK(src, dst, srcPyr, dstPyr, srcDxPyr, srcDyPyr,
+                                       ptsIn, ptsOut, statusVec, {winSz, winSz});
+    }
+    else
+    {
+        cv::fastcv::trackOpticalFlowLK(src, dst, srcPyr, dstPyr, ptsIn, ptsOut, (useInitialEstimate ? ptsEst : noArray()),
+                                       statusVec, {winSz, winSz}, termCrit);
+    }
+
+    std::vector<cv::Point2f> ocvPtsOut;
+    std::vector<uchar> ocvStatusVec;
+    std::vector<float> ocvErrVec;
+    cv::calcOpticalFlowPyrLK(src, dst, ptsIn, ocvPtsOut, ocvStatusVec, ocvErrVec, {winSz, winSz}, nLevels - 1, termCrit);
+
+    cv::Mat refStatusVec(nPts, 1, CV_32S, Scalar::all(1));
+    cv::Mat ocvStatusVecInt;
+    cv::Mat(ocvStatusVec).convertTo(ocvStatusVecInt, CV_32S);
+
+    double statusNormOcv = cv::norm(ocvStatusVecInt, refStatusVec, NORM_INF);
+    double statusNorm = cv::norm(cv::Mat(statusVec), refStatusVec, NORM_INF);
+
+    EXPECT_EQ(statusNormOcv, 0);
+    EXPECT_EQ(statusNorm, 0);
+
+    double diffNormOcv = cv::norm(ocvPtsOut, ptsExpected, NORM_L2);
+    double diffNorm = cv::norm(ptsOut, ptsExpected, NORM_L2);
+
+    EXPECT_LT(diffNormOcv, 31.92);
+    EXPECT_LT(diffNorm, 6.69);
+
+    if (cvtest::debugLevel > 0)
+    {
+        auto drawPts = [ptsIn, dst](const std::vector<cv::Point2f>& ptsRes, const std::string fname)
+        {
+            cv::Mat draw = dst.clone();
+            for (size_t i = 0; i < ptsIn.size(); i++)
+            {
+                cv::line(draw, ptsIn[i], ptsRes[i], Scalar::all(255));
+                cv::circle(draw, ptsIn[i], 1, Scalar::all(255));
+                cv::circle(draw, ptsRes[i], 3, Scalar::all(255));
+            }
+            cv::imwrite(fname, draw);
+        };
+
+        drawPts(ptsOut, "track_w"+std::to_string(winSz)+"_warped.png");
+        drawPts(ocvPtsOut, "track_ocv_warped.png");
+
+        std::cout << "status vec:" << std::endl << cv::Mat(statusVec).t() << std::endl;
+        std::cout << "status vec ocv:" << std::endl << cv::Mat(ocvStatusVec).t() << std::endl;
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, TrackingTest,
+                        ::testing::Combine(::testing::Values(5, 7, 9), // window size
+                                           ::testing::Bool(), // useSobelPyramid
+                                           ::testing::Bool(), // useFastCvPyramids
+                                           ::testing::Bool()  // useInitialEstimate
+                        ));
+
+}} // namespaces opencv_test, ::
diff --git a/modules/fastcv/test/test_warp.cpp b/modules/fastcv/test/test_warp.cpp
new file mode 100644
index 00000000000..240262f93ca
--- /dev/null
+++ b/modules/fastcv/test/test_warp.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+typedef testing::TestWithParam<cv::Size> WarpPerspective2Plane;
+
+TEST_P(WarpPerspective2Plane, accuracy)
+{
+    cv::Size dstSize = GetParam();
+    cv::Mat img = imread(cvtest::findDataFile("cv/shared/baboon.png"));
+    Mat src(img.rows, img.cols, CV_8UC1);
+    cvtColor(img, src, cv::COLOR_BGR2GRAY);
+    cv::Mat dst1, dst2, mat, ref1, ref2;
+    mat.create(3, 3, CV_32FC1);
+    dst1.create(dstSize, CV_8UC1);
+    dst2.create(dstSize, CV_8UC1);
+
+    RNG rng = RNG((uint64)-1);
+    Point2f s[4], d[4];
+
+    s[0] = Point2f(0, 0);
+    d[0] = Point2f(0, 0);
+    s[1] = Point2f(src.cols - 1.f, 0);
+    d[1] = Point2f(dst1.cols - 1.f, 0);
+    s[2] = Point2f(src.cols - 1.f, src.rows - 1.f);
+    d[2] = Point2f(dst1.cols - 1.f, dst1.rows - 1.f);
+    s[3] = Point2f(0, src.rows - 1.f);
+    d[3] = Point2f(0, dst1.rows - 1.f);
+
+    float buffer[16];
+    Mat tmp(1, 16, CV_32FC1, buffer);
+    rng.fill(tmp, 1, Scalar::all(0.), Scalar::all(0.1));
+
+    for (int i = 0; i < 4; i++)
+    {
+        s[i].x += buffer[i*4]*src.cols/2;
+        s[i].y += buffer[i*4+1]*src.rows/2;
+        d[i].x += buffer[i*4+2]*dst1.cols/2;
+        d[i].y += buffer[i*4+3]*dst1.rows/2;
+    }
+
+    cv::getPerspectiveTransform(s, d).convertTo(mat, mat.depth());
+    // Invert the perspective matrix: the same inverted matrix is passed to
+    // warpPerspective2Plane and to the OpenCV reference with WARP_INVERSE_MAP
+    invert(mat, mat);
+
+    cv::fastcv::warpPerspective2Plane(src, src, dst1, dst2, mat, dstSize);
+    cv::warpPerspective(src, ref1, mat, dstSize, (cv::INTER_LINEAR | cv::WARP_INVERSE_MAP));
+    cv::warpPerspective(src, ref2, mat, dstSize, (cv::INTER_LINEAR | cv::WARP_INVERSE_MAP));
+
+    cv::Mat difference1, difference2, mask1, mask2;
+    cv::absdiff(dst1, ref1, difference1);
+    cv::absdiff(dst2, ref2, difference2);
+    cv::threshold(difference1, mask1, 5, 255, cv::THRESH_BINARY);
+    cv::threshold(difference2, mask2, 5, 255, cv::THRESH_BINARY);
+    int num_diff_pixels_1 = cv::countNonZero(mask1);
+    int num_diff_pixels_2 = cv::countNonZero(mask2);
+
+    EXPECT_LT(num_diff_pixels_1, src.size().area()*0.02);
+    EXPECT_LT(num_diff_pixels_2, src.size().area()*0.02);
+}
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, WarpPerspective2Plane, Values(perf::szVGA, perf::sz720p, perf::sz1080p));
+
+}
+}
\ No newline at end of file
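
For reference, a minimal stand-alone sketch (not part of the patch) of the pyramid helpers exercised by test_pyramid.cpp above, assuming a grayscale input; the file name and level count are illustrative, and the call signatures mirror the tests rather than a documented API.

#include <opencv2/imgcodecs.hpp>
#include <opencv2/fastcv.hpp>

int main()
{
    // Hypothetical input file; the tests use cv/shared/baboon.png instead
    cv::Mat src = cv::imread("input.png", cv::IMREAD_GRAYSCALE); // CV_8UC1

    // 4-level pyramid; the last argument selects classic scale-by-2 downscaling
    // (true) versus the ORB-style scaling checked in PyramidTest (false)
    std::vector<cv::Mat> pyr;
    cv::fastcv::buildPyramid(src, pyr, 4, /*scaleBy2=*/true);

    // Per-level Sobel gradients; output depth can be CV_8S, CV_16S or CV_32F,
    // the depths instantiated in SobelPyramidTest
    std::vector<cv::Mat> pyrDx, pyrDy;
    cv::fastcv::sobelPyramid(pyr, pyrDx, pyrDy, CV_8S);
    return 0;
}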
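Similarly, a sketch of the LK tracker wrapped above, following the overload called in test_tracking.cpp (no precomputed Sobel pyramids, the optional initial estimate passed as noArray()); the frame names, point list and the 7x7 window are assumptions chosen for illustration.

#include <opencv2/imgcodecs.hpp>
#include <opencv2/fastcv.hpp>

int main()
{
    // Hypothetical consecutive frames
    cv::Mat prev = cv::imread("frame0.png", cv::IMREAD_GRAYSCALE);
    cv::Mat next = cv::imread("frame1.png", cv::IMREAD_GRAYSCALE);

    // Image pyramids for both frames, built with the FastCV wrapper
    int nLevels = 4;
    std::vector<cv::Mat> prevPyr, nextPyr;
    cv::fastcv::buildPyramid(prev, prevPyr, nLevels);
    cv::fastcv::buildPyramid(next, nextPyr, nLevels);

    // Points to track and per-point status output (1 = tracked, as asserted in the test)
    std::vector<cv::Point2f> ptsIn = { {100.f, 100.f}, {200.f, 150.f} }, ptsOut;
    std::vector<int32_t> statusVec(ptsIn.size());

    // Same termination criteria as the test: at most 7 iterations or eps = 0.03^2
    cv::TermCriteria termCrit(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, 7, 0.03 * 0.03);

    cv::fastcv::trackOpticalFlowLK(prev, next, prevPyr, nextPyr,
                                   ptsIn, ptsOut, cv::noArray(), // no initial estimate
                                   statusVec, {7, 7}, termCrit);
    return 0;
}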
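Finally, a sketch of warpPerspective2Plane as used in test_warp.cpp: both planes of a two-plane image are warped with one call, and the function is given the inverted (destination-to-source) 3x3 matrix, which is why the reference check above uses cv::WARP_INVERSE_MAP. Here the same gray image stands in for both planes, exactly as in the test; the identity matrix and output size are placeholder values.

#include <opencv2/imgcodecs.hpp>
#include <opencv2/fastcv.hpp>

int main()
{
    cv::Mat plane = cv::imread("input.png", cv::IMREAD_GRAYSCALE); // CV_8UC1

    cv::Size dstSize(640, 480);
    cv::Mat dst1(dstSize, CV_8UC1), dst2(dstSize, CV_8UC1);

    // Placeholder transform; a real caller would build it from
    // cv::getPerspectiveTransform() and invert it, as the test does
    cv::Mat m = cv::Mat::eye(3, 3, CV_32FC1);

    // The same source is used for both planes here, mirroring the test
    cv::fastcv::warpPerspective2Plane(plane, plane, dst1, dst2, m, dstSize);
    return 0;
}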