From d21e42bf4ed26a99cdfe99d07026150db83fc6ca Mon Sep 17 00:00:00 2001 From: cudawarped <12133430+cudawarped@users.noreply.github.com> Date: Fri, 22 Nov 2024 18:02:34 +0200 Subject: [PATCH 01/14] cudacodec: VideoReader fix yuv to color conversion using Nvidia Video Codec SDK sample as a guide --- modules/cudacodec/CMakeLists.txt | 3 - .../cudacodec/include/opencv2/cudacodec.hpp | 104 ++- modules/cudacodec/src/cuda/ColorSpace.cu | 762 ++++++++++++++++++ modules/cudacodec/src/cuda/ColorSpace.h | 69 ++ modules/cudacodec/src/cuda/nv12_to_rgb.cu | 190 ----- modules/cudacodec/src/cuda/rgb_to_yv12.cu | 167 ---- ...idia_surface_format_to_color_converter.cpp | 205 +++++ modules/cudacodec/src/precomp.hpp | 1 - modules/cudacodec/src/video_decoder.cpp | 46 +- modules/cudacodec/src/video_decoder.hpp | 4 +- modules/cudacodec/src/video_parser.cpp | 1 + modules/cudacodec/src/video_reader.cpp | 101 +-- modules/cudacodec/test/test_video.cpp | 426 +++++++++- 13 files changed, 1595 insertions(+), 484 deletions(-) create mode 100644 modules/cudacodec/src/cuda/ColorSpace.cu create mode 100644 modules/cudacodec/src/cuda/ColorSpace.h delete mode 100644 modules/cudacodec/src/cuda/nv12_to_rgb.cu delete mode 100644 modules/cudacodec/src/cuda/rgb_to_yv12.cu create mode 100644 modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp diff --git a/modules/cudacodec/CMakeLists.txt b/modules/cudacodec/CMakeLists.txt index 8df41f00a96..a2dd450423f 100644 --- a/modules/cudacodec/CMakeLists.txt +++ b/modules/cudacodec/CMakeLists.txt @@ -38,9 +38,6 @@ if(HAVE_NVCUVID OR HAVE_NVCUVENC) endif() if(HAVE_NVCUVID) list(APPEND extra_libs ${CUDA_nvcuvid_LIBRARY}) - if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) - list(APPEND extra_libs CUDA::nppicc${CUDA_LIB_EXT}) - endif() endif() if(HAVE_NVCUVENC) if(WIN32) diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp index ca0ce204447..a0c039189e9 100644 --- a/modules/cudacodec/include/opencv2/cudacodec.hpp 
+++ b/modules/cudacodec/include/opencv2/cudacodec.hpp @@ -93,19 +93,19 @@ enum Codec /** @brief ColorFormat for the frame returned by VideoReader::nextFrame() and VideoReader::retrieve() or used to initialize a VideoWriter. */ -enum class ColorFormat { +enum ColorFormat { UNDEFINED = 0, - BGRA = 1, //!< OpenCV color format, can be used with both VideoReader and VideoWriter. - BGR = 2, //!< OpenCV color format, can be used with both VideoReader and VideoWriter. - GRAY = 3, //!< OpenCV color format, can be used with both VideoReader and VideoWriter. - NV_NV12 = 4, //!< Nvidia color format - equivalent to YUV - Semi-Planar YUV [Y plane followed by interleaved UV plane], can be used with both VideoReader and VideoWriter. - - RGB = 5, //!< OpenCV color format, can only be used with VideoWriter. - RGBA = 6, //!< OpenCV color format, can only be used with VideoWriter. - NV_YV12 = 8, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by V and U planes], use with VideoReader, can only be used with VideoWriter. - NV_IYUV = 9, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by U and V planes], use with VideoReader, can only be used with VideoWriter. - NV_YUV444 = 10, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by U and V planes], use with VideoReader, can only be used with VideoWriter. - NV_AYUV = 11, //!< Nvidia Buffer Format - 8 bit Packed A8Y8U8V8. This is a word-ordered format where a pixel is represented by a 32-bit word with V in the lowest 8 bits, U in the next 8 bits, Y in the 8 bits after that and A in the highest 8 bits, can only be used with VideoWriter. + BGRA = 1, //!< OpenCV color format. VideoReader and VideoWriter. + BGR = 2, //!< OpenCV color format. VideoReader and VideoWriter. + GRAY = 3, //!< OpenCV color format. VideoReader and VideoWriter. + RGB = 5, //!< OpenCV color format. VideoReader and VideoWriter. + RGBA = 6, //!< OpenCV color format. VideoReader and VideoWriter. 
+ NV_YUV_SURFACE_FORMAT = 7, //!< Nvidia YUV Surface Format output by the Nvidia decoder, see @ref SurfaceFormat. VideoReader only. + NV_NV12 = 4, //!< Nvidia Buffer Format - Semi-Planar YUV [Y plane followed by interleaved UV plane]. VideoWriter only. @deprecated Deprecated for use with VideoReader, use @ref NV_YUV_SURFACE_FORMAT instead. + NV_YV12 = 8, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by V and U planes]. VideoWriter only. + NV_IYUV = 9, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by U and V planes]. VideoWriter only. + NV_YUV444 = 10, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by U and V planes]. VideoWriter only. + NV_AYUV = 11, //!< Nvidia Buffer Format - 8 bit Packed A8Y8U8V8. This is a word-ordered format where a pixel is represented by a 32-bit word with V in the lowest 8 bits, U in the next 8 bits, Y in the 8 bits after that and A in the highest 8 bits. VideoWriter only. #ifndef CV_DOXYGEN PROP_NOT_SUPPORTED #endif @@ -298,16 +298,41 @@ enum ChromaFormat NumFormats }; -/** @brief Deinterlacing mode used by decoder. -* @param Weave Weave both fields (no deinterlacing). For progressive content and for content that doesn't need deinterlacing. -* @param Bob Drop one field. -* @param Adaptive Adaptive deinterlacing needs more video memory than other deinterlacing modes. -* */ +/** @brief Deinterlacing mode used by decoder. */ enum DeinterlaceMode { - Weave = 0, - Bob = 1, - Adaptive = 2 + Weave = 0, //!< Weave both fields(no deinterlacing).For progressive content and for content that doesn't need deinterlacing. + Bob = 1, //!< Drop one field. + Adaptive = 2 //!< Adaptive deinterlacing needs more video memory than other deinterlacing modes. +}; + +/** @brief Video Signal Description Color Primaries of the VideoReader source (section E.2.1 VUI parameters semantics of H265 spec file) */ +enum class ColorSpaceStandard { + BT709 = 1, //!< ITU-R BT.709 standard for high-definition television. 
+ Unspecified = 2, //!< Unspecified color space standard. + Reserved = 3, //!< Reserved for future use. + FCC = 4, //!< FCC color space standard. + BT470 = 5, //!< ITU - R BT.470, used for older analog television systems. + BT601 = 6, //!< ITU - R BT.601, used for standard definition television. + SMPTE240M = 7, //!< SMPTE 240M, used for early HDTV systems. + YCgCo = 8, //!< YCgCo color space, used in some video compression algorithms. + BT2020 = 9, //!< ITU - R BT.2020, used for ultra-high-definition television. + BT2020C = 10 //!< ITU - R BT.2020 Constant Luminance, used for ultra-high-definition television. +}; + +/** @brief Video surface formats output by the decoder */ +enum SurfaceFormat { + SF_NV12 = 0, //!< Semi-Planar YUV [Y plane followed by interleaved UV plane] + SF_P016 = 1, //!< 16 bit Semi-Planar YUV [Y plane followed by interleaved UV plane]. Can be used for 10 bit(6LSB bits 0), 12 bit (4LSB bits 0) + SF_YUV444 = 2, //!< Planar YUV [Y plane followed by U and V planes] + SF_YUV444_16Bit = 3 //!< 16 bit Planar YUV [Y plane followed by U and V planes]. Can be used for 10 bit(6LSB bits 0), 12 bit (4LSB bits 0) +}; + +/** @brief Bit depth of the frame returned by VideoReader::nextFrame() and VideoReader::retrieve() */ +enum BitDepth { + EIGHT = 0, //!< 8 bit depth. + SIXTEEN = 1, //!< 16 bit depth. + UNCHANGED = 2 //!< Use source bit depth. }; /** @brief Utility function demonstrating how to map the luma histogram when FormatInfo::videoFullRangeFlag == false @@ -316,7 +341,7 @@ enum DeinterlaceMode @note - This function demonstrates how to map the luma histogram back so that it is equivalent to the result obtained from cuda::calcHist() - if the returned frame was colorFormat::GRAY. + if the returned frame was ColorFormat::GRAY. 
*/ CV_EXPORTS_W void MapHist(const cuda::GpuMat& hist, CV_OUT Mat& histFull); @@ -325,10 +350,11 @@ CV_EXPORTS_W void MapHist(const cuda::GpuMat& hist, CV_OUT Mat& histFull); struct CV_EXPORTS_W_SIMPLE FormatInfo { CV_WRAP FormatInfo() : nBitDepthMinus8(-1), ulWidth(0), ulHeight(0), width(0), height(0), ulMaxWidth(0), ulMaxHeight(0), valid(false), - fps(0), ulNumDecodeSurfaces(0), videoFullRangeFlag(false), enableHistogram(false), nCounterBitDepth(0), nMaxHistogramBins(0){}; + fps(0), ulNumDecodeSurfaces(0), videoFullRangeFlag(false), colorSpaceStandard(ColorSpaceStandard::BT601), enableHistogram(false), nCounterBitDepth(0), nMaxHistogramBins(0){}; CV_PROP_RW Codec codec; CV_PROP_RW ChromaFormat chromaFormat; + CV_PROP_RW SurfaceFormat surfaceFormat; //!< Surface format of the decoded frame. CV_PROP_RW int nBitDepthMinus8; CV_PROP_RW int nBitDepthChromaMinus8; CV_PROP_RW int ulWidth;//!< Coded sequence width in pixels. @@ -345,12 +371,36 @@ struct CV_EXPORTS_W_SIMPLE FormatInfo CV_PROP_RW cv::Size targetSz;//!< Post-processed size of the output frame. CV_PROP_RW cv::Rect srcRoi;//!< Region of interest decoded from video source. CV_PROP_RW cv::Rect targetRoi;//!< Region of interest in the output frame containing the decoded frame. - CV_PROP_RW bool videoFullRangeFlag;//!< Output value indicating if the black level, luma and chroma of the source are represented using the full or limited range (AKA TV or "analogue" range) of values as defined in Annex E of the ITU-T Specification. Internally the conversion from NV12 to BGR obeys ITU 709. + CV_PROP_RW bool videoFullRangeFlag;//!< Output value indicating if the black level, luma and chroma of the source are represented using the full or limited range (AKA TV or "analogue" range) of values as defined in Annex E of the ITU-T Specification. 
+ CV_PROP_RW ColorSpaceStandard colorSpaceStandard; //!< Video Signal Description Color Primaries of the VideoReader source (section E.2.1 VUI parameters semantics of H265 spec file) CV_PROP_RW bool enableHistogram;//!< Flag requesting histogram output if supported. Exception will be thrown when requested but not supported. CV_PROP_RW int nCounterBitDepth;//!< Bit depth of histogram bins if histogram output is requested and supported. CV_PROP_RW int nMaxHistogramBins;//!< Max number of histogram bins if histogram output is requested and supported. }; +/** @brief Class for converting the raw YUV Surface output from VideoReader if output color format is set to ColorFormat::NV_YUV_SURFACE_FORMAT (VideoReader::set(ColorFormat::NV_YUV_SURFACE_FORMAT)) to the requested @ref ColorFormat. + */ +class CV_EXPORTS_W NVSurfaceToColorConverter { +public: + /** @brief Performs the conversion from the raw YUV Surface output from VideoReader to the requested color format. Use this function when you want to convert the raw YUV Surface output from VideoReader to more than one color format or you want both the raw Surface output in addition to a color frame. + * @param yuv The raw YUV Surface output from VideoReader see @ref SurfaceFormat. + * @param color The converted frame. + * @param surfaceFormat The surface format of the input YUV data. + * @param outputFormat The requested output color format. + * @param bitDepth The requested bit depth of the output frame. + * @param planar Request seperate planes for each color plane. + * @param videoFullRangeFlag Indicates if the black level, luma and chroma of the source are represented using the full or limited range (AKA TV or "analogue" range) of values as defined in Annex E of the ITU-T Specification. + * @param stream Stream for the asynchronous version. 
+ */ + virtual bool convert(InputArray yuv, OutputArray color, const SurfaceFormat surfaceFormat, const ColorFormat outputFormat, const BitDepth bitDepth = BitDepth::UNCHANGED, const bool planar = false, const bool videoFullRangeFlag = false, cuda::Stream& stream = cuda::Stream::Null()) = 0; +}; + +/** @brief Creates a NVSurfaceToColorConverter. +* @param colorSpace The requested @ref ColorSpaceStandard for the converter. +* @param videoFullRangeFlag Indicates if the black level, luma and chroma of the source are represented using the full or limited range (AKA TV or "analogue" range) of values as defined in Annex E of the ITU-T Specification. + */ +CV_EXPORTS_W Ptr createNVSurfaceToColorConverter(const ColorSpaceStandard colorSpace, const bool videoFullRangeFlag = false); + /** @brief cv::cudacodec::VideoReader generic properties identifier. */ enum class VideoReaderProps { @@ -360,9 +410,11 @@ enum class VideoReaderProps { PROP_NUMBER_OF_RAW_PACKAGES_SINCE_LAST_GRAB = 3, //!< Number of raw packages recieved since the last call to grab(). PROP_RAW_MODE = 4, //!< Status of raw mode. PROP_LRF_HAS_KEY_FRAME = 5, //!< FFmpeg source only - Indicates whether the Last Raw Frame (LRF), output from VideoReader::retrieve() when VideoReader is initialized in raw mode, contains encoded data for a key frame. - PROP_COLOR_FORMAT = 6, //!< Set the ColorFormat of the decoded frame. This can be changed before every call to nextFrame() and retrieve(). + PROP_COLOR_FORMAT = 6, //!< ColorFormat of the decoded frame. This can be changed before every call to nextFrame() and retrieve(). PROP_UDP_SOURCE = 7, //!< Status of VideoReaderInitParams::udpSource initialization. PROP_ALLOW_FRAME_DROP = 8, //!< Status of VideoReaderInitParams::allowFrameDrop initialization. + PROP_BIT_DEPTH = 9, //!< Bit depth of the decoded frame. This can be changed before every call to nextFrame() and retrieve(). + PROP_PLANAR = 10, //!< Planar when true, packed when false. 
This can be changed before every call to nextFrame() and retrieve(). #ifndef CV_DOXYGEN PROP_NOT_SUPPORTED #endif @@ -481,9 +533,11 @@ class CV_EXPORTS_W VideoReader /** @brief Set the desired ColorFormat for the frame returned by nextFrame()/retrieve(). @param colorFormat Value of the ColorFormat. + @param bitDepth Requested bit depth of the frame. + @param planar Set to true for planar and false for packed color format. @return `true` unless the colorFormat is not supported. */ - CV_WRAP virtual bool set(const ColorFormat colorFormat) = 0; + CV_WRAP virtual bool set(const ColorFormat colorFormat, const BitDepth bitDepth = BitDepth::UNCHANGED, const bool planar = false) = 0; /** @brief Returns the specified VideoReader property diff --git a/modules/cudacodec/src/cuda/ColorSpace.cu b/modules/cudacodec/src/cuda/ColorSpace.cu new file mode 100644 index 00000000000..137805af392 --- /dev/null +++ b/modules/cudacodec/src/cuda/ColorSpace.cu @@ -0,0 +1,762 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "ColorSpace.h" +#include +#include + +namespace cv { namespace cuda { namespace device { + +__constant__ float matYuv2Color[3][3]; + +void inline GetConstants(int iMatrix, float& wr, float& wb, int& black, int& white, int& uvWhite, int& max, bool fullRange = false) { + if (fullRange) { + black = 0; white = 255; uvWhite = 255; + } + else { + black = 16; white = 235; uvWhite = 240; + } + max = 255; + + switch (static_cast(iMatrix)) + { + case cv::cudacodec::ColorSpaceStandard::BT709: + default: + wr = 0.2126f; wb = 0.0722f; + break; + + case cv::cudacodec::ColorSpaceStandard::FCC: + wr = 0.30f; wb = 0.11f; + break; + + case cv::cudacodec::ColorSpaceStandard::BT470: + case cv::cudacodec::ColorSpaceStandard::BT601: + wr = 0.2990f; wb = 0.1140f; + break; + + case cv::cudacodec::ColorSpaceStandard::SMPTE240M: + wr = 0.212f; wb = 0.087f; + break; + + case cv::cudacodec::ColorSpaceStandard::BT2020: + case cv::cudacodec::ColorSpaceStandard::BT2020C: + wr = 0.2627f; wb = 0.0593f; + // 10-bit only + black = 64 << 6; white = 940 << 6; + max = (1 << 16) - 1; + break; + } +} + +void SetMatYuv2Rgb(int iMatrix, bool fullRange = false) { + float wr, wb; + int black, white, max, uvWhite; + GetConstants(iMatrix, wr, wb, black, white, uvWhite, max, fullRange); + float mat[3][3] = { + 1.0f, 0.0f, (1.0f - wr) / 0.5f, + 1.0f, -wb * (1.0f - wb) / 0.5f / (1 - wb - wr), -wr * (1 - wr) / 0.5f / (1 - wb - wr), + 1.0f, (1.0f - wb) / 0.5f, 0.0f, + }; + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + if (j == 0) + mat[i][j] = (float)(1.0 * max / (white - black) * mat[i][j]); + else + mat[i][j] = (float)(1.0 * max / (uvWhite - black) * mat[i][j]); + } + } + cudaMemcpyToSymbol(matYuv2Color, mat, sizeof(mat)); +} + +template +__device__ static T Clamp(T x, T lower, T upper) { + return x < lower ? lower : (x > upper ? upper : x); +} + +template +__device__ inline Gray YToGrayForPixel(YuvUnit y, bool videoFullRangeFlag) { + const int low = videoFullRangeFlag ? 
0 : 1 << (sizeof(YuvUnit) * 8 - 4); + float fy = (int)y - low; + const float maxf = (1 << sizeof(YuvUnit) * 8) - 1.0f; + + YuvUnit g = (YuvUnit)Clamp(matYuv2Color[0][0] * fy, 0.0f, maxf); + const int nShift = abs((int)sizeof(YuvUnit) - (int)sizeof(Gray)) * 8; + Gray gray{}; + if (sizeof(YuvUnit) >= sizeof(Gray)) + gray = g >> nShift; + else + gray = g << nShift; + return gray; +} + +template +__device__ inline Color YuvToColorForPixel(YuvUnit y, YuvUnit u, YuvUnit v, bool videoFullRangeFlag) { + const int + low = videoFullRangeFlag ? 0 : 1 << (sizeof(YuvUnit) * 8 - 4), + mid = 1 << (sizeof(YuvUnit) * 8 - 1); + float fy = (int)y - low, fu = (int)u - mid, fv = (int)v - mid; + const float maxf = (1 << sizeof(YuvUnit) * 8) - 1.0f; + YuvUnit + r = (YuvUnit)Clamp(matYuv2Color[0][0] * fy + matYuv2Color[0][1] * fu + matYuv2Color[0][2] * fv, 0.0f, maxf), + g = (YuvUnit)Clamp(matYuv2Color[1][0] * fy + matYuv2Color[1][1] * fu + matYuv2Color[1][2] * fv, 0.0f, maxf), + b = (YuvUnit)Clamp(matYuv2Color[2][0] * fy + matYuv2Color[2][1] * fu + matYuv2Color[2][2] * fv, 0.0f, maxf); + + Color color{}; + const int nShift = abs((int)sizeof(YuvUnit) - (int)sizeof(color.c.r)) * 8; + if (sizeof(YuvUnit) >= sizeof(color.c.r)) { + color.c.r = r >> nShift; + color.c.g = g >> nShift; + color.c.b = b >> nShift; + } + else { + color.c.r = r << nShift; + color.c.g = g << nShift; + color.c.b = b << nShift; + } + return color; +} + +template +__device__ inline Color YuvToColoraForPixel(YuvUnit y, YuvUnit u, YuvUnit v, bool videoFullRangeFlag) { + Color color = YuvToColorForPixel(y, u, v, videoFullRangeFlag); + const float maxf = (1 << sizeof(color.c.r) * 8) - 1.0f; + color.c.a = maxf; + return color; +} + +template +__global__ static void YToGrayKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y); + if (x + 1 >= nWidth || y >= 
nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(Yuvx2) / 2 + y * nYuvPitch; + uint8_t* pDst = pGray + x * sizeof(Gray) + y * nGrayPitch; + + Yuvx2 l0 = *(Yuvx2*)pSrc; + *(Grayx2*)pDst = Grayx2{ + YToGrayForPixel(l0.x, videoFullRangeFlag), + YToGrayForPixel(l0.y, videoFullRangeFlag), + }; +} + +template +__global__ static void YuvToColorKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; + if (x + 1 >= nWidth || y + 1 >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(Yuvx2) / 2 + y * nYuvPitch; + uint8_t* pDst = pColor + x * sizeof(Color) + y * nColorPitch; + + Yuvx2 l0 = *(Yuvx2*)pSrc; + Yuvx2 l1 = *(Yuvx2*)(pSrc + nYuvPitch); + Yuvx2 ch = *(Yuvx2*)(pSrc + (nHeight - y / 2) * nYuvPitch); + + union ColorOutx2 { + Colorx2 d; + Color Color[2]; + }; + ColorOutx2 l1Out; + l1Out.Color[0] = YuvToColorForPixel(l0.x, ch.x, ch.y, videoFullRangeFlag); + l1Out.Color[1] = YuvToColorForPixel(l0.y, ch.x, ch.y, videoFullRangeFlag); + *(Colorx2*)pDst = l1Out.d; + ColorOutx2 l2Out; + l2Out.Color[0] = YuvToColorForPixel(l1.x, ch.x, ch.y, videoFullRangeFlag); + l2Out.Color[1] = YuvToColorForPixel(l1.y, ch.x, ch.y, videoFullRangeFlag); + *(Colorx2*)(pDst + nColorPitch) = l2Out.d; +} + +template +__global__ static void YuvToColoraKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; + if (x + 1 >= nWidth || y + 1 >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + uint8_t* pDst = pColor + x * sizeof(Color) + y * nColorPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 l1 = *(YuvUnitx2*)(pSrc + nYuvPitch); + YuvUnitx2 ch = *(YuvUnitx2*)(pSrc + 
(nHeight - y / 2) * nYuvPitch); + + *(ColorIntx2*)pDst = ColorIntx2{ + YuvToColoraForPixel(l0.x, ch.x, ch.y, videoFullRangeFlag).d, + YuvToColoraForPixel(l0.y, ch.x, ch.y, videoFullRangeFlag).d, + }; + *(ColorIntx2*)(pDst + nColorPitch) = ColorIntx2{ + YuvToColoraForPixel(l1.x, ch.x, ch.y, videoFullRangeFlag).d, + YuvToColoraForPixel(l1.y, ch.x, ch.y, videoFullRangeFlag).d, + }; +} + +template +__global__ static void Yuv444ToColorKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y); + if (x + 1 >= nWidth || y >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + uint8_t* pDst = pColor + x * sizeof(Color) + y * nColorPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 ch1 = *(YuvUnitx2*)(pSrc + (nHeight * nYuvPitch)); + YuvUnitx2 ch2 = *(YuvUnitx2*)(pSrc + (2 * nHeight * nYuvPitch)); + + union ColorOutx2 { + Colorx2 d; + Color Color[2]; + }; + ColorOutx2 out; + out.Color[0] = YuvToColorForPixel(l0.x, ch1.x, ch2.x, videoFullRangeFlag); + out.Color[1] = YuvToColorForPixel(l0.y, ch1.y, ch2.y, videoFullRangeFlag); + *(Colorx2*)pDst = out.d; +} + +template +__global__ static void Yuv444ToColoraKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y); + if (x + 1 >= nWidth || y >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + uint8_t* pDst = pColor + x * sizeof(Color) + y * nColorPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 ch1 = *(YuvUnitx2*)(pSrc + (nHeight * nYuvPitch)); + YuvUnitx2 ch2 = *(YuvUnitx2*)(pSrc + (2 * nHeight * nYuvPitch)); + + *(ColorIntx2*)pDst = ColorIntx2{ + YuvToColoraForPixel(l0.x, ch1.x, ch2.x, 
videoFullRangeFlag).d, + YuvToColoraForPixel(l0.y, ch1.y, ch2.y, videoFullRangeFlag).d, + }; +} + +template +__global__ static void YuvToColorPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; + if (x + 1 >= nWidth || y + 1 >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 l1 = *(YuvUnitx2*)(pSrc + nYuvPitch); + YuvUnitx2 ch = *(YuvUnitx2*)(pSrc + (nHeight - y / 2) * nYuvPitch); + + Color color0 = YuvToColorForPixel(l0.x, ch.x, ch.y, videoFullRangeFlag), + color1 = YuvToColorForPixel(l0.y, ch.x, ch.y, videoFullRangeFlag), + color2 = YuvToColorForPixel(l1.x, ch.x, ch.y, videoFullRangeFlag), + color3 = YuvToColorForPixel(l1.y, ch.x, ch.y, videoFullRangeFlag); + + uint8_t* pDst = pColorp + x * sizeof(ColorUnitx2) / 2 + y * nColorpPitch; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.x, color1.v.x }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.x, color3.v.x }; + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.y, color1.v.y }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.y, color3.v.y }; + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.z, color1.v.z }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.z, color3.v.z }; +} + +template +__global__ static void YuvToColoraPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; + if (x + 1 >= nWidth || y + 1 >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 
l1 = *(YuvUnitx2*)(pSrc + nYuvPitch); + YuvUnitx2 ch = *(YuvUnitx2*)(pSrc + (nHeight - y / 2) * nYuvPitch); + + Color color0 = YuvToColoraForPixel(l0.x, ch.x, ch.y, videoFullRangeFlag), + color1 = YuvToColoraForPixel(l0.y, ch.x, ch.y, videoFullRangeFlag), + color2 = YuvToColoraForPixel(l1.x, ch.x, ch.y, videoFullRangeFlag), + color3 = YuvToColoraForPixel(l1.y, ch.x, ch.y, videoFullRangeFlag); + + uint8_t* pDst = pColorp + x * sizeof(ColorUnitx2) / 2 + y * nColorpPitch; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.x, color1.v.x }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.x, color3.v.x }; + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.y, color1.v.y }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.y, color3.v.y }; + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.z, color1.v.z }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.z, color3.v.z }; + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.w, color1.v.w }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.w, color3.v.w }; +} + +template +__global__ static void Yuv444ToColorPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y); + if (x + 1 >= nWidth || y >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 ch1 = *(YuvUnitx2*)(pSrc + (nHeight * nYuvPitch)); + YuvUnitx2 ch2 = *(YuvUnitx2*)(pSrc + (2 * nHeight * nYuvPitch)); + + Color color0 = YuvToColorForPixel(l0.x, ch1.x, ch2.x, videoFullRangeFlag), + color1 = YuvToColorForPixel(l0.y, ch1.y, ch2.y, videoFullRangeFlag); + + + uint8_t* pDst = pColorp + x * sizeof(ColorUnitx2) / 2 + y * nColorpPitch; + *(ColorUnitx2*)pDst = ColorUnitx2{ 
color0.v.x, color1.v.x }; + + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.y, color1.v.y }; + + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.z, color1.v.z }; +} + +template +__global__ static void Yuv444ToColoraPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y); + if (x + 1 >= nWidth || y >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 ch1 = *(YuvUnitx2*)(pSrc + (nHeight * nYuvPitch)); + YuvUnitx2 ch2 = *(YuvUnitx2*)(pSrc + (2 * nHeight * nYuvPitch)); + + Color color0 = YuvToColoraForPixel(l0.x, ch1.x, ch2.x, videoFullRangeFlag), + color1 = YuvToColoraForPixel(l0.y, ch1.y, ch2.y, videoFullRangeFlag); + + + uint8_t* pDst = pColorp + x * sizeof(ColorUnitx2) / 2 + y * nColorpPitch; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.x, color1.v.x }; + + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.y, color1.v.y }; + + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.z, color1.v.z }; + + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.w, color1.v.w }; +} + +#define BLOCKSIZE_X 32 +#define BLOCKSIZE_Y 8 + +void Y8ToGray8(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YToGrayKernel + <<>> + (dpY8, nY8Pitch, dpGray, nGrayPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +void Y8ToGray16(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YToGrayKernel + <<>> + (dpY8, nY8Pitch, dpGray, 
nGrayPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +void Y16ToGray8(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YToGrayKernel + <<>> + (dpY16, nY16Pitch, dpGray, nGrayPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +void Y16ToGray16(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YToGrayKernel + <<>> + (dpY16, nY16Pitch, dpGray, nGrayPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool 
videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + 
cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, 
const cudaStream_t stream) { + Yuv444ToColorKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorPlanarKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraPlanarKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorPlanarKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraPlanarKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + 
cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorPlanarKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraPlanarKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorPlanarKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraPlanarKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int 
nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorPlanarKernel + <<>> + 
(dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, 
uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int 
nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int 
nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t 
stream); +template void YUV444ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444P16ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template 
void YUV444P16ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444P16ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); 
+}}} diff --git a/modules/cudacodec/src/cuda/ColorSpace.h b/modules/cudacodec/src/cuda/ColorSpace.h new file mode 100644 index 00000000000..d730aa37fd1 --- /dev/null +++ b/modules/cudacodec/src/cuda/ColorSpace.h @@ -0,0 +1,69 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#pragma once +#include +#include + +namespace cv { namespace cuda { namespace device { +union BGR24 { + uchar3 v; + struct { + uint8_t b, g, r; + } c; +}; + +union RGB24 { + uchar3 v; + struct { + uint8_t r, g, b; + } c; +}; + +union BGRA32 { + uint32_t d; + uchar4 v; + struct { + uint8_t b, g, r, a; + } c; +}; + +union RGBA32 { + uint32_t d; + uchar4 v; + struct { + uint8_t r, g, b, a; + } c; +}; + +union BGR48 { + ushort3 v; + struct { + uint16_t b, g, r; + } c; +}; + +union RGB48 { + ushort3 v; + struct { + uint16_t r, g, b; + } c; +}; + +union BGRA64 { + uint64_t d; + ushort4 v; + struct { + uint16_t b, g, r, a; + } c; +}; + +union RGBA64 { + uint64_t d; + ushort4 v; + struct { + uint16_t r, g, b, a; + } c; +}; +}}} diff --git a/modules/cudacodec/src/cuda/nv12_to_rgb.cu b/modules/cudacodec/src/cuda/nv12_to_rgb.cu deleted file mode 100644 index a9031e0ec9e..00000000000 --- a/modules/cudacodec/src/cuda/nv12_to_rgb.cu +++ /dev/null @@ -1,190 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. 
-// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -/* - * NV12ToARGB color space conversion CUDA kernel - * - * This sample uses CUDA to perform a simple NV12 (YUV 4:2:0 planar) - * source and converts to output in ARGB format - */ - -#include "opencv2/opencv_modules.hpp" - -#ifndef HAVE_OPENCV_CUDEV - -#error "opencv_cudev is required" - -#else - -#include "opencv2/cudev/common.hpp" - -using namespace cv; -using namespace cv::cudev; - -void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, cudaStream_t stream); - -namespace -{ - __constant__ float constHueColorSpaceMat[9] = {1.1644f, 0.0f, 1.596f, 1.1644f, -0.3918f, -0.813f, 1.1644f, 2.0172f, 0.0f}; - - template - __device__ static void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue) - { - float luma, chromaCb, chromaCr; - if (fullRange) { - luma = (float)(((int)yuvi[0] * 219.0f / 255.0f)); - chromaCb = (float)(((int)yuvi[1] - 512.0f) * 224.0f / 255.0f); - chromaCr = (float)(((int)yuvi[2] - 512.0f) * 224.0f / 255.0f); - } - else { - luma = (float)((int)yuvi[0] - 64.0f); - chromaCb = (float)((int)yuvi[1] - 512.0f); - chromaCr = (float)((int)yuvi[2] - 512.0f); - } - - // Convert YUV To RGB with hue adjustment - *red = (luma * constHueColorSpaceMat[0]) + - (chromaCb * constHueColorSpaceMat[1]) + - (chromaCr * constHueColorSpaceMat[2]); - - *green = (luma * constHueColorSpaceMat[3]) + - (chromaCb * constHueColorSpaceMat[4]) + - (chromaCr * constHueColorSpaceMat[5]); - - *blue = (luma * constHueColorSpaceMat[6]) + - (chromaCb * constHueColorSpaceMat[7]) + - (chromaCr * constHueColorSpaceMat[8]); - } - - __device__ static uint RGBA_pack_10bit(float red, float green, float blue, uint alpha) - { - uint ARGBpixel = 0; - - // Clamp final 10 bit results - red = ::fmin(::fmax(red, 0.0f), 1023.f); - green = ::fmin(::fmax(green, 0.0f), 1023.f); - blue = ::fmin(::fmax(blue, 0.0f), 1023.f); - - // Convert to 8 bit unsigned integers per color component - ARGBpixel = (((uint)blue >> 2) | - (((uint)green >> 2) << 8) | - 
(((uint)red >> 2) << 16) | - (uint)alpha); - - return ARGBpixel; - } - - // CUDA kernel for outputting the final ARGB output from NV12 - - #define COLOR_COMPONENT_BIT_SIZE 10 - #define COLOR_COMPONENT_MASK 0x3FF - - template - __global__ void NV12_to_BGRA(const uchar* srcImage, size_t nSourcePitch, - uint* dstImage, size_t nDestPitch, - uint width, uint height) - { - // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread - const int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1); - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x >= width || y >= height) - return; - - // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way. - // if we move to texture we could read 4 luminance values - - uint yuv101010Pel[2]; - - yuv101010Pel[0] = (srcImage[y * nSourcePitch + x ]) << 2; - yuv101010Pel[1] = (srcImage[y * nSourcePitch + x + 1]) << 2; - - const size_t chromaOffset = nSourcePitch * height; - - const int y_chroma = y >> 1; - - yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2)); - yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); - - yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2)); - yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); - - // this steps performs the color conversion - uint yuvi[6]; - float red[2], green[2], blue[2]; - - yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK ); - yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); - yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); - - yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK ); - yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & 
COLOR_COMPONENT_MASK); - yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); - - // YUV to RGB Transformation conversion - YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]); - YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]); - - // Clamp the results to RGBA - - const size_t dstImagePitch = nDestPitch >> 2; - - dstImage[y * dstImagePitch + x ] = RGBA_pack_10bit(red[0], green[0], blue[0], ((uint)0xff << 24)); - dstImage[y * dstImagePitch + x + 1 ] = RGBA_pack_10bit(red[1], green[1], blue[1], ((uint)0xff << 24)); - } -} - -void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const bool videoFullRangeFlag, cudaStream_t stream) -{ - outFrame.create(height, width, CV_8UC4); - dim3 block(32, 8); - dim3 grid(divUp(width, 2 * block.x), divUp(height, block.y)); - if (videoFullRangeFlag) - NV12_to_BGRA <<>> (decodedFrame.ptr(), decodedFrame.step, outFrame.ptr(), outFrame.step, width, height); - else - NV12_to_BGRA <<>> (decodedFrame.ptr(), decodedFrame.step, outFrame.ptr(), outFrame.step, width, height); - CV_CUDEV_SAFE_CALL(cudaGetLastError()); - if (stream == 0) - CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize()); -} - -#endif diff --git a/modules/cudacodec/src/cuda/rgb_to_yv12.cu b/modules/cudacodec/src/cuda/rgb_to_yv12.cu deleted file mode 100644 index ed0e0df9ba8..00000000000 --- a/modules/cudacodec/src/cuda/rgb_to_yv12.cu +++ /dev/null @@ -1,167 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. 
-// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -#include "opencv2/opencv_modules.hpp" - -#ifndef HAVE_OPENCV_CUDEV - -#error "opencv_cudev is required" - -#else - -#include "opencv2/cudev/ptr2d/glob.hpp" - -using namespace cv::cudev; - -void RGB_to_YV12(const GpuMat& src, GpuMat& dst); - -namespace -{ - __device__ __forceinline__ void rgb_to_y(const uchar b, const uchar g, const uchar r, uchar& y) - { - y = static_cast(((int)(30 * r) + (int)(59 * g) + (int)(11 * b)) / 100); - } - - __device__ __forceinline__ void rgb_to_yuv(const uchar b, const uchar g, const uchar r, uchar& y, uchar& u, uchar& v) - { - rgb_to_y(b, g, r, y); - u = static_cast(((int)(-17 * r) - (int)(33 * g) + (int)(50 * b) + 12800) / 100); - v = static_cast(((int)(50 * r) - (int)(42 * g) - (int)(8 * b) + 12800) / 100); - } - - __global__ void Gray_to_YV12(const GlobPtrSz src, GlobPtr dst) - { - const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2; - const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2; - - if (x + 1 >= src.cols || y + 1 >= src.rows) - return; - - // get pointers to the data - const size_t planeSize = src.rows * dst.step; - GlobPtr y_plane = globPtr(dst.data, dst.step); - GlobPtr u_plane = globPtr(y_plane.data + planeSize, dst.step / 2); - GlobPtr v_plane = globPtr(u_plane.data + (planeSize / 4), dst.step / 2); - - uchar pix; - uchar y_val, u_val, v_val; - - pix = src(y, x); - rgb_to_y(pix, pix, pix, y_val); - y_plane(y, x) = y_val; - - pix = src(y, x + 1); - rgb_to_y(pix, pix, pix, y_val); - y_plane(y, x + 1) = y_val; - - pix = src(y + 1, x); - rgb_to_y(pix, pix, pix, y_val); - y_plane(y + 1, x) = y_val; - - pix = src(y + 1, x + 1); - rgb_to_yuv(pix, pix, pix, y_val, u_val, v_val); - y_plane(y + 1, x + 1) = y_val; - u_plane(y / 2, x / 2) = u_val; - v_plane(y / 2, x / 2) = v_val; - } - - template - __global__ void RGB_to_YV12(const GlobPtrSz src, GlobPtr dst) - { - const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2; - const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2; - - if (x + 1 >= 
src.cols || y + 1 >= src.rows) - return; - - // get pointers to the data - const size_t planeSize = src.rows * dst.step; - GlobPtr y_plane = globPtr(dst.data, dst.step); - GlobPtr u_plane = globPtr(y_plane.data + planeSize, dst.step / 2); - GlobPtr v_plane = globPtr(u_plane.data + (planeSize / 4), dst.step / 2); - - T pix; - uchar y_val, u_val, v_val; - - pix = src(y, x); - rgb_to_y(pix.z, pix.y, pix.x, y_val); - y_plane(y, x) = y_val; - - pix = src(y, x + 1); - rgb_to_y(pix.z, pix.y, pix.x, y_val); - y_plane(y, x + 1) = y_val; - - pix = src(y + 1, x); - rgb_to_y(pix.z, pix.y, pix.x, y_val); - y_plane(y + 1, x) = y_val; - - pix = src(y + 1, x + 1); - rgb_to_yuv(pix.z, pix.y, pix.x, y_val, u_val, v_val); - y_plane(y + 1, x + 1) = y_val; - u_plane(y / 2, x / 2) = u_val; - v_plane(y / 2, x / 2) = v_val; - } -} - -void RGB_to_YV12(const GpuMat& src, GpuMat& dst) -{ - const dim3 block(32, 8); - const dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2)); - - switch (src.channels()) - { - case 1: - Gray_to_YV12<<>>(globPtr(src), globPtr(dst)); - break; - case 3: - RGB_to_YV12<<>>(globPtr(src), globPtr(dst)); - break; - case 4: - RGB_to_YV12<<>>(globPtr(src), globPtr(dst)); - break; - } - - CV_CUDEV_SAFE_CALL( cudaGetLastError() ); - CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); -} - -#endif diff --git a/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp b/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp new file mode 100644 index 00000000000..e22549e2296 --- /dev/null +++ b/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp @@ -0,0 +1,205 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "precomp.hpp" + +using namespace cv; +using namespace cv::cuda; +using namespace cv::cudacodec; + +#if !defined (HAVE_CUDA) +Ptr cv::cudacodec::createNVSurfaceToColorConverter(const ColorSpaceStandard, const bool){ throw_no_cuda(); } +#else +#include "cuda/ColorSpace.h" +namespace cv { namespace cuda { namespace device { +template void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* 
dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, 
bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t
stream); + +template void YUV444ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const 
cudaStream_t stream); +template void YUV444ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444P16ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, 
int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444P16ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +void Y8ToGray8(uint8_t* dpY8, int 
nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +void Y8ToGray16(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +void Y16ToGray8(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +void Y16ToGray16(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +void SetMatYuv2Rgb(int iMatrix, bool); +}}} + +using namespace cuda::device; +class NVSurfaceToColorConverterImpl : public NVSurfaceToColorConverter { +public: + NVSurfaceToColorConverterImpl(ColorSpaceStandard colorSpace, bool fullColorRange = false) { + SetMatYuv2Rgb(static_cast(colorSpace), fullColorRange); + } + + int OutputColorFormatIdx(const cudacodec::ColorFormat format) { + switch (format) { + case cudacodec::ColorFormat::BGR: return 0; + case cudacodec::ColorFormat::RGB: return 1; + case cudacodec::ColorFormat::BGRA: return 2; + case cudacodec::ColorFormat::RGBA: return 3; + case cudacodec::ColorFormat::GRAY: return 4; + default: return -1; + } + } + + int NumChannels(const cudacodec::ColorFormat format) { + switch (format) { + case cudacodec::ColorFormat::BGR: + case cudacodec::ColorFormat::RGB: return 3; + case cudacodec::ColorFormat::BGRA: + case cudacodec::ColorFormat::RGBA: return 4; + case cudacodec::ColorFormat::GRAY: return 1; + default: return -1; + } + } + + BitDepth GetBitDepthOut(const BitDepth bitDepth, const int nBitsIn) { + switch (bitDepth) { + case BitDepth::EIGHT: + case BitDepth::SIXTEEN: + return bitDepth; + case BitDepth::UNCHANGED: + default: + if (nBitsIn == CV_8U) + return BitDepth::EIGHT; + else + return BitDepth::SIXTEEN; + } + } + + bool convert(const InputArray yuv, const OutputArray out, const SurfaceFormat surfaceFormat, const 
ColorFormat outputFormat, const BitDepth bitDepth, const bool planar, const bool videoFullRangeFlag, cuda::Stream& stream) { + CV_Assert(outputFormat == ColorFormat::BGR || outputFormat == ColorFormat::BGRA || outputFormat == ColorFormat::RGB || outputFormat == ColorFormat::RGBA || outputFormat == ColorFormat::GRAY); + CV_Assert(yuv.depth() == CV_8U || yuv.depth() == CV_16U); + const bool yuv420 = surfaceFormat == SurfaceFormat::SF_NV12 || surfaceFormat == SurfaceFormat::SF_P016; + CV_Assert(yuv.cols() % 2 == 0); + + typedef void (*func_t)(uint8_t* yuv, int yuvPitch, uint8_t* color, int colorPitch, int width, int height, bool videoFullRangeFlag, cudaStream_t stream); + static const func_t funcs[4][5][2][2] = + { + { + {{{Nv12ToColor24},{Nv12ToColorPlanar24}},{{Nv12ToColor48},{Nv12ToColorPlanar48}}}, + {{{Nv12ToColor24},{Nv12ToColorPlanar24}},{{Nv12ToColor48},{Nv12ToColorPlanar48}}}, + {{{Nv12ToColor32},{Nv12ToColorPlanar32}},{{Nv12ToColor64},{Nv12ToColorPlanar64}}}, + {{{Nv12ToColor32},{Nv12ToColorPlanar32}},{{Nv12ToColor64},{Nv12ToColorPlanar64}}}, + {{{Y8ToGray8},{Y8ToGray8}},{{Y8ToGray16},{Y8ToGray16}}} + }, + { + {{{P016ToColor24},{P016ToColorPlanar24}},{{P016ToColor48},{P016ToColorPlanar48}}}, + {{{P016ToColor24},{P016ToColorPlanar24}},{{P016ToColor48},{P016ToColorPlanar48}}}, + {{{P016ToColor32},{P016ToColorPlanar32}},{{P016ToColor64},{P016ToColorPlanar64}}}, + {{{P016ToColor32},{P016ToColorPlanar32}},{{P016ToColor64},{P016ToColorPlanar64}}}, + {{{Y16ToGray8},{Y16ToGray8}},{{Y16ToGray16},{Y16ToGray16}}} + }, + { + {{{YUV444ToColor24},{YUV444ToColorPlanar24}},{{YUV444ToColor48},{YUV444ToColorPlanar48}}}, + {{{YUV444ToColor24},{YUV444ToColorPlanar24}},{{YUV444ToColor48},{YUV444ToColorPlanar48}}}, + {{{YUV444ToColor32},{YUV444ToColorPlanar32}},{{YUV444ToColor64},{YUV444ToColorPlanar64}}}, + {{{YUV444ToColor32},{YUV444ToColorPlanar32}},{{YUV444ToColor64},{YUV444ToColorPlanar64}}}, + {{{Y8ToGray8},{Y8ToGray8}},{{Y8ToGray16},{Y8ToGray16}}} + }, + { + 
{{{YUV444P16ToColor24},{YUV444P16ToColorPlanar24}},{{YUV444P16ToColor48},{YUV444P16ToColorPlanar48}}}, + {{{YUV444P16ToColor24},{YUV444P16ToColorPlanar24}},{{YUV444P16ToColor48},{YUV444P16ToColorPlanar48}}}, + {{{YUV444P16ToColor32},{YUV444P16ToColorPlanar32}},{{YUV444P16ToColor64},{YUV444P16ToColorPlanar64}}}, + {{{YUV444P16ToColor32},{YUV444P16ToColorPlanar32}},{{YUV444P16ToColor64},{YUV444P16ToColorPlanar64}}}, + {{{Y16ToGray8},{Y16ToGray8}},{{Y16ToGray16},{Y16ToGray16}}} + } + }; + + GpuMat yuv_ = getInputMat(yuv, stream); + CV_Assert(yuv_.step <= static_cast(std::numeric_limits::max())); + + const int nRows = static_cast(yuv.rows() / (yuv420 ? 1.5f : 3.0f)); + CV_Assert(!yuv420 || nRows % 2 == 0); + const int nChannels = NumChannels(outputFormat); + const int nRowsOut = nRows * (planar ? nChannels : 1); + const BitDepth bitDepth_ = GetBitDepthOut(bitDepth, yuv.depth()); + const int typeOut = CV_MAKE_TYPE(bitDepth_ == BitDepth::EIGHT ? CV_8U : CV_16U, planar ? 1 : nChannels); + GpuMat out_ = getOutputMat(out, nRowsOut, yuv.cols(), typeOut, stream); + + const int iOutputFormat = OutputColorFormatIdx(outputFormat); + const func_t func = funcs[static_cast(surfaceFormat)][iOutputFormat][static_cast(bitDepth_)][planar]; + if (!func) + CV_Error(Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + CV_Assert(out_.step <= static_cast(std::numeric_limits::max())); + func((uint8_t*)yuv_.ptr(0), static_cast(yuv_.step), (uint8_t*)out_.ptr(0), static_cast(out_.step), out_.cols, nRows, videoFullRangeFlag, StreamAccessor::getStream(stream)); + return true; + } + + +}; + +Ptr cv::cudacodec::createNVSurfaceToColorConverter(const ColorSpaceStandard colorSpace, const bool videoFullRangeFlag) { + return makePtr(colorSpace, videoFullRangeFlag); +} +#endif diff --git a/modules/cudacodec/src/precomp.hpp b/modules/cudacodec/src/precomp.hpp index 004cf85c88d..99a788a0128 100644 --- a/modules/cudacodec/src/precomp.hpp +++ 
b/modules/cudacodec/src/precomp.hpp @@ -82,7 +82,6 @@ #include "frame_queue.hpp" #include "video_decoder.hpp" #include "video_parser.hpp" - #include #endif #if defined(HAVE_NVCUVENC) #include diff --git a/modules/cudacodec/src/video_decoder.cpp b/modules/cudacodec/src/video_decoder.cpp index 10008d9b033..e156e25a705 100644 --- a/modules/cudacodec/src/video_decoder.cpp +++ b/modules/cudacodec/src/video_decoder.cpp @@ -45,6 +45,7 @@ #ifdef HAVE_NVCUVID +#if (CUDART_VERSION < 9000) static const char* GetVideoChromaFormatString(cudaVideoChromaFormat eChromaFormat) { static struct { cudaVideoChromaFormat eChromaFormat; @@ -61,6 +62,7 @@ static const char* GetVideoChromaFormatString(cudaVideoChromaFormat eChromaForma } return "Unknown"; } +#endif void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat) { @@ -68,16 +70,30 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat) AutoLock autoLock(mtx_); videoFormat_ = videoFormat; } - const cudaVideoCodec _codec = static_cast(videoFormat.codec); - const cudaVideoChromaFormat _chromaFormat = static_cast(videoFormat.chromaFormat); + const cudaVideoCodec _codec = static_cast(videoFormat_.codec); + const cudaVideoChromaFormat _chromaFormat = static_cast(videoFormat_.chromaFormat); + + cudaVideoSurfaceFormat surfaceFormat = cudaVideoSurfaceFormat_NV12; +#if (CUDART_VERSION < 9000) if (videoFormat.nBitDepthMinus8 > 0) { - std::ostringstream warning; - warning << "NV12 (8 bit luma, 4 bit chroma) is currently the only supported decoder output format. Video input is " << videoFormat.nBitDepthMinus8 + 8 << " bit " \ + std::ostringstream warning; + warning << "NV12 (8 bit luma, 4 bit chroma) is currently the only supported decoder output format. Video input is " << videoFormat.nBitDepthMinus8 + 8 << " bit " \ << std::string(GetVideoChromaFormatString(_chromaFormat)) << ". 
Truncating luma to 8 bits"; if (videoFormat.chromaFormat != YUV420) warning << " and chroma to 4 bits"; CV_LOG_WARNING(NULL, warning.str()); } +#else + if (_chromaFormat == cudaVideoChromaFormat_420 || _chromaFormat == cudaVideoChromaFormat_Monochrome) + surfaceFormat = videoFormat_.nBitDepthMinus8 ? cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12; + else if (_chromaFormat == cudaVideoChromaFormat_444) + surfaceFormat = videoFormat_.nBitDepthMinus8 ? cudaVideoSurfaceFormat_YUV444_16Bit : cudaVideoSurfaceFormat_YUV444; + else if (_chromaFormat == cudaVideoChromaFormat_422) { + surfaceFormat = videoFormat_.nBitDepthMinus8 ? cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12; + CV_LOG_WARNING(NULL, "YUV 4:2:2 is not currently supported, falling back to YUV 4:2:0."); + } +#endif + const cudaVideoCreateFlags videoCreateFlags = (_codec == cudaVideoCodec_JPEG || _codec == cudaVideoCodec_MPEG2) ? cudaVideoCreate_PreferCUDA : cudaVideoCreate_PreferCUVID; @@ -123,9 +139,25 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat) cuSafeCall(cuCtxPushCurrent(ctx_)); cuSafeCall(cuvidGetDecoderCaps(&decodeCaps)); cuSafeCall(cuCtxPopCurrent(NULL)); - if (!(decodeCaps.bIsSupported && (decodeCaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12)))) { - CV_Error(Error::StsUnsupportedFormat, "Video source is not supported by hardware video decoder refer to Nvidia's GPU Support Matrix to confirm your GPU supports hardware decoding of the video source's codec."); + + if (!decodeCaps.bIsSupported) { + CV_Error(Error::StsUnsupportedFormat, "Video codec is not supported by this GPU hardware video decoder refer to Nvidia's GPU Support Matrix to confirm your GPU supports hardware decoding of the video source's codec."); + } + + if (!(decodeCaps.nOutputFormatMask & (1 << surfaceFormat))) + { + if (decodeCaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12)) + surfaceFormat = cudaVideoSurfaceFormat_NV12; + else if (decodeCaps.nOutputFormatMask & (1 <<
cudaVideoSurfaceFormat_P016)) + surfaceFormat = cudaVideoSurfaceFormat_P016; + else if (decodeCaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444)) + surfaceFormat = cudaVideoSurfaceFormat_YUV444; + else if (decodeCaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444_16Bit)) + surfaceFormat = cudaVideoSurfaceFormat_YUV444_16Bit; + else + CV_Error(Error::StsUnsupportedFormat, "No supported output format found"); } + videoFormat_.surfaceFormat = static_cast(surfaceFormat); if (videoFormat.enableHistogram) { if (!decodeCaps.bIsHistogramSupported) { @@ -168,7 +200,7 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat) createInfo_.ulHeight = videoFormat.ulHeight; createInfo_.ulNumDecodeSurfaces = videoFormat.ulNumDecodeSurfaces; createInfo_.ChromaFormat = _chromaFormat; - createInfo_.OutputFormat = cudaVideoSurfaceFormat_NV12; + createInfo_.OutputFormat = surfaceFormat; createInfo_.DeinterlaceMode = static_cast(videoFormat.deinterlaceMode); createInfo_.ulTargetWidth = videoFormat.width; createInfo_.ulTargetHeight = videoFormat.height; diff --git a/modules/cudacodec/src/video_decoder.hpp b/modules/cudacodec/src/video_decoder.hpp index bea15369011..f77d288051b 100644 --- a/modules/cudacodec/src/video_decoder.hpp +++ b/modules/cudacodec/src/video_decoder.hpp @@ -103,7 +103,9 @@ class VideoDecoder cuSafeCall( cuvidMapVideoFrame(decoder_, picIdx, &ptr, &pitch, &videoProcParams) ); - return cuda::GpuMat(targetHeight() * 3 / 2, targetWidth(), CV_8UC1, (void*) ptr, pitch); + const int height = (videoFormat_.surfaceFormat == cudaVideoSurfaceFormat_NV12 || videoFormat_.surfaceFormat == cudaVideoSurfaceFormat_P016) ? targetHeight() * 3 / 2 : targetHeight() * 3; + const int type = (videoFormat_.surfaceFormat == cudaVideoSurfaceFormat_NV12 || videoFormat_.surfaceFormat == cudaVideoSurfaceFormat_YUV444) ? 
CV_8U : CV_16U; + return cuda::GpuMat(height, targetWidth(), type, (void*) ptr, pitch); } void unmapFrame(cuda::GpuMat& frame) diff --git a/modules/cudacodec/src/video_parser.cpp b/modules/cudacodec/src/video_parser.cpp index 1aba16d585e..597845f01dd 100644 --- a/modules/cudacodec/src/video_parser.cpp +++ b/modules/cudacodec/src/video_parser.cpp @@ -119,6 +119,7 @@ int CUDAAPI cv::cudacodec::detail::VideoParser::HandleVideoSequence(void* userDa FormatInfo newFormat; newFormat.videoFullRangeFlag = format->video_signal_description.video_full_range_flag; + newFormat.colorSpaceStandard = static_cast(format->video_signal_description.matrix_coefficients); newFormat.codec = static_cast(format->codec); newFormat.chromaFormat = static_cast(format->chroma_format); newFormat.nBitDepthMinus8 = format->bit_depth_luma_minus8; diff --git a/modules/cudacodec/src/video_reader.cpp b/modules/cudacodec/src/video_reader.cpp index 5bf9aac91ed..28bbc113163 100644 --- a/modules/cudacodec/src/video_reader.cpp +++ b/modules/cudacodec/src/video_reader.cpp @@ -55,51 +55,6 @@ void cv::cudacodec::MapHist(const GpuMat&, Mat&) { throw_no_cuda(); } #else // HAVE_NVCUVID void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const bool videoFullRangeFlag, cudaStream_t stream); -bool ValidColorFormat(const ColorFormat colorFormat); - -void cvtFromNv12(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const ColorFormat colorFormat, const bool videoFullRangeFlag, - Stream stream) -{ - CV_Assert(decodedFrame.cols == width && decodedFrame.rows == height * 1.5f); - if (colorFormat == ColorFormat::BGRA) { - nv12ToBgra(decodedFrame, outFrame, width, height, videoFullRangeFlag, StreamAccessor::getStream(stream)); - } - else if (colorFormat == ColorFormat::BGR) { - outFrame.create(height, width, CV_8UC3); - Npp8u* pSrc[2] = { decodedFrame.data, &decodedFrame.data[decodedFrame.step * height] }; - NppiSize oSizeROI = { width,height }; - 
cv::cuda::NppStreamHandler h(stream); -#if USE_NPP_STREAM_CTX - if (videoFullRangeFlag) - nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, h)); - else { -#if (CUDART_VERSION < 11000) - nppSafeCall(nppiNV12ToBGR_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, h)); -#else - nppSafeCall(nppiNV12ToBGR_709CSC_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, h)); -#endif - } -#else - if (videoFullRangeFlag) - nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI)); - else { - nppSafeCall(nppiNV12ToBGR_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI)); - } -#endif - } - else if (colorFormat == ColorFormat::GRAY) { - outFrame.create(height, width, CV_8UC1); - if(videoFullRangeFlag) - cudaSafeCall(cudaMemcpy2DAsync(outFrame.ptr(), outFrame.step, decodedFrame.ptr(), decodedFrame.step, width, height, cudaMemcpyDeviceToDevice, StreamAccessor::getStream(stream))); - else { - cv::cuda::subtract(decodedFrame(Rect(0,0,width,height)), 16, outFrame, noArray(), CV_8U, stream); - cv::cuda::multiply(outFrame, 255.0f / 219.0f, outFrame, 1.0, CV_8U, stream); - } - } - else if (colorFormat == ColorFormat::NV_NV12) { - decodedFrame.copyTo(outFrame, stream); - } -} using namespace cv::cudacodec::detail; @@ -124,7 +79,7 @@ namespace bool set(const VideoReaderProps propertyId, const double propertyVal) CV_OVERRIDE; - bool set(const ColorFormat colorFormat_) CV_OVERRIDE; + bool set(const ColorFormat colorFormat, const BitDepth bitDepth = BitDepth::UNCHANGED, const bool planar = false) CV_OVERRIDE; bool get(const VideoReaderProps propertyId, double& propertyVal) const CV_OVERRIDE; bool getVideoReaderProps(const VideoReaderProps propertyId, double& propertyValOut, double propertyValIn) const CV_OVERRIDE; @@ -137,6 +92,7 @@ namespace void releaseFrameInfo(const std::pair& frameInfo); bool 
internalGrab(GpuMat & frame, GpuMat & histogram, Stream & stream); void waitForDecoderInit(); + void cvtFromYuv(const GpuMat& decodedFrame, GpuMat& outFrame, const SurfaceFormat surfaceFormat, const bool videoFullRangeFlag, Stream& stream); Ptr videoSource_; @@ -152,7 +108,10 @@ namespace static const int decodedFrameIdx = 0; static const int extraDataIdx = 1; static const int rawPacketsBaseIdx = 2; + Ptr yuvConverter = 0; ColorFormat colorFormat = ColorFormat::BGRA; + BitDepth bitDepth = BitDepth::UNCHANGED; + bool planar = false; static const String errorMsg; int iFrame = 0; }; @@ -191,9 +150,17 @@ namespace videoSource_->setVideoParser(videoParser_); videoSource_->start(); waitForDecoderInit(); + FormatInfo format = videoDecoder_->format(); + if (format.colorSpaceStandard == ColorSpaceStandard::Unspecified) { + if (format.width > 1280 || format.height > 720) + format.colorSpaceStandard = ColorSpaceStandard::BT709; + else + format.colorSpaceStandard = ColorSpaceStandard::BT601; + } + yuvConverter = createNVSurfaceToColorConverter(format.colorSpaceStandard, format.videoFullRangeFlag); for(iFrame = videoSource_->getFirstFrameIdx(); iFrame < firstFrameIdx; iFrame++) CV_Assert(skipFrame()); - videoSource_->updateFormat(videoDecoder_->format()); + videoSource_->updateFormat(format); } VideoReaderImpl::~VideoReaderImpl() @@ -287,14 +254,13 @@ namespace // map decoded video frame to CUDA surface GpuMat decodedFrame = videoDecoder_->mapFrame(frameInfo.first.picture_index, frameInfo.second); - cvtFromNv12(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight(), colorFormat, videoDecoder_->format().videoFullRangeFlag, stream); - if (fmt.enableHistogram) { const size_t histogramSz = 4 * fmt.nMaxHistogramBins; histogram.create(1, fmt.nMaxHistogramBins, CV_32S); cuSafeCall(cuMemcpyDtoDAsync((CUdeviceptr)(histogram.data), cuHistogramPtr, histogramSz, StreamAccessor::getStream(stream))); } + cvtFromYuv(decodedFrame, frame, 
videoDecoder_->format().surfaceFormat, videoDecoder_->format().videoFullRangeFlag, stream); // unmap video frame // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding) videoDecoder_->unmapFrame(decodedFrame); @@ -350,23 +316,21 @@ namespace } bool ValidColorFormat(const ColorFormat colorFormat) { - if (colorFormat == ColorFormat::BGRA || colorFormat == ColorFormat::BGR || colorFormat == ColorFormat::GRAY || colorFormat == ColorFormat::NV_NV12) + if (colorFormat == ColorFormat::BGRA || colorFormat == ColorFormat::BGR || colorFormat == ColorFormat::RGB || colorFormat == ColorFormat::RGBA || colorFormat == ColorFormat::GRAY || colorFormat == ColorFormat::NV_YUV_SURFACE_FORMAT || colorFormat == ColorFormat::NV_YUV444) return true; return false; } - bool VideoReaderImpl::set(const ColorFormat colorFormat_) { - if (!ValidColorFormat(colorFormat_)) return false; - if (colorFormat_ == ColorFormat::BGR) { -#if (CUDART_VERSION < 9020) - CV_LOG_DEBUG(NULL, "ColorFormat::BGR is not supported until CUDA 9.2, use default ColorFormat::BGRA."); - return false; -#elif (CUDART_VERSION < 11000) - if (!videoDecoder_->format().videoFullRangeFlag) - CV_LOG_INFO(NULL, "Color reproduction may be inaccurate due CUDA version <= 11.0, for better results upgrade CUDA runtime or try ColorFormat::BGRA."); -#endif + bool VideoReaderImpl::set(const ColorFormat colorFormat_, const BitDepth bitDepth_, const bool planar_) { + ColorFormat tmpFormat = colorFormat_; + if (tmpFormat == ColorFormat::NV_NV12) { + CV_LOG_WARNING(NULL, "ColorFormat::NV_NV12 is depreciated forcing ColorFormat::NV_YUV_SURFACE_FORMAT instead."); + tmpFormat = ColorFormat::NV_YUV_SURFACE_FORMAT; } - colorFormat = colorFormat_; + if (!ValidColorFormat(tmpFormat)) return false; + colorFormat = tmpFormat; + bitDepth = bitDepth_; + planar = planar_; return true; } @@ -410,6 +374,12 @@ namespace case VideoReaderProps::PROP_COLOR_FORMAT: propertyVal = static_cast(colorFormat); return true; + 
case VideoReaderProps::PROP_BIT_DEPTH: + propertyVal = static_cast(bitDepth); + return true; + case VideoReaderProps::PROP_PLANAR: + propertyVal = static_cast(planar); + return true; default: break; } @@ -443,6 +413,15 @@ namespace return false; return true; } + + void VideoReaderImpl::cvtFromYuv(const GpuMat& decodedFrame, GpuMat& outFrame, const SurfaceFormat surfaceFormat, const bool videoFullRangeFlag, Stream& stream) + { + if (colorFormat == ColorFormat::NV_YUV_SURFACE_FORMAT) { + decodedFrame.copyTo(outFrame, stream); + return; + } + yuvConverter->convert(decodedFrame, outFrame, surfaceFormat, colorFormat, bitDepth, planar, videoFullRangeFlag, stream); + } } Ptr cv::cudacodec::createVideoReader(const String& filename, const std::vector& sourceParams, const VideoReaderInitParams params) diff --git a/modules/cudacodec/test/test_video.cpp b/modules/cudacodec/test/test_video.cpp index 003fbb7358e..1158a3f6201 100644 --- a/modules/cudacodec/test/test_video.cpp +++ b/modules/cudacodec/test/test_video.cpp @@ -45,6 +45,10 @@ namespace opencv_test { namespace { #if defined(HAVE_NVCUVID) || defined(HAVE_NVCUVENC) +CV_ENUM(ColorFormats, cudacodec::ColorFormat::BGR, cudacodec::ColorFormat::BGRA, cudacodec::ColorFormat::RGB, cudacodec::ColorFormat::RGBA, cudacodec::ColorFormat::GRAY) +CV_ENUM(SurfaceFormats, cudacodec::SurfaceFormat::SF_NV12, cudacodec::SurfaceFormat::SF_P016, cudacodec::SurfaceFormat::SF_YUV444, cudacodec::SurfaceFormat::SF_YUV444_16Bit) +CV_ENUM(BitDepths, cudacodec::BitDepth::UNCHANGED, cudacodec::BitDepth::EIGHT, cudacodec::BitDepth::SIXTEEN) + struct SetDevice : testing::TestWithParam { cv::cuda::DeviceInfo devInfo; @@ -76,7 +80,19 @@ PARAM_TEST_CASE(Video, cv::cuda::DeviceInfo, std::string) }; typedef tuple color_conversion_params_t; -PARAM_TEST_CASE(ColorConversion, cv::cuda::DeviceInfo, cv::cudacodec::ColorFormat, color_conversion_params_t) +PARAM_TEST_CASE(ColorConversionLumaChromaRange, cv::cuda::DeviceInfo, color_conversion_params_t) +{ +}; + 
+PARAM_TEST_CASE(ColorConversionFormat, cv::cuda::DeviceInfo, ColorFormats) +{ +}; + +struct ColorConversionPlanar : SetDevice +{ +}; + +PARAM_TEST_CASE(ColorConversionBitdepth, cv::cuda::DeviceInfo, BitDepths) { }; @@ -117,6 +133,10 @@ struct Seek : SetDevice { }; +PARAM_TEST_CASE(YuvConverter, cv::cuda::DeviceInfo, SurfaceFormats, ColorFormats, BitDepths, bool, bool) +{ +}; + #if defined(HAVE_NVCUVID) ////////////////////////////////////////////////////// // VideoReader @@ -300,38 +320,35 @@ CUDA_TEST_P(Video, Reader) {cudacodec::ColorFormat::GRAY,1}, {cudacodec::ColorFormat::BGR,3}, {cudacodec::ColorFormat::BGRA,4}, - {cudacodec::ColorFormat::NV_NV12,1} + {cudacodec::ColorFormat::NV_YUV_SURFACE_FORMAT,1} }; std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + relativeFilePath; cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); - ASSERT_FALSE(reader->set(cudacodec::ColorFormat::RGB)); cv::cudacodec::FormatInfo fmt = reader->format(); cv::cuda::GpuMat frame; for (int i = 0; i < 10; i++) { - // request a different colour format for each frame const std::pair< cudacodec::ColorFormat, int>& formatToChannels = formatsToChannels[i % formatsToChannels.size()]; ASSERT_TRUE(reader->set(formatToChannels.first)); double colorFormat; ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_COLOR_FORMAT, colorFormat) && static_cast(colorFormat) == formatToChannels.first); ASSERT_TRUE(reader->nextFrame(frame)); - const int height = formatToChannels.first == cudacodec::ColorFormat::NV_NV12 ? static_cast(1.5 * fmt.height) : fmt.height; + const int height = formatToChannels.first == cudacodec::ColorFormat::NV_YUV_SURFACE_FORMAT ? 
static_cast(1.5 * fmt.height) : fmt.height; ASSERT_TRUE(frame.cols == fmt.width && frame.rows == height); ASSERT_FALSE(frame.empty()); ASSERT_TRUE(frame.channels() == formatToChannels.second); } } -CUDA_TEST_P(ColorConversion, Reader) +CUDA_TEST_P(ColorConversionLumaChromaRange, Reader) { cv::cuda::setDevice(GET_PARAM(0).deviceID()); - const cv::cudacodec::ColorFormat colorFormat = GET_PARAM(1); - const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + get<0>(GET_PARAM(2)); - const bool videoFullRangeFlag = get<1>(GET_PARAM(2)); + const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + get<0>(GET_PARAM(1)); + const bool videoFullRangeFlag = get<1>(GET_PARAM(1)); cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); cv::cudacodec::FormatInfo fmt = reader->format(); - reader->set(colorFormat); + reader->set(cudacodec::ColorFormat::BGR); cv::VideoCapture cap(inputFile); cv::cuda::GpuMat frame; @@ -343,11 +360,120 @@ CUDA_TEST_P(ColorConversion, Reader) cap.read(frameHost); fmt = reader->format(); ASSERT_TRUE(fmt.videoFullRangeFlag == videoFullRangeFlag); - if (colorFormat == cv::cudacodec::ColorFormat::BGRA) - cv::cvtColor(frameHost, frameHostGs, COLOR_BGR2BGRA); - else + frameHostGs = frameHost; + EXPECT_MAT_NEAR(frameHostGs, frameFromDevice, 2); + } +} + +CUDA_TEST_P(ColorConversionFormat, Reader) +{ + const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.h264"; + cv::cuda::setDevice(GET_PARAM(0).deviceID()); + const cudacodec::ColorFormat colorFormat = static_cast(static_cast(GET_PARAM(1))); + cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); + double colorFormatGetVal; + ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_COLOR_FORMAT, colorFormatGetVal)); + ASSERT_EQ(cudacodec::ColorFormat::BGRA, static_cast(colorFormatGetVal)); + reader->set(colorFormat); + 
ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_COLOR_FORMAT, colorFormatGetVal)); + ASSERT_EQ(colorFormat, static_cast(colorFormatGetVal)); + cv::VideoCapture cap(inputFile); + + int maxDiff = 2; + cv::cuda::GpuMat frame; + Mat frameHost, frameHostGs, frameFromDevice, unused; + for (int i = 0; i < 10; i++) + { + reader->nextFrame(frame); + frame.download(frameFromDevice); + cap.read(frameHost); + switch (colorFormat) + { + case cudacodec::ColorFormat::BGRA: + cv::cvtColor(frameHost, frameHostGs, cv::COLOR_BGR2BGRA); + break; + case cudacodec::ColorFormat::RGB: + cv::cvtColor(frameHost, frameHostGs, cv::COLOR_BGR2RGB); + break; + case cudacodec::ColorFormat::RGBA: + cv::cvtColor(frameHost, frameHostGs, cv::COLOR_BGR2RGBA); + break; + case cudacodec::ColorFormat::GRAY: + cv::cvtColor(frameHost, frameHostGs, cv::COLOR_BGR2GRAY); + // Increased error because of different conversion pipelines. i.e. frameFromDevice (NV12 -> GRAY) and frameHostGs (NV12 -> BGR -> GRAY). Due to 420 subsampling NV12 -> BGR can increase the luminance of neighbouring pixels if they are significantly different to each other meaning the subsequent conversion BGR -> GRAY will be different to the direct NV12 -> GRAY conversion. 
+ maxDiff = 15; + break; + default: frameHostGs = frameHost; - EXPECT_MAT_NEAR(frameHostGs, frameFromDevice, 2.0); + } + EXPECT_MAT_NEAR(frameHostGs, frameFromDevice, maxDiff); + } +} + +CUDA_TEST_P(ColorConversionPlanar, Reader) +{ + const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.h264"; + cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); + double planarGetVal; + ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_PLANAR, planarGetVal)); + ASSERT_FALSE(static_cast(planarGetVal)); + reader->set(cudacodec::ColorFormat::BGR, cudacodec::BitDepth::UNCHANGED, true); + ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_PLANAR, planarGetVal)); + ASSERT_TRUE(static_cast(planarGetVal)); + cv::VideoCapture cap(inputFile); + + cv::cuda::GpuMat frame; + Mat frameHost, frameHostGs, frameFromDevice; + for (int i = 0; i < 10; i++) + { + reader->nextFrame(frame); + frame.download(frameFromDevice); + cap.read(frameHost); + Mat bgrSplit[3]; + cv::split(frameHost, bgrSplit); + if(i == 0) + frameHostGs = Mat(frameHost.rows * 3, frameHost.cols, CV_8U); + bgrSplit[0].copyTo(frameHostGs(Rect(0, 0, frameHost.cols, frameHost.rows))); + bgrSplit[1].copyTo(frameHostGs(Rect(0, frameHost.rows, frameHost.cols, frameHost.rows))); + bgrSplit[2].copyTo(frameHostGs(Rect(0, 2 * frameHost.rows, frameHost.cols, frameHost.rows))); + EXPECT_MAT_NEAR(frameHostGs, frameFromDevice, 2); + } +} + +CUDA_TEST_P(ColorConversionBitdepth, Reader) +{ + const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.h264"; + cv::cuda::setDevice(GET_PARAM(0).deviceID()); + const cudacodec::BitDepth bitDepth = static_cast(static_cast(GET_PARAM(1))); + cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); + double bitDepthGetVal; + ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_BIT_DEPTH, bitDepthGetVal)); + ASSERT_EQ(cudacodec::BitDepth::UNCHANGED, 
static_cast(bitDepthGetVal)); + reader->set(cudacodec::ColorFormat::BGR, bitDepth); + ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_BIT_DEPTH, bitDepthGetVal)); + ASSERT_EQ(bitDepth, static_cast(bitDepthGetVal)); + cv::VideoCapture cap(inputFile); + + int maxDiff = 2; + cv::cuda::GpuMat frame; + Mat frameHost, frameHostGs, frameFromDevice; + for (int i = 0; i < 10; i++) + { + reader->nextFrame(frame); + frame.download(frameFromDevice); + cap.read(frameHost); + switch (bitDepth) + { + case cudacodec::BitDepth::EIGHT: + default: + frameHostGs = frameHost; + break; + case cudacodec::BitDepth::SIXTEEN: + frameHost.convertTo(frameHostGs, CV_16U); + frameHostGs *= pow(2, 8); + maxDiff = 512; + } + EXPECT_MAT_NEAR(frameHostGs, frameFromDevice, maxDiff); } } @@ -637,24 +763,259 @@ CUDA_TEST_P(Seek, Reader) ASSERT_EQ(iFrame, static_cast(firstFrameIdx+1)); } + +void inline GetConstants(float& wr, float& wb, int& black, int& white, int& uvWhite, int& max, bool fullRange = false) { + if (fullRange) { + black = 0; white = 255; uvWhite = 255; + } + else { + black = 16; white = 235; uvWhite = 240; + } + max = 255; + wr = 0.2990f; wb = 0.1140f; +} + +std::array, 3> getYuv2RgbMatrix(const bool fullRange = false) { + float wr, wb; + int black, white, uvWhite, max; + GetConstants(wr, wb, black, white, uvWhite, max, fullRange); + std::array, 3> mat = { { + {1.0f, 0.0f, (1.0f - wr) / 0.5f}, + {1.0f, -wb * (1.0f - wb) / 0.5f / (1 - wb - wr), -wr * (1 - wr) / 0.5f / (1 - wb - wr)}, + {1.0f, (1.0f - wb) / 0.5f, 0.0f}, + } }; + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + if (j == 0) + mat[i][j] = (float)(1.0 * max / (white - black) * mat[i][j]); + else + mat[i][j] = (float)(1.0 * max / (uvWhite - black) * mat[i][j]); + } + } + return mat; +} + +std::array, 3> getRgb2YuvMatrix(const bool fullRange = false) { + float wr, wb; + int black, white, max, uvWhite; + GetConstants(wr, wb, black, white, uvWhite, max, fullRange); + std::array, 3> mat = { { + {wr, 1.0f - 
wb - wr, wb}, + {-0.5f * wr / (1.0f - wb), -0.5f * (1 - wb - wr) / (1.0f - wb), 0.5f}, + {0.5f, -0.5f * (1.0f - wb - wr) / (1.0f - wr), -0.5f * wb / (1.0f - wr)}, + } }; + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + if (j == 0) + mat[i][j] = (float)(1.0 * (white - black) / max * mat[i][j]); + else + mat[i][j] = (float)(1.0 * (uvWhite - black) / max * mat[i][j]); + } + } + return mat; +} + +void generateGray(Mat bgr, Mat& y, Mat& grayFromY, const bool fullRange) { + Mat yuvI420; + cv::cvtColor(bgr, yuvI420, COLOR_BGR2YUV_I420); + yuvI420(Rect(0, 0, bgr.cols, bgr.rows)).copyTo(y); + if (fullRange) { + y -= 16; + y *= 255.0 / 219.0; + } + y.copyTo(grayFromY); + if (!fullRange) { + grayFromY -= 16; + grayFromY *= 255.0 / 219.0; + } +} + +void generateNv12(Mat bgr, Mat& nv12Interleaved, Mat& bgrFromYuv, const bool fullRange) { + Mat yuvI420; + cv::cvtColor(bgr, yuvI420, COLOR_BGR2YUV_I420); + cv::cvtColor(yuvI420, bgrFromYuv, COLOR_YUV2BGR_I420); + + Mat uv = yuvI420(Rect(0, bgr.rows, bgr.cols, bgr.rows / 2)); + Mat u0 = uv(Rect(0, 0, uv.cols / 2, uv.rows / 2)); + Mat u1 = uv(Rect(uv.cols / 2, 0, uv.cols / 2, uv.rows / 2)); + Mat v0 = uv(Rect(0, uv.rows / 2, uv.cols / 2, uv.rows / 2)); + Mat v1 = uv(Rect(uv.cols / 2, uv.rows / 2, uv.cols / 2, uv.rows / 2)); + + Mat u(uv.rows, uv.cols / 2, CV_8U); + Mat ur0(u0.rows, u0.cols, CV_8U, u.data, u0.cols * 2); + Mat ur1(u0.rows, u0.cols, CV_8U, u.data + u0.cols, u0.cols * 2); + u0.copyTo(ur0); + u1.copyTo(ur1); + + Mat v(uv.rows, uv.cols / 2, CV_8U); + Mat vr0(v0.rows, v0.cols, CV_8U, v.data, v0.cols * 2); + Mat vr1(v0.rows, v0.cols, CV_8U, v.data + v0.cols, v0.cols * 2); + v0.copyTo(vr0); + v1.copyTo(vr1); + + Mat uv2Channel; + Mat uvArray[2] = { u,v }; + cv::merge(uvArray, 2, uv2Channel); + + Mat y = yuvI420(Rect(0, 0, bgr.cols, bgr.rows)); + Mat uvInterleaved(uv2Channel.rows, uv2Channel.cols * 2, CV_8U, uv2Channel.data, uv2Channel.step[0]); + + if (fullRange) { + Mat y32F; + y = (y - 16) * 255.0 / 219.0; 
+ uvInterleaved = (uvInterleaved - 128) * 255.0 / 224.0 + 128; + } + + nv12Interleaved = Mat(yuvI420.size(), CV_8UC1); + y.copyTo(nv12Interleaved(Rect(0, 0, bgr.cols, bgr.rows))); + uvInterleaved.copyTo(nv12Interleaved(Rect(0, bgr.rows, uvInterleaved.cols, uvInterleaved.rows))); +} + +void generateYuv444(Mat bgr, Mat& yuv444, Mat& bgrFromYuv, const bool fullRange) { + std::array, 3> matrix = getRgb2YuvMatrix(fullRange); + const int yAdj = fullRange ? 0 : 16, uvAdj = 128; + Mat bgr32F; + bgr.convertTo(bgr32F, CV_32F); + Mat bgrSplit32F[3]; + cv::split(bgr32F, bgrSplit32F); + Mat yuv32 = Mat(bgr.rows * 3, bgr.cols, CV_32F); + Mat Y = matrix[0][0] * bgrSplit32F[2] + matrix[0][1] * bgrSplit32F[1] + matrix[0][2] * bgrSplit32F[0] + yAdj; + Y.copyTo(yuv32(Rect(0, 0, bgr.cols, bgr.rows))); + Mat U = matrix[1][0] * bgrSplit32F[2] + matrix[1][1] * bgrSplit32F[1] + matrix[1][2] * bgrSplit32F[0] + uvAdj; + U.copyTo(yuv32(Rect(0, bgr.rows, bgr.cols, bgr.rows))); + Mat V = matrix[2][0] * bgrSplit32F[2] + matrix[2][1] * bgrSplit32F[1] + matrix[2][2] * bgrSplit32F[0] + uvAdj; + V.copyTo(yuv32(Rect(0, 2 * bgr.rows, bgr.cols, bgr.rows))); + yuv32.convertTo(yuv444, CV_8UC1); + + Mat y8 = yuv444(Rect(0, 0, bgr.cols, bgr.rows)); + Mat u8 = yuv444(Rect(0, bgr.rows, bgr.cols, bgr.rows)); + Mat v8 = yuv444(Rect(0, 2 * bgr.rows, bgr.cols, bgr.rows)); + y8.convertTo(Y, CV_32F); + u8.convertTo(U, CV_32F); + v8.convertTo(V, CV_32F); + + if (!fullRange) Y -= 16; + U -= 128; + V -= 128; + matrix = getYuv2RgbMatrix(fullRange); + Mat bgrFromYuvSplit32F[3]; + bgrFromYuvSplit32F[0] = matrix[2][0] * Y + matrix[2][1] * U; + bgrFromYuvSplit32F[1] = matrix[1][0] * Y + matrix[1][1] * U + matrix[1][2] * V; + bgrFromYuvSplit32F[2] = matrix[0][0] * Y + matrix[0][2] * V; + Mat bgrFromYuv32F; + cv::merge(bgrFromYuvSplit32F, 3, bgrFromYuv32F); + bgrFromYuv32F.convertTo(bgrFromYuv, CV_8UC3); +} + +void generateTestImages(Mat bgrIn, Mat& testImg, Mat& out, const cudacodec::SurfaceFormat inputFormat, const 
cudacodec::ColorFormat outputFormat, const cudacodec::BitDepth outputBitDepth = cudacodec::BitDepth::EIGHT, bool planar = false, const bool fullRange = false) { + Mat imgOutFromYuv, imgOut8; + Mat yuv8; + + switch (inputFormat) { + case cudacodec::SurfaceFormat::SF_NV12: + case cudacodec::SurfaceFormat::SF_P016: + if (outputFormat == cudacodec::ColorFormat::GRAY) { + yuv8 = Mat(static_cast(bgrIn.rows * 1.5), bgrIn.cols, CV_8U); + Mat y = yuv8(Rect(0, 0, bgrIn.cols, bgrIn.rows)); + generateGray(bgrIn, y, imgOutFromYuv, fullRange); + } + else + generateNv12(bgrIn, yuv8, imgOutFromYuv, fullRange); + break; + case cudacodec::SurfaceFormat::SF_YUV444: + case cudacodec::SurfaceFormat::SF_YUV444_16Bit: + if (outputFormat == cudacodec::ColorFormat::GRAY) { + yuv8 = Mat(bgrIn.rows * 3, bgrIn.cols, CV_8U); + Mat y = yuv8(Rect(0, 0, bgrIn.cols, bgrIn.rows)); + generateGray(bgrIn, y, imgOutFromYuv, fullRange); + } + else + generateYuv444(bgrIn, yuv8, imgOutFromYuv, fullRange); + break; + } + + if (inputFormat == cudacodec::SurfaceFormat::SF_P016 || inputFormat == cudacodec::SurfaceFormat::SF_YUV444_16Bit) { + yuv8.convertTo(testImg, CV_16U); + testImg *= pow(2, 8); + } + else + yuv8.copyTo(testImg); + + switch (outputFormat) { + case cudacodec::ColorFormat::BGR: + imgOut8 = imgOutFromYuv; + break; + case cudacodec::ColorFormat::BGRA: { + cv::cvtColor(imgOutFromYuv, imgOut8, COLOR_BGR2BGRA); + break; + } + case cudacodec::ColorFormat::RGB: { + cv::cvtColor(imgOutFromYuv, imgOut8, COLOR_BGR2RGB); + break; + } + case cudacodec::ColorFormat::RGBA: { + cv::cvtColor(imgOutFromYuv, imgOut8, COLOR_BGR2RGBA); + break; + } + case cudacodec::ColorFormat::GRAY: { + imgOut8 = imgOutFromYuv; + break; + } + } + + Mat imgOutBitDepthOut; + if (outputBitDepth == cudacodec::BitDepth::SIXTEEN) { + imgOut8.convertTo(imgOutBitDepthOut, CV_16U); + imgOutBitDepthOut *= pow(2, 8); + } + else + imgOutBitDepthOut = imgOut8; + + if (planar && outputFormat != cudacodec::ColorFormat::GRAY) { + Mat* 
bgrSplit = new Mat[imgOutBitDepthOut.channels()]; + cv::split(imgOutBitDepthOut, bgrSplit); + const int type = CV_MAKE_TYPE(CV_MAT_DEPTH(imgOutBitDepthOut.flags), 1); + out = Mat(imgOutBitDepthOut.rows * imgOutBitDepthOut.channels(), imgOutBitDepthOut.cols, type); + for (int i = 0; i < imgOut8.channels(); i++) + bgrSplit[i].copyTo(out(Rect(0, i * imgOut8.rows, imgOut8.cols, imgOut8.rows))); + delete[] bgrSplit; + } + else + imgOutBitDepthOut.copyTo(out); +} + +CUDA_TEST_P(YuvConverter, Reader) +{ + cv::cuda::setDevice(GET_PARAM(0).deviceID()); + const cudacodec::SurfaceFormat surfaceFormat = static_cast(static_cast(GET_PARAM(1))); + const cudacodec::ColorFormat outputFormat = static_cast(static_cast(GET_PARAM(2))); + const cudacodec::BitDepth bitDepth = static_cast(static_cast(GET_PARAM(3))); + const bool planar = GET_PARAM(4); + const bool fullRange = GET_PARAM(5); + std::string imgPath = std::string(cvtest::TS::ptr()->get_data_path()) + "../python/images/baboon.jpg"; + Ptr yuvConverter = cudacodec::createNVSurfaceToColorConverter(cv::cudacodec::ColorSpaceStandard::BT601, fullRange); + Mat bgr = imread(imgPath), bgrHost; + Mat nv12Interleaved, bgrFromYuv; + generateTestImages(bgr, nv12Interleaved, bgrFromYuv, surfaceFormat, outputFormat, bitDepth, planar, fullRange); + GpuMat nv12Device(nv12Interleaved), bgrDevice(bgrFromYuv.size(), bgrFromYuv.type()); + yuvConverter->convert(nv12Device, bgrDevice, surfaceFormat, outputFormat, bitDepth, planar, fullRange); + bgrDevice.download(bgrHost); + EXPECT_MAT_NEAR(bgrFromYuv, bgrHost, bitDepth == cudacodec::BitDepth::EIGHT ? 
2 :512); +} + #endif // HAVE_NVCUVID #if defined(HAVE_NVCUVID) && defined(HAVE_NVCUVENC) -struct TransCode : testing::TestWithParam + +struct H264ToH265 : SetDevice { - cv::cuda::DeviceInfo devInfo; - virtual void SetUp() - { - devInfo = GetParam(); - cv::cuda::setDevice(devInfo.deviceID()); - } }; #if defined(WIN32) // remove when FFmpeg wrapper includes PR25874 #define WIN32_WAIT_FOR_FFMPEG_WRAPPER_UPDATE #endif -CUDA_TEST_P(TransCode, H264ToH265) +CUDA_TEST_P(H264ToH265, Transcode) { const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.h264"; constexpr cv::cudacodec::ColorFormat colorFormat = cv::cudacodec::ColorFormat::NV_NV12; @@ -667,14 +1028,13 @@ CUDA_TEST_P(TransCode, H264ToH265) { cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); cv::cudacodec::FormatInfo fmt = reader->format(); - reader->set(cudacodec::ColorFormat::NV_NV12); + reader->set(cudacodec::ColorFormat::NV_YUV_SURFACE_FORMAT); cv::Ptr writer; cv::cuda::GpuMat frame; cv::cuda::Stream stream; for (int i = 0; i < nFrames; ++i) { ASSERT_TRUE(reader->nextFrame(frame, stream)); ASSERT_FALSE(frame.empty()); - Mat tst; frame.download(tst); if (writer.empty()) { frameSz = Size(fmt.width, fmt.height); writer = cv::cudacodec::createVideoWriter(outputFile, frameSz, codec, fps, colorFormat, 0, stream); @@ -703,7 +1063,7 @@ CUDA_TEST_P(TransCode, H264ToH265) ASSERT_EQ(0, remove(outputFile.c_str())); } -INSTANTIATE_TEST_CASE_P(CUDA_Codec, TransCode, ALL_DEVICES); +INSTANTIATE_TEST_CASE_P(CUDA_Codec, H264ToH265, ALL_DEVICES); #endif @@ -920,12 +1280,16 @@ const color_conversion_params_t color_conversion_params[] = color_conversion_params_t("highgui/video/big_buck_bunny_full_color_range.h264", true), }; -#define VIDEO_COLOR_OUTPUTS cv::cudacodec::ColorFormat::BGRA, cv::cudacodec::ColorFormat::BGRA -INSTANTIATE_TEST_CASE_P(CUDA_Codec, ColorConversion, testing::Combine( +INSTANTIATE_TEST_CASE_P(CUDA_Codec, ColorConversionLumaChromaRange, 
testing::Combine( ALL_DEVICES, - testing::Values(VIDEO_COLOR_OUTPUTS), testing::ValuesIn(color_conversion_params))); +INSTANTIATE_TEST_CASE_P(CUDA_Codec, ColorConversionFormat, testing::Combine(ALL_DEVICES, ColorFormats::all())); + +INSTANTIATE_TEST_CASE_P(CUDA_Codec, ColorConversionPlanar, ALL_DEVICES); + +INSTANTIATE_TEST_CASE_P(CUDA_Codec, ColorConversionBitdepth, testing::Combine(ALL_DEVICES, BitDepths::all())); + INSTANTIATE_TEST_CASE_P(CUDA_Codec, ReconfigureDecoderWithScaling, ALL_DEVICES); #define N_DECODE_SURFACES testing::Values(0, 10) @@ -939,7 +1303,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, VideoReadRaw, testing::Combine( const histogram_params_t histogram_params[] = { histogram_params_t("highgui/video/big_buck_bunny.mp4", false), - histogram_params_t("highgui/video/big_buck_bunny.h264", true), + histogram_params_t("highgui/video/big_buck_bunny.h264", false), histogram_params_t("highgui/video/big_buck_bunny_full_color_range.h264", true), }; @@ -975,5 +1339,9 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckInitParams, testing::Combine( INSTANTIATE_TEST_CASE_P(CUDA_Codec, Seek, ALL_DEVICES); +#define BIT_DEPTHS testing::Values(BitDepths(cudacodec::BitDepth::EIGHT), BitDepths(cudacodec::BitDepth::SIXTEEN)) +INSTANTIATE_TEST_CASE_P(CUDA_Codec, YuvConverter, testing::Combine( + ALL_DEVICES, SurfaceFormats::all(), ColorFormats::all(), BIT_DEPTHS, testing::Bool(), testing::Bool())); + #endif // HAVE_NVCUVID || HAVE_NVCUVENC }} // namespace From 2c7591c57f6ec8bbc52f58146132ad59ff0fc573 Mon Sep 17 00:00:00 2001 From: sssanjee-quic Date: Mon, 2 Dec 2024 13:23:50 +0530 Subject: [PATCH 02/14] Merge pull request #3824 from CodeLinaro:FastcvHAL_1stPost Depends on https://github.com/opencv/opencv/pull/26316 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. 
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake --- modules/fastcv/CMakeLists.txt | 13 ++ modules/fastcv/README.md | 7 + modules/fastcv/include/opencv2/fastcv.hpp | 32 +++ .../fastcv/include/opencv2/fastcv/arithm.hpp | 30 +++ .../opencv2/fastcv/bilateralFilter.hpp | 41 ++++ .../fastcv/include/opencv2/fastcv/cluster.hpp | 42 ++++ .../fastcv/include/opencv2/fastcv/draw.hpp | 32 +++ .../fastcv/include/opencv2/fastcv/fast10.hpp | 42 ++++ modules/fastcv/include/opencv2/fastcv/fft.hpp | 47 ++++ .../fastcv/include/opencv2/fastcv/hough.hpp | 33 +++ .../fastcv/include/opencv2/fastcv/moments.hpp | 30 +++ .../fastcv/include/opencv2/fastcv/mser.hpp | 125 +++++++++++ .../fastcv/include/opencv2/fastcv/remap.hpp | 46 ++++ .../fastcv/include/opencv2/fastcv/scale.hpp | 36 ++++ .../fastcv/include/opencv2/fastcv/shift.hpp | 36 ++++ .../fastcv/include/opencv2/fastcv/smooth.hpp | 35 +++ .../fastcv/include/opencv2/fastcv/thresh.hpp | 37 ++++ modules/fastcv/perf/perf_bilateral.cpp | 67 ++++++ .../fastcv/perf/perf_cluster_euclidean.cpp | 79 +++++++ modules/fastcv/perf/perf_fast10.cpp | 42 ++++ modules/fastcv/perf/perf_fill.cpp | 89 ++++++++ modules/fastcv/perf/perf_hough.cpp | 44 ++++ modules/fastcv/perf/perf_main.cpp | 8 + modules/fastcv/perf/perf_matmul.cpp | 40 ++++ modules/fastcv/perf/perf_meanshift.cpp | 60 ++++++ modules/fastcv/perf/perf_mser.cpp | 70 ++++++ modules/fastcv/perf/perf_precomp.hpp | 17 ++ modules/fastcv/perf/perf_threshold_range.cpp | 48 +++++ modules/fastcv/src/arithm.cpp | 36 ++++ modules/fastcv/src/bilateralFilter.cpp | 118 
++++++++++ modules/fastcv/src/cluster_euclidean.cpp | 66 ++++++ modules/fastcv/src/fast10.cpp | 120 +++++++++++ modules/fastcv/src/fft.cpp | 72 +++++++ modules/fastcv/src/fill_poly.cpp | 36 ++++ modules/fastcv/src/hough.cpp | 35 +++ modules/fastcv/src/moments.cpp | 82 +++++++ modules/fastcv/src/mser.cpp | 202 ++++++++++++++++++ modules/fastcv/src/precomp.hpp | 75 +++++++ modules/fastcv/src/remap.cpp | 146 +++++++++++++ modules/fastcv/src/scale.cpp | 64 ++++++ modules/fastcv/src/shift.cpp | 56 +++++ modules/fastcv/src/smooth.cpp | 37 ++++ modules/fastcv/src/thresh.cpp | 39 ++++ modules/fastcv/src/utils.cpp | 12 ++ modules/fastcv/test/test_arithm.cpp | 56 +++++ modules/fastcv/test/test_bilateral.cpp | 38 ++++ .../fastcv/test/test_cluster_euclidean.cpp | 124 +++++++++++ modules/fastcv/test/test_fast10.cpp | 65 ++++++ modules/fastcv/test/test_fft.cpp | 69 ++++++ modules/fastcv/test/test_fill.cpp | 85 ++++++++ modules/fastcv/test/test_hough.cpp | 105 +++++++++ modules/fastcv/test/test_main.cpp | 8 + modules/fastcv/test/test_moments.cpp | 44 ++++ modules/fastcv/test/test_mser.cpp | 178 +++++++++++++++ modules/fastcv/test/test_precomp.hpp | 10 + modules/fastcv/test/test_remap.cpp | 140 ++++++++++++ modules/fastcv/test/test_scale.cpp | 113 ++++++++++ modules/fastcv/test/test_shift.cpp | 75 +++++++ modules/fastcv/test/test_smooth.cpp | 45 ++++ modules/fastcv/test/test_thresh.cpp | 50 +++++ 60 files changed, 3629 insertions(+) create mode 100644 modules/fastcv/CMakeLists.txt create mode 100644 modules/fastcv/README.md create mode 100644 modules/fastcv/include/opencv2/fastcv.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/arithm.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/bilateralFilter.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/cluster.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/draw.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/fast10.hpp create mode 100644 
modules/fastcv/include/opencv2/fastcv/fft.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/hough.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/moments.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/mser.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/remap.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/scale.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/shift.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/smooth.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/thresh.hpp create mode 100644 modules/fastcv/perf/perf_bilateral.cpp create mode 100644 modules/fastcv/perf/perf_cluster_euclidean.cpp create mode 100644 modules/fastcv/perf/perf_fast10.cpp create mode 100644 modules/fastcv/perf/perf_fill.cpp create mode 100644 modules/fastcv/perf/perf_hough.cpp create mode 100644 modules/fastcv/perf/perf_main.cpp create mode 100644 modules/fastcv/perf/perf_matmul.cpp create mode 100644 modules/fastcv/perf/perf_meanshift.cpp create mode 100644 modules/fastcv/perf/perf_mser.cpp create mode 100644 modules/fastcv/perf/perf_precomp.hpp create mode 100644 modules/fastcv/perf/perf_threshold_range.cpp create mode 100644 modules/fastcv/src/arithm.cpp create mode 100644 modules/fastcv/src/bilateralFilter.cpp create mode 100644 modules/fastcv/src/cluster_euclidean.cpp create mode 100644 modules/fastcv/src/fast10.cpp create mode 100644 modules/fastcv/src/fft.cpp create mode 100644 modules/fastcv/src/fill_poly.cpp create mode 100644 modules/fastcv/src/hough.cpp create mode 100644 modules/fastcv/src/moments.cpp create mode 100644 modules/fastcv/src/mser.cpp create mode 100644 modules/fastcv/src/precomp.hpp create mode 100644 modules/fastcv/src/remap.cpp create mode 100644 modules/fastcv/src/scale.cpp create mode 100644 modules/fastcv/src/shift.cpp create mode 100644 modules/fastcv/src/smooth.cpp create mode 100644 modules/fastcv/src/thresh.cpp create mode 100644 
modules/fastcv/src/utils.cpp create mode 100644 modules/fastcv/test/test_arithm.cpp create mode 100644 modules/fastcv/test/test_bilateral.cpp create mode 100644 modules/fastcv/test/test_cluster_euclidean.cpp create mode 100644 modules/fastcv/test/test_fast10.cpp create mode 100644 modules/fastcv/test/test_fft.cpp create mode 100644 modules/fastcv/test/test_fill.cpp create mode 100644 modules/fastcv/test/test_hough.cpp create mode 100644 modules/fastcv/test/test_main.cpp create mode 100644 modules/fastcv/test/test_moments.cpp create mode 100644 modules/fastcv/test/test_mser.cpp create mode 100644 modules/fastcv/test/test_precomp.hpp create mode 100644 modules/fastcv/test/test_remap.cpp create mode 100644 modules/fastcv/test/test_scale.cpp create mode 100644 modules/fastcv/test/test_shift.cpp create mode 100644 modules/fastcv/test/test_smooth.cpp create mode 100644 modules/fastcv/test/test_thresh.cpp diff --git a/modules/fastcv/CMakeLists.txt b/modules/fastcv/CMakeLists.txt new file mode 100644 index 00000000000..c11bd49ec8b --- /dev/null +++ b/modules/fastcv/CMakeLists.txt @@ -0,0 +1,13 @@ +if(HAVE_FASTCV) + set(the_description "Qualcomm FastCV accelerated functions") + ocv_define_module(fastcv opencv_core opencv_imgproc opencv_features2d opencv_video WRAP python java) + ocv_module_include_directories( + "${CMAKE_CURRENT_SOURCE_DIR}/include" + ${FastCV_INCLUDE_PATH}) + + ocv_target_link_libraries(${the_module} ${FASTCV_LIBRARY}) + ocv_target_compile_definitions(${the_module} PRIVATE -DHAVE_FASTCV=1) + ocv_install_3rdparty_licenses(FastCV "${OpenCV_BINARY_DIR}/3rdparty/fastcv/LICENSE") +else() + ocv_module_disable(fastcv) +endif() diff --git a/modules/fastcv/README.md b/modules/fastcv/README.md new file mode 100644 index 00000000000..0c7323c086c --- /dev/null +++ b/modules/fastcv/README.md @@ -0,0 +1,7 @@ +FastCV extension for OpenCV +=========================== + +This module provides wrappers for several FastCV functions not covered by the corresponding HAL in 
OpenCV or have implementation incompatible with OpenCV. +Please note that: +1. This module supports ARM architecture only. This means that CMake script aborts configuration under x86 platform even if you don't want to build binaries for your machine and just want to build docs or enable code analysis in your IDE. In that case you should fix CMakeLists.txt file as told inside it. +2. Test data is stored in misc folder. Before running tests on a device you should copy the content of `misc/` folder to `$YOUR_TESTDATA_PATH/fastcv/` folder on a device. diff --git a/modules/fastcv/include/opencv2/fastcv.hpp b/modules/fastcv/include/opencv2/fastcv.hpp new file mode 100644 index 00000000000..4248a674076 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_HPP +#define OPENCV_FASTCV_HPP + +#include + +/** + * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions + */ + +#include "opencv2/fastcv/arithm.hpp" +#include "opencv2/fastcv/bilateralFilter.hpp" +#include "opencv2/fastcv/cluster.hpp" +#include "opencv2/fastcv/draw.hpp" +#include "opencv2/fastcv/fast10.hpp" +#include "opencv2/fastcv/fft.hpp" +#include "opencv2/fastcv/hough.hpp" +#include "opencv2/fastcv/moments.hpp" +#include "opencv2/fastcv/mser.hpp" +#include "opencv2/fastcv/remap.hpp" +#include "opencv2/fastcv/scale.hpp" +#include "opencv2/fastcv/shift.hpp" +#include "opencv2/fastcv/smooth.hpp" +#include "opencv2/fastcv/thresh.hpp" + +//! @} + +#endif // OPENCV_FASTCV_ARITHM_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/arithm.hpp b/modules/fastcv/include/opencv2/fastcv/arithm.hpp new file mode 100644 index 00000000000..e90079946be --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/arithm.hpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_ARITHM_HPP +#define OPENCV_FASTCV_ARITHM_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Matrix multiplication of two int8_t type matrices + + * @param src1 First source matrix of type CV_8S + * @param src2 Second source matrix of type CV_8S + * @param dst Resulting matrix of type CV_32S + */ +CV_EXPORTS_W void matmuls8s32(InputArray src1, InputArray src2, OutputArray dst); +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_ARITHM_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/bilateralFilter.hpp b/modules/fastcv/include/opencv2/fastcv/bilateralFilter.hpp new file mode 100644 index 00000000000..3210e99f944 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/bilateralFilter.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_BILATERALFILTER_HPP +#define OPENCV_FASTCV_BILATERALFILTER_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Applies Bilateral filter to an image considering d-pixel diameter of each pixel's neighborhood. + This filter does not work inplace. + + * @param _src Intput image with type CV_8UC1 + * @param _dst Destination image with same type as _src + * @param d kernel size (can be 5, 7 or 9) + * @param sigmaColor Filter sigma in the color space. + Typical value is 50.0f. + Increasing this value means increasing the influence of the neighboring pixels of more different color to the smoothing result. + * @param sigmaSpace Filter sigma in the coordinate space. + Typical value is 1.0f. + Increasing this value means increasing the influence of farther neighboring pixels within the kernel size distance to the smoothing result. 
+ * @param borderType border mode used to extrapolate pixels outside of the image + */ +CV_EXPORTS_W void bilateralFilter( InputArray _src, OutputArray _dst, int d, + float sigmaColor, float sigmaSpace, + int borderType = BORDER_DEFAULT ); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_BILATERALFILTER_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/cluster.hpp b/modules/fastcv/include/opencv2/fastcv/cluster.hpp new file mode 100644 index 00000000000..f90deeae465 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/cluster.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_CLUSTER_HPP +#define OPENCV_FASTCV_CLUSTER_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Clusterizes N input points in D-dimensional space into K clusters + * + * @param points Points array of type 8u, each row represets a point. + * Size is N rows by D columns, can be non-continuous. + * @param clusterCenters Initial cluster centers array of type 32f, each row represents a center. + * Size is K rows by D columns, can be non-continuous. + * @param newClusterCenters Resulting cluster centers array of type 32f, each row represents found center. + * Size is set to be K rows by D columns. + * @param clusterSizes Resulting cluster member counts array of type uint32, size is set to be 1 row by K columns. + * @param clusterBindings Resulting points indices array of type uint32, each index tells to which cluster the corresponding point belongs to. + * Size is set to be 1 row by numPointsUsed columns. + * @param clusterSumDists Resulting distance sums array of type 32f, each number is a sum of distances between each cluster center to its belonging points. + * Size is set to be 1 row by K columns + * @param numPointsUsed Number of points to clusterize starting from 0 to numPointsUsed-1 inclusively. 
Sets to N if negative. + */ +CV_EXPORTS_W void clusterEuclidean(InputArray points, InputArray clusterCenters, OutputArray newClusterCenters, + OutputArray clusterSizes, OutputArray clusterBindings, OutputArray clusterSumDists, + int numPointsUsed = -1); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_CLUSTER_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/draw.hpp b/modules/fastcv/include/opencv2/fastcv/draw.hpp new file mode 100644 index 00000000000..baa2b58c930 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/draw.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_DRAW_HPP +#define OPENCV_FASTCV_DRAW_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Draw convex polygon + This function fills the interior of a convex polygon with the specified color. + + * @param img Image to draw on. Should have up to 4 8-bit channels + * @param pts Array of polygon points coordinates. Should contain N two-channel or 2*N one-channel 32-bit integer elements + * @param color Color of drawn polygon stored as B,G,R and A(if supported) + */ +CV_EXPORTS_W void fillConvexPoly(InputOutputArray img, InputArray pts, Scalar color); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_DRAW_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/fast10.hpp b/modules/fastcv/include/opencv2/fastcv/fast10.hpp new file mode 100644 index 00000000000..1d97e9d0df7 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/fast10.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_FAST10_HPP +#define OPENCV_FASTCV_FAST10_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! 
@{ + +/** + * @brief Extracts FAST corners and scores from the image based on the mask. + The mask specifies pixels to be ignored by the detector + + * @param src 8-bit grayscale image + * @param mask Optional mask indicating which pixels should be omited from corner dection. + Its size should be k times image width and height, where k = 1/2, 1/4 , 1/8 , 1, 2, 4 and 8 + For more details see documentation to `fcvCornerFast9InMaskScoreu8` function in FastCV + * @param coords Output array of CV_32S containing interleave x, y positions of detected corners + * @param scores Optional output array containing the scores of the detected corners. + The score is the highest threshold that can still validate the detected corner. + A higher score value indicates a stronger corner feature. + For example, a corner of score 108 is stronger than a corner of score 50 + * @param barrier FAST threshold. The threshold is used to compare difference between intensity value + of the central pixel and pixels on a circle surrounding this pixel + * @param border Number for pixels to ignore from top,bottom,right,left of the image. Defaults to 4 if it's below 4 + * @param nmsEnabled Enable non-maximum suppresion to prune weak key points + */ +CV_EXPORTS_W void FAST10(InputArray src, InputArray mask, OutputArray coords, OutputArray scores, int barrier, int border, bool nmsEnabled); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_FAST10_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/fft.hpp b/modules/fastcv/include/opencv2/fastcv/fft.hpp new file mode 100644 index 00000000000..88901a6a4f8 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/fft.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_FFT_HPP +#define OPENCV_FASTCV_FFT_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! 
@{ + +/** + * @brief Computes the 1D or 2D Fast Fourier Transform of a real valued matrix. + For the 2D case, the width and height of the input and output matrix must be powers of 2. + For the 1D case, the height of the matrices must be 1, while the width must be a power of 2. + + * @param src Input array of CV_8UC1. The dimensions of the matrix must be powers of 2 for the 2D case, + and in the 1D case, the height must be 1, while the width must be a power of 2. + * @param dst The computed FFT matrix of type CV_32FC2. The FFT Re and Im coefficients are stored in different channels. + Hence the dimensions of the dst are (srcWidth, srcHeight) + */ +CV_EXPORTS_W void FFT(InputArray src, OutputArray dst); + +/** + * @brief Computes the 1D or 2D Inverse Fast Fourier Transform of a complex valued matrix. + For the 2D case, The width and height of the input and output matrix must be powers of 2. + For the 1D case, the height of the matrices must be 1, while the width must be a power of 2. + + * @param src Input array of type CV_32FC2 containing FFT Re and Im coefficients stored in separate channels. + The dimensions of the matrix must be powers of 2 for the 2D case, and in the 1D case, the height must be 1, + while the width must be a power of 2. + * @param dst The computed IFFT matrix of type CV_8U. The matrix is real valued and has no imaginary components. + Hence the dimensions of the dst are (srcWidth , srcHeight) + */ +CV_EXPORTS_W void IFFT(InputArray src, OutputArray dst); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_FFT_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/hough.hpp b/modules/fastcv/include/opencv2/fastcv/hough.hpp new file mode 100644 index 00000000000..74f78a10841 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/hough.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_HOUGH_HPP +#define OPENCV_FASTCV_HOUGH_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Performs Hough Line detection + * + * @param src Input 8-bit image containing binary contour. Width and step should be divisible by 8 + * @param lines Output array containing detected lines in a form of (x1, y1, x2, y2) where all numbers are 32-bit floats + * @param threshold Controls the minimal length of a detected line. Value must be between 0.0 and 1.0 + * Values close to 1.0 reduces the number of detected lines. Values close to 0.0 + * detect more lines, but may be noisy. Recommended value is 0.25. + */ +CV_EXPORTS_W void houghLines(InputArray src, OutputArray lines, double threshold = 0.25); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_HOUGH_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/moments.hpp b/modules/fastcv/include/opencv2/fastcv/moments.hpp new file mode 100644 index 00000000000..3cffa62f767 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/moments.hpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_MOMENTS_HPP +#define OPENCV_FASTCV_MOMENTS_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Calculates all of the moments up to the third order of the image pixels' intensities + The results are returned in the structure cv::Moments. + * @param _src Input image with type CV_8UC1, CV_32SC1, CV_32FC1 + * @param binary If 1, binary image (0x00-black, oxff-white); if 0, grayscale image + */ +CV_EXPORTS cv::Moments moments(InputArray _src, bool binary); + +//! 
@} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_MOMENTS_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/mser.hpp b/modules/fastcv/include/opencv2/fastcv/mser.hpp new file mode 100644 index 00000000000..78282b66fdd --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/mser.hpp @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_MSER_HPP +#define OPENCV_FASTCV_MSER_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Structure containing additional information about found contour + * + */ +struct ContourData +{ + uint32_t variation; //!< Variation of a contour from previous grey level + int32_t polarity; //!< Polarity for a contour. This value is 1 if this is a MSER+ region, -1 if this is a MSER- region. + uint32_t nodeId; //!< Node ID for a contour + uint32_t nodeCounter; //!< Node counter for a contour +}; + +/** + * @brief This is an overload for MSER() function + * + * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. + Pixels at the image boundary are not processed. If boundary pixels are important + for a particular application, please consider padding the input image with dummy + pixels of one pixel wide. + * @param contours Array containing found contours + * @param numNeighbors Number of neighbors in contours, can be 4 or 8 + * @param delta Delta to be used in MSER algorithm (the difference in grayscale values + within which the region is stable ). + Typical value range [0.8 8], typical value 2 + * @param minArea Minimum area (number of pixels) of a mser contour. + Typical value range [10 50], typical value 30 + * @param maxArea Maximum area (number of pixels) of a mser contour. + Typical value 14400 or 0.25*width*height + * @param maxVariation Maximum variation in grayscale between 2 levels allowed. 
+ Typical value range [0.1 1.0], typical value 0.15 + * @param minDiversity Minimum diversity in grayscale between 2 levels allowed. + Typical value range [0.1 1.0], typical value 0.2 + */ +CV_EXPORTS void MSER(InputArray src, std::vector>& contours, + unsigned int numNeighbors = 4, + unsigned int delta = 2, + unsigned int minArea = 30, + unsigned int maxArea = 14400, + float maxVariation = 0.15f, + float minDiversity = 0.2f); + +/** + * @brief This is an overload for MSER() function + * + * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. + Pixels at the image boundary are not processed. If boundary pixels are important + for a particular application, please consider padding the input image with dummy + pixels of one pixel wide. + * @param contours Array containing found contours + * @param boundingBoxes Array containing bounding boxes of found contours + * @param numNeighbors Number of neighbors in contours, can be 4 or 8 + * @param delta Delta to be used in MSER algorithm (the difference in grayscale values + within which the region is stable ). + Typical value range [0.8 8], typical value 2 + * @param minArea Minimum area (number of pixels) of a mser contour. + Typical value range [10 50], typical value 30 + * @param maxArea Maximum area (number of pixels) of a mser contour. + Typical value 14400 or 0.25*width*height + * @param maxVariation Maximum variation in grayscale between 2 levels allowed. + Typical value range [0.1 1.0], typical value 0.15 + * @param minDiversity Minimum diversity in grayscale between 2 levels allowed. 
+ Typical value range [0.1 1.0], typical value 0.2 + */ +CV_EXPORTS void MSER(InputArray src, std::vector>& contours, std::vector& boundingBoxes, + unsigned int numNeighbors = 4, + unsigned int delta = 2, + unsigned int minArea = 30, + unsigned int maxArea = 14400, + float maxVariation = 0.15f, + float minDiversity = 0.2f); + +/** + * @brief Runs MSER blob detector on the grayscale image + * + * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. + Pixels at the image boundary are not processed. If boundary pixels are important + for a particular application, please consider padding the input image with dummy + pixels of one pixel wide. + * @param contours Array containing found contours + * @param boundingBoxes Array containing bounding boxes of found contours + * @param contourData Array containing additional information about found contours + * @param numNeighbors Number of neighbors in contours, can be 4 or 8 + * @param delta Delta to be used in MSER algorithm (the difference in grayscale values + within which the region is stable ). + Typical value range [0.8 8], typical value 2 + * @param minArea Minimum area (number of pixels) of a mser contour. + Typical value range [10 50], typical value 30 + * @param maxArea Maximum area (number of pixels) of a mser contour. + Typical value 14400 or 0.25*width*height + * @param maxVariation Maximum variation in grayscale between 2 levels allowed. + Typical value range [0.1 1.0], typical value 0.15 + * @param minDiversity Minimum diversity in grayscale between 2 levels allowed. + Typical value range [0.1 1.0], typical value 0.2 + */ +CV_EXPORTS void MSER(InputArray src, std::vector>& contours, std::vector& boundingBoxes, + std::vector& contourData, + unsigned int numNeighbors = 4, + unsigned int delta = 2, + unsigned int minArea = 30, + unsigned int maxArea = 14400, + float maxVariation = 0.15f, + float minDiversity = 0.2f); + +//! 
@} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_MSER_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/remap.hpp b/modules/fastcv/include/opencv2/fastcv/remap.hpp new file mode 100644 index 00000000000..6482fb2b26c --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/remap.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_REMAP_HPP +#define OPENCV_FASTCV_REMAP_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Applies a generic geometrical transformation to a greyscale CV_8UC1 image. + * @param src The first input image data, type CV_8UC1 + * @param dst The output image data, type CV_8UC1 + * @param map1 Floating-point CV_32FC1 matrix with each element as the column coordinate of the mapped location in the source image + * @param map2 Floating-point CV_32FC1 matrix with each element as the row coordinate of the mapped location in the source image. + * @param interpolation Only INTER_NEAREST and INTER_LINEAR interpolation is supported + * @param borderValue constant pixel value +*/ +CV_EXPORTS_W void remap( InputArray src, OutputArray dst, + InputArray map1, InputArray map2, + int interpolation, int borderValue=0); + +/** + * @brief Applies a generic geometrical transformation to a 4-channel CV_8UC4 image with bilinear or nearest neighbor interpolation + * @param src The first input image data, type CV_8UC4 + * @param dst The output image data, type CV_8UC4 + * @param map1 Floating-point CV_32FC1 matrix with each element as the column coordinate of the mapped location in the source image + * @param map2 Floating-point CV_32FC1 matrix with each element as the row coordinate of the mapped location in the source image. 
+ * @param interpolation Only INTER_NEAREST and INTER_LINEAR interpolation is supported +*/ +CV_EXPORTS_W void remapRGBA( InputArray src, OutputArray dst, + InputArray map1, InputArray map2, int interpolation); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_REMAP_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/scale.hpp b/modules/fastcv/include/opencv2/fastcv/scale.hpp new file mode 100644 index 00000000000..e499f6f3b7d --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/scale.hpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_SCALE_HPP +#define OPENCV_FASTCV_SCALE_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Down-scale the image by averaging each 2x2 pixel block. + * @param src The first input image data, type CV_8UC1, src height must be a multiple of 2 + * @param dst The output image data, type CV_8UC1 +*/ +CV_EXPORTS_W void resizeDownBy2(cv::InputArray _src, cv::OutputArray _dst); + +/** + * @brief Down-scale the image by averaging each 4x4 pixel block. + * @param src The first input image data, type CV_8UC1, src height must be a multiple of 4 + * @param dst The output image data, type CV_8UC1 +*/ +CV_EXPORTS_W void resizeDownBy4(cv::InputArray _src, cv::OutputArray _dst); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_SCALE_HPP \ No newline at end of file diff --git a/modules/fastcv/include/opencv2/fastcv/shift.hpp b/modules/fastcv/include/opencv2/fastcv/shift.hpp new file mode 100644 index 00000000000..a545789f199 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/shift.hpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_SHIFT_HPP +#define OPENCV_FASTCV_SHIFT_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Applies the meanshift procedure and obtains the final converged position. + This function applies the meanshift procedure to an original image (usually a probability image) + and obtains the final converged position. The converged position search will stop either it has reached + the required accuracy or the maximum number of iterations. + + * @param src 8-bit grayscale image which is usually a probability image computed based on object histogram + * @param rect Initial search window position which also returns the final converged window position + * @param termCrit The criteria used to finish the MeanShift which consists of two termination criteria: + * 1) epsilon: required accuracy; 2) max_iter: maximum number of iterations + * @return Iteration number at which the loop stopped + */ +CV_EXPORTS_W int meanShift(InputArray src, Rect& rect, TermCriteria termCrit); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_SHIFT_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/smooth.hpp b/modules/fastcv/include/opencv2/fastcv/smooth.hpp new file mode 100644 index 00000000000..a3cee45a3ce --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/smooth.hpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_SMOOTH_HPP +#define OPENCV_FASTCV_SMOOTH_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Recursive Bilateral Filtering + +Different from traditional bilateral filtering, here the smoothing is actually performed in gradient domain. +The algorithm claims that it's more efficient than the original bilateral filtering in both image quality and computation. 
+See algorithm description in the paper Recursive Bilateral Filtering, ECCV2012 by Prof Yang Qingxiong + * @param src Input image, should have one CV_8U channel + * @param dst Output array having one CV_8U channel + * @param sigmaColor Sigma in the color space, the bigger the value the more color difference is smoothed by the algorithm + * @param sigmaSpace Sigma in the coordinate space, the bigger the value the more distant pixels are smoothed + */ +CV_EXPORTS_W void bilateralRecursive(cv::InputArray src, cv::OutputArray dst, float sigmaColor = 0.03f, float sigmaSpace = 0.1f); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_SMOOTH_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/thresh.hpp b/modules/fastcv/include/opencv2/fastcv/thresh.hpp new file mode 100644 index 00000000000..878761d75d5 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/thresh.hpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_THRESH_HPP +#define OPENCV_FASTCV_THRESH_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Binarizes a grayscale image based on a pair of threshold values. 
The binarized image will be in the two values + * selected by user + + * @param src 8-bit grayscale image + * @param dst Output image of the same size and type as input image, can be the same as input image + * @param lowThresh The lower threshold value for binarization + * @param highThresh The higher threshold value for binarization + * @param trueValue The value assigned to the destination pixel if the source is within the range inclusively defined by the + * pair of threshold values + * @param falseValue The value assigned to the destination pixel if the source is out of the range inclusively defined by the + * pair of threshold values + */ +CV_EXPORTS_W void thresholdRange(InputArray src, OutputArray dst, uint8_t lowThresh, uint8_t highThresh, uint8_t trueValue, uint8_t falseValue); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_THRESH_HPP diff --git a/modules/fastcv/perf/perf_bilateral.cpp b/modules/fastcv/perf/perf_bilateral.cpp new file mode 100644 index 00000000000..bb985da391d --- /dev/null +++ b/modules/fastcv/perf/perf_bilateral.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef std::tuple BilateralPerfParams; +typedef perf::TestBaseWithParam BilateralPerfTest; + +PERF_TEST_P(BilateralPerfTest, run, + ::testing::Combine(::testing::Values(0.01f, 0.03f, 0.1f, 1.f, 5.f), + ::testing::Values(0.01f, 0.05f, 0.1f, 1.f, 5.f)) + ) +{ + auto p = GetParam(); + float sigmaColor = std::get<0>(p); + float sigmaSpace = std::get<1>(p); + + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + Mat dst; + + while(next()) + { + startTimer(); + cv::fastcv::bilateralRecursive(src, dst, sigmaColor, sigmaSpace); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +typedef std::tuple BilateralPerfParams2; +typedef perf::TestBaseWithParam BilateralPerfTest2; + + +PERF_TEST_P(BilateralPerfTest2, run, + ::testing::Combine(::testing::Values(0.01f, 0.03f, 0.1f, 1.f, 5.f), + ::testing::Values(0.01f, 0.05f, 0.1f, 1.f, 5.f), + ::testing::Values(Size(8, 8), Size(640, 480), Size(800, 600)), + ::testing::Values(5, 7, 9)) + ) +{ + auto p = GetParam(); + float sigmaColor = std::get<0>(p); + float sigmaSpace = std::get<1>(p); + cv::Size size = std::get<2>(p); + int d = get<3>(p); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + Mat dst; + + for (; next(); ) + { + startTimer(); + cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace diff --git a/modules/fastcv/perf/perf_cluster_euclidean.cpp b/modules/fastcv/perf/perf_cluster_euclidean.cpp new file mode 100644 index 00000000000..20bc31a7aa9 --- /dev/null +++ b/modules/fastcv/perf/perf_cluster_euclidean.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef std::tuple ClusterEuclideanPerfParams; +typedef perf::TestBaseWithParam ClusterEuclideanPerfTest; + +PERF_TEST_P(ClusterEuclideanPerfTest, run, + ::testing::Combine(::testing::Values(100, 1000, 10000), // nPts + ::testing::Values(2, 10, 32), // nDims + ::testing::Values(5, 10, 16)) // nClusters + ) +{ + auto p = GetParam(); + int nPts = std::get<0>(p); + int nDims = std::get<1>(p); + int nClusters = std::get<2>(p); + + Mat points(nPts, nDims, CV_8U); + Mat clusterCenters(nClusters, nDims, CV_32F); + + Mat trueMeans(nClusters, nDims, CV_32F); + Mat stddevs(nClusters, nDims, CV_32F); + std::vector trueClusterSizes(nClusters, 0); + std::vector trueClusterBindings(nPts, 0); + std::vector trueSumDists(nClusters, 0); + + cv::RNG& rng = cv::theRNG(); + for (int i = 0; i < nClusters; i++) + { + Mat mean(1, nDims, CV_64F), stdev(1, nDims, CV_64F); + rng.fill(mean, cv::RNG::UNIFORM, 0, 256); + rng.fill(stdev, cv::RNG::UNIFORM, 5.f, 16); + int lo = i * nPts / nClusters; + int hi = (i + 1) * nPts / nClusters; + + for (int d = 0; d < nDims; d++) + { + rng.fill(points.col(d).rowRange(lo, hi), cv::RNG::NORMAL, + mean.at(d), stdev.at(d)); + } + + float sd = 0; + for (int j = lo; j < hi; j++) + { + Mat pts64f; + points.row(j).convertTo(pts64f, CV_64F); + sd += cv::norm(mean, pts64f, NORM_L2); + trueClusterBindings.at(j) = i; + trueClusterSizes.at(i)++; + } + trueSumDists.at(i) = sd; + + // let's shift initial cluster center a bit + Mat(mean + stdev * 0.5).copyTo(clusterCenters.row(i)); + + mean.copyTo(trueMeans.row(i)); + stdev.copyTo(stddevs.row(i)); + } + + while(next()) + { + Mat newClusterCenters; + std::vector clusterSizes, clusterBindings; + std::vector clusterSumDists; + startTimer(); + cv::fastcv::clusterEuclidean(points, clusterCenters, newClusterCenters, clusterSizes, clusterBindings, clusterSumDists); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // 
namespace diff --git a/modules/fastcv/perf/perf_fast10.cpp b/modules/fastcv/perf/perf_fast10.cpp new file mode 100644 index 00000000000..0d9111a88f2 --- /dev/null +++ b/modules/fastcv/perf/perf_fast10.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef std::tuple FAST10PerfParams; +typedef perf::TestBaseWithParam FAST10PerfTest; + +PERF_TEST_P(FAST10PerfTest, run, +::testing::Combine(::testing::Bool(), // useScores + ::testing::Values(10, 30, 50), // barrier + ::testing::Values( 4, 10, 32), // border + ::testing::Bool() // nonmax suppression + ) + ) +{ + auto p = GetParam(); + bool useScores = std::get<0>(p); + int barrier = std::get<1>(p); + int border = std::get<2>(p); + bool nmsEnabled = std::get<3>(p); + + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + std::vector coords, scores; + while(next()) + { + coords.clear(); + scores.clear(); + startTimer(); + cv::fastcv::FAST10(src, noArray(), coords, useScores ? scores : noArray(), barrier, border, nmsEnabled); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace diff --git a/modules/fastcv/perf/perf_fill.cpp b/modules/fastcv/perf/perf_fill.cpp new file mode 100644 index 00000000000..3a2056ef8f3 --- /dev/null +++ b/modules/fastcv/perf/perf_fill.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef tuple FillConvexPerfParams; +typedef perf::TestBaseWithParam FillConvexPerfTest; + +PERF_TEST_P(FillConvexPerfTest, randomDraw, Combine( + testing::Values(Size(640, 480), Size(512, 512), Size(1920, 1080)), + testing::Values(4, 64, 1024), + testing::Values(1, 2, 3, 4) + )) +{ + auto p = GetParam(); + + Size imgSize = std::get<0>(p); + int nPts = std::get<1>(p); + int channels = std::get<2>(p); + + cv::RNG rng = cv::theRNG(); + + std::vector allPts, contour; + for (int i = 0; i < nPts; i++) + { + allPts.push_back(Point(rng() % imgSize.width, rng() % imgSize.height)); + } + cv::convexHull(allPts, contour); + + Scalar color(rng() % 256, rng() % 256, rng() % 256); + + Mat img(imgSize, CV_MAKE_TYPE(CV_8U, channels), Scalar(0)); + + while(next()) + { + img = 0; + startTimer(); + cv::fastcv::fillConvexPoly(img, contour, color); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(FillConvexPerfTest, circle, Combine( + testing::Values(Size(640, 480), Size(512, 512), Size(1920, 1080)), + testing::Values(4, 64, 1024), + testing::Values(1, 2, 3, 4) + )) +{ + auto p = GetParam(); + + Size imgSize = std::get<0>(p); + int nPts = std::get<1>(p); + int channels = std::get<2>(p); + + cv::RNG rng = cv::theRNG(); + + float r = std::min(imgSize.width, imgSize.height) / 2 * 0.9f; + float angle = CV_PI * 2.0f / (float)nPts; + std::vector contour; + for (int i = 0; i < nPts; i++) + { + Point2f pt(r * cos((float)i * angle), + r * sin((float)i * angle)); + contour.push_back({ imgSize.width / 2 + int(pt.x), + imgSize.height / 2 + int(pt.y)}); + } + Scalar color(rng() % 256, rng() % 256, rng() % 256); + + Mat img(imgSize, CV_MAKE_TYPE(CV_8U, channels), Scalar(0)); + + while(next()) + { + img = 0; + startTimer(); + cv::fastcv::fillConvexPoly(img, contour, color); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + + +} // namespace diff --git 
a/modules/fastcv/perf/perf_hough.cpp b/modules/fastcv/perf/perf_hough.cpp new file mode 100644 index 00000000000..78424a696dc --- /dev/null +++ b/modules/fastcv/perf/perf_hough.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef std::tuple HoughLinesPerfParams; +typedef perf::TestBaseWithParam HoughLinesPerfTest; + +PERF_TEST_P(HoughLinesPerfTest, run, + ::testing::Combine(::testing::Values("cv/shared/pic5.png", + "stitching/a1.png", + "cv/shared/pic5.png", + "cv/shared/pic1.png"), // images + ::testing::Values(0.05, 0.25, 0.5, 0.75, 5) // threshold + ) + ) +{ + auto p = GetParam(); + std::string fname = std::get<0>(p); + double threshold = std::get<1>(p); + + cv::Mat src = imread(cvtest::findDataFile(fname), cv::IMREAD_GRAYSCALE); + // make it aligned by 8 + cv::Mat withBorder; + int bpix = ((src.cols & 0xfffffff8) + 8) - src.cols; + cv::copyMakeBorder(src, withBorder, 0, 0, 0, bpix, BORDER_REFLECT101); + src = withBorder; + + while(next()) + { + std::vector lines; + startTimer(); + cv::fastcv::houghLines(src, lines, threshold); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace diff --git a/modules/fastcv/perf/perf_main.cpp b/modules/fastcv/perf/perf_main.cpp new file mode 100644 index 00000000000..a6824dfb007 --- /dev/null +++ b/modules/fastcv/perf/perf_main.cpp @@ -0,0 +1,8 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +CV_PERF_TEST_MAIN(fastcv) diff --git a/modules/fastcv/perf/perf_matmul.cpp b/modules/fastcv/perf/perf_matmul.cpp new file mode 100644 index 00000000000..83af7618b31 --- /dev/null +++ b/modules/fastcv/perf/perf_matmul.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef std::tuple MatMulPerfParams; +typedef perf::TestBaseWithParam MatMulPerfTest; + +PERF_TEST_P(MatMulPerfTest, run, + ::testing::Combine(::testing::Values(8, 16, 128, 256), // rows1 + ::testing::Values(8, 16, 128, 256), // cols1 + ::testing::Values(8, 16, 128, 256)) // cols2 + ) +{ + auto p = GetParam(); + int rows1 = std::get<0>(p); + int cols1 = std::get<1>(p); + int cols2 = std::get<2>(p); + + RNG& rng = cv::theRNG(); + Mat src1(rows1, cols1, CV_8SC1), src2(cols1, cols2, CV_8SC1); + cvtest::randUni(rng, src1, Scalar::all(-128), Scalar::all(128)); + cvtest::randUni(rng, src2, Scalar::all(-128), Scalar::all(128)); + + Mat dst; + while(next()) + { + startTimer(); + cv::fastcv::matmuls8s32(src1, src2, dst); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace diff --git a/modules/fastcv/perf/perf_meanshift.cpp b/modules/fastcv/perf/perf_meanshift.cpp new file mode 100644 index 00000000000..e98c246d148 --- /dev/null +++ b/modules/fastcv/perf/perf_meanshift.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef std::tuple MeanShiftPerfParams; +typedef perf::TestBaseWithParam MeanShiftPerfTest; + +PERF_TEST_P(MeanShiftPerfTest, run, + ::testing::Combine(::testing::Values(Size(128, 128), Size(640, 480), Size(800, 600)), + ::testing::Values(CV_8U, CV_32S, CV_32F), // type + ::testing::Values(2, 10, 100), // nIterations + ::testing::Values(0.01f, 0.1f, 1.f, 10.f), // epsilon + ::testing::Values(Size(8, 8), Size(13, 48), Size(64, 64)) // window size + ) + ) +{ + auto p = GetParam(); + cv::Size size = std::get<0>(p); + MatType type = std::get<1>(p); + int iters = std::get<2>(p); + float eps = std::get<3>(p); + Size winSize = std::get<4>(p); + + RNG& rng = cv::theRNG(); + + const int nPts = 20; + Mat ptsMap(size, CV_8UC1, Scalar(255)); + for(size_t i = 0; i < nPts; ++i) + { + ptsMap.at(rng() % size.height, rng() % size.width) = 0; + } + Mat distTrans(size, CV_8UC1); + cv::distanceTransform(ptsMap, distTrans, DIST_L2, DIST_MASK_PRECISE); + Mat vsrc = 255 - distTrans; + Mat src; + vsrc.convertTo(src, type); + + Point startPt(rng() % (size.width - winSize.width), + rng() % (size.height - winSize.height)); + Rect startRect(startPt, winSize); + + cv::TermCriteria termCrit( TermCriteria::EPS + TermCriteria::MAX_ITER, iters, eps); + + Rect window = startRect; + while(next()) + { + startTimer(); + cv::fastcv::meanShift(src, window, termCrit); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace diff --git a/modules/fastcv/perf/perf_mser.cpp b/modules/fastcv/perf/perf_mser.cpp new file mode 100644 index 00000000000..4e1a6ce80af --- /dev/null +++ b/modules/fastcv/perf/perf_mser.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +// we use such nested structure to combine test values +typedef std::tuple< std::tuple, + int /* numNeighbors */, std::string /*file path*/> MSERPerfParams; +typedef perf::TestBaseWithParam MSERPerfTest; + +PERF_TEST_P(MSERPerfTest, run, + ::testing::Combine(::testing::Values(std::tuple { true, false}, + std::tuple {false, false}, + std::tuple { true, true} + ), // useBboxes, useContourData + ::testing::Values(4, 8), // numNeighbors + ::testing::Values("cv/shared/baboon.png", "cv/mser/puzzle.png") + ) + ) +{ + auto p = GetParam(); + bool useBboxes = std::get<0>(std::get<0>(p)); + bool useContourData = std::get<1>(std::get<0>(p)); + int numNeighbors = std::get<1>(p); // 4 or 8 + std::string imgPath = std::get<2>(p); + + cv::Mat src = imread(cvtest::findDataFile(imgPath), cv::IMREAD_GRAYSCALE); + + unsigned int delta = 2; + unsigned int minArea = 256; + unsigned int maxArea = (int)src.total()/4; + float maxVariation = 0.15f; + float minDiversity = 0.2f; + + while(next()) + { + std::vector> contours; + std::vector bboxes; + std::vector contourData; + + startTimer(); + if (useBboxes) + { + if (useContourData) + { + cv::fastcv::MSER(src, contours, bboxes, contourData, numNeighbors, + delta, minArea, maxArea, maxVariation, minDiversity); + } + else + { + cv::fastcv::MSER(src, contours, bboxes, numNeighbors, + delta, minArea, maxArea, maxVariation, minDiversity); + } + } + else + { + cv::fastcv::MSER(src, contours, numNeighbors, + delta, minArea, maxArea, maxVariation, minDiversity); + } + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace diff --git a/modules/fastcv/perf/perf_precomp.hpp b/modules/fastcv/perf/perf_precomp.hpp new file mode 100644 index 00000000000..e052a0098e2 --- /dev/null +++ b/modules/fastcv/perf/perf_precomp.hpp @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef __FASTCV_EXT_PERF_PRECOMP_HPP__ +#define __FASTCV_EXT_PERF_PRECOMP_HPP__ + +#include +#include +#include + +namespace opencv_test { +using namespace perf; +} // namespace + +#endif diff --git a/modules/fastcv/perf/perf_threshold_range.cpp b/modules/fastcv/perf/perf_threshold_range.cpp new file mode 100644 index 00000000000..a68e3b9f8d8 --- /dev/null +++ b/modules/fastcv/perf/perf_threshold_range.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef std::tuple ThresholdRangePerfParams; +typedef perf::TestBaseWithParam ThresholdRangePerfTest; + +PERF_TEST_P(ThresholdRangePerfTest, run, + ::testing::Combine(::testing::Values(Size(8, 8), Size(640, 480), Size(800, 600)), + ::testing::Values(0, 15, 128, 255), // lowThresh + ::testing::Values(0, 15, 128, 255), // highThresh + ::testing::Values(0, 15, 128, 255), // trueValue + ::testing::Values(0, 15, 128, 255) // falseValue + ) + ) +{ + auto p = GetParam(); + cv::Size size = std::get<0>(p); + int loThresh = std::get<1>(p); + int hiThresh = std::get<2>(p); + int trueValue = std::get<3>(p); + int falseValue = std::get<4>(p); + + int lowThresh = std::min(loThresh, hiThresh); + int highThresh = std::max(loThresh, hiThresh); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + Mat dst; + while(next()) + { + startTimer(); + cv::fastcv::thresholdRange(src, dst, lowThresh, highThresh, trueValue, falseValue); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + + +} // namespace diff --git a/modules/fastcv/src/arithm.cpp b/modules/fastcv/src/arithm.cpp new file mode 100644 index 00000000000..bf8077cbe7b --- /dev/null +++ b/modules/fastcv/src/arithm.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void matmuls8s32(InputArray _src1, InputArray _src2, OutputArray _dst) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src1.empty() && _src1.type() == CV_8SC1); + CV_Assert(_src1.cols() <= 131072); + CV_Assert(_src1.step() % 8 == 0); + CV_Assert(_src1.cols() == _src2.rows()); + Mat src1 = _src1.getMat(); + + CV_Assert(!_src2.empty() && _src2.type() == CV_8SC1); + CV_Assert(_src2.step() % 8 == 0); + Mat src2 = _src2.getMat(); + + _dst.create(_src1.rows(), _src2.cols(), CV_32SC1); + // in case of fixed layout array we cannot fix this on our side, can only fail if false + CV_Assert(_dst.step() % 8 == 0); + Mat dst = _dst.getMat(); + + fcvMatrixMultiplys8s32((const int8_t*)src1.data, src1.cols, src1.rows, src1.step, + (const int8_t*)src2.data, src2.cols, src2.step, + (int32_t*)dst.data, dst.step); +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/bilateralFilter.cpp b/modules/fastcv/src/bilateralFilter.cpp new file mode 100644 index 00000000000..1cd0ece6b14 --- /dev/null +++ b/modules/fastcv/src/bilateralFilter.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +class FcvFilterLoop_Invoker : public cv::ParallelLoopBody +{ +public: + + FcvFilterLoop_Invoker(cv::Mat src_, size_t src_step_, cv::Mat dst_, size_t dst_step_, int width_, int height_, int bdr_, int knl_, float32_t sigma_color_, float32_t sigma_space_) : + cv::ParallelLoopBody(), src_step(src_step_), dst_step(dst_step_), width(width_), height(height_), + bdr(bdr_), knl(knl_), sigma_color(sigma_color_), sigma_space(sigma_space_), src(src_), dst(dst_) + { + } + + virtual void operator()(const cv::Range& range) const CV_OVERRIDE + { + + fcvStatus status = FASTCV_SUCCESS; + int height_ = range.end - range.start; + int width_ = width; + cv::Mat src_; + int n = knl/2; + + if(range.start == 0 && range.end == height) + { + src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); + cv::copyMakeBorder(src, src_, n, n, n, n, bdr); + } + else if(range.start == 0) + { + src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); + cv::copyMakeBorder(src(cv::Rect(0, 0, width_, height_ + n)), src_, n, 0, n, n, bdr); + } + else if(range.end == (height)) + { + src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); + cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + n)), src_, 0, n, n, n, bdr); + } + else + { + src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); + cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + 2*n)), src_, 0, 0, n, n, bdr); + } + + + cv::Mat dst_padded = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); + + if(knl == 5) + status = fcvBilateralFilter5x5u8_v3(src_.data, width_ + 2*n, height_ + 2*n, width_ + 2*n, + dst_padded.data, width_ + 2*n, sigma_color, sigma_space, 0); + else if(knl == 7) + status = fcvBilateralFilter7x7u8_v3(src_.data, width_ + 2*n, height_ + 2*n, width_ + 2*n, + dst_padded.data, width_ + 2*n, sigma_color, sigma_space, 0); + else if(knl == 9) + status = fcvBilateralFilter9x9u8_v3(src_.data, width_ + 2*n, height_ + 
2*n, width_ + 2*n, + dst_padded.data, width_ + 2*n, sigma_color, sigma_space, 0); + + cv::Mat dst_temp1 = dst_padded(cv::Rect(n, n, width_, height_)); + cv::Mat dst_temp2 = dst(cv::Rect(0, range.start, width_, height_)); + dst_temp1.copyTo(dst_temp2); + } + +private: + const size_t src_step; + const size_t dst_step; + const int width; + const int height; + const int bdr; + const int knl; + float32_t sigma_color; + float32_t sigma_space; + int ret; + cv::Mat src; + cv::Mat dst; + + FcvFilterLoop_Invoker(const FcvFilterLoop_Invoker &); // = delete; + const FcvFilterLoop_Invoker& operator= (const FcvFilterLoop_Invoker &); // = delete; +}; + +void bilateralFilter( InputArray _src, OutputArray _dst, int d, + float sigmaColor, float sigmaSpace, + int borderType ) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty()); + int type = _src.type(); + CV_Assert(type == CV_8UC1); + CV_Assert(d == 5 || d == 7 || d == 9); + + Size size = _src.size(); + _dst.create( size, type ); + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + + if( sigmaColor <= 0 ) + sigmaColor = 1; + if( sigmaSpace <= 0 ) + sigmaSpace = 1; + + int nStripes = 1; + if(src.rows/20 == 0) + nStripes = 1; + else + nStripes = (src.rows/20); + + cv::parallel_for_(cv::Range(0, src.rows), + FcvFilterLoop_Invoker(src, src.step, dst, dst.step, src.cols, src.rows, borderType, d, sigmaColor, sigmaSpace), nStripes); +} + +} +} diff --git a/modules/fastcv/src/cluster_euclidean.cpp b/modules/fastcv/src/cluster_euclidean.cpp new file mode 100644 index 00000000000..a50b027c3a6 --- /dev/null +++ b/modules/fastcv/src/cluster_euclidean.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void clusterEuclidean(InputArray _points, InputArray _clusterCenters, OutputArray _newClusterCenters, + OutputArray _clusterSizes, OutputArray _clusterBindings, OutputArray _clusterSumDists, + int numPointsUsed) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_points.empty() && _points.type() == CV_8UC1); + int nPts = _points.rows(); + int nDims = _points.cols(); + int ptsStride = _points.step(); + + CV_Assert(!_clusterCenters.empty() && _clusterCenters.depth() == CV_32F); + int nClusters = _clusterCenters.rows(); + int clusterCenterStride = _clusterCenters.step(); + + CV_Assert(_clusterCenters.cols() == nDims); + + CV_Assert(numPointsUsed <= nPts); + if (numPointsUsed < 0) + { + numPointsUsed = nPts; + } + + _newClusterCenters.create(nClusters, nDims, CV_32FC1); + _clusterSizes.create(1, nClusters, CV_32SC1); + _clusterBindings.create(1, numPointsUsed, CV_32SC1); + _clusterSumDists.create(1, nClusters, CV_32FC1); + + Mat points = _points.getMat(); + Mat clusterCenters = _clusterCenters.getMat(); + Mat newClusterCenters = _newClusterCenters.getMat(); + Mat clusterSizes = _clusterSizes.getMat(); + Mat clusterBindings = _clusterBindings.getMat(); + Mat clusterSumDists = _clusterSumDists.getMat(); + + int result = fcvClusterEuclideanu8(points.data, + nPts, + nDims, + ptsStride, + numPointsUsed, + nClusters, + (float32_t*)clusterCenters.data, + clusterCenterStride, + (float32_t*)newClusterCenters.data, + (uint32_t*)clusterSizes.data, + (uint32_t*)clusterBindings.data, + (float32_t*)clusterSumDists.data); + + if (result) + { + CV_Error(cv::Error::StsInternal, cv::format("Failed to clusterize, error code: %d", result)); + } +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/fast10.cpp b/modules/fastcv/src/fast10.cpp new file mode 100644 index 00000000000..0f8e54e5d8c --- /dev/null +++ b/modules/fastcv/src/fast10.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 
2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void FAST10(InputArray _src, InputArray _mask, OutputArray _coords, OutputArray _scores, int barrier, int border, bool nmsEnabled) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + CV_Assert(_src.cols() % 8 == 0); + CV_Assert(_src.cols() <= 2048); + CV_Assert(_src.step() % 8 == 0); + + // segfaults at border <= 3, fixing it + border = std::max(4, border); + + CV_Assert(_src.cols() > 2*border); + CV_Assert(_src.rows() > 2*border); + + Mat src = _src.getMat(); + + Mat mask; + if (!_mask.empty()) + { + CV_Assert(_mask.type() == CV_8UC1); + float kw = (float)src.cols / (float)_mask.cols(); + float kh = (float)src.rows / (float)_mask.rows(); + float eps = std::numeric_limits::epsilon(); + if (std::abs(kw - kh) > eps) + { + CV_Error(cv::Error::StsBadArg, "Mask proportions do not correspond to image proportions"); + } + bool sizeFits = false; + for (int k = -3; k <= 3; k++) + { + if (std::abs(kw - std::pow(2.f, (float)k)) < eps) + { + sizeFits = true; + break; + } + } + if (!sizeFits) + { + CV_Error(cv::Error::StsBadArg, "Mask size do not correspond to image size divided by k from -3 to 3"); + } + + mask = _mask.getMat(); + } + + CV_Assert(_coords.needed()); + + const int maxCorners = 32768; + + Mat coords(1, maxCorners * 2, CV_32SC1); + + AutoBuffer tempBuf; + Mat scores; + if (_scores.needed()) + { + scores.create(1, maxCorners, CV_32SC1); + + tempBuf.allocate(maxCorners * 3 + src.rows + 1); + } + + uint32_t nCorners = maxCorners; + + if (!mask.empty()) + { + if (!scores.empty()) + { + fcvCornerFast10InMaskScoreu8(src.data, src.cols, src.rows, src.step, + barrier, border, + (uint32_t*)coords.data, (uint32_t*)scores.data, maxCorners, &nCorners, + mask.data, mask.cols, mask.rows, + nmsEnabled, + tempBuf.data()); + } + else + { + fcvCornerFast10InMasku8(src.data, 
src.cols, src.rows, src.step, + barrier, border, + (uint32_t*)coords.data, maxCorners, &nCorners, + mask.data, mask.cols, mask.rows); + } + } + else + { + if (!scores.empty()) + { + fcvCornerFast10Scoreu8(src.data, src.cols, src.rows, src.step, + barrier, border, + (uint32_t*)coords.data, (uint32_t*)scores.data, maxCorners, &nCorners, + nmsEnabled, + tempBuf.data()); + } + else + { + fcvCornerFast10u8(src.data, src.cols, src.rows, src.step, + barrier, border, + (uint32_t*)coords.data, maxCorners, &nCorners); + } + } + + _coords.create(1, nCorners*2, CV_32SC1); + coords(Range::all(), Range(0, nCorners*2)).copyTo(_coords); + + if (_scores.needed()) + { + scores(Range::all(), Range(0, nCorners)).copyTo(_scores); + } +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/fft.cpp b/modules/fastcv/src/fft.cpp new file mode 100644 index 00000000000..c9610a328ab --- /dev/null +++ b/modules/fastcv/src/fft.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +static bool isPow2(int x) +{ + return x && (!(x & (x - 1))); +} + +void FFT(InputArray _src, OutputArray _dst) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + CV_Assert(isPow2(_src.rows()) || _src.rows() == 1); + CV_Assert(isPow2(_src.cols())); + CV_Assert(_src.step() % 8 == 0); + + Mat src = _src.getMat(); + + _dst.create(_src.rows(), _src.cols(), CV_32FC2); + // in case of fixed layout array we cannot fix this on our side, can only fail if false + CV_Assert(_dst.step() % 8 == 0); + + Mat dst = _dst.getMat(); + + fcvStatus status = fcvFFTu8(src.data, src.cols, src.rows, src.step, + (float*)dst.data, dst.step); + + if (status != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(status) ? 
fcvStatusStrings.at(status) : "unknown"; + CV_Error( cv::Error::StsInternal, "FastCV error: " + s); + } +} + +void IFFT(InputArray _src, OutputArray _dst) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && _src.type() == CV_32FC2); + CV_Assert(isPow2(_src.rows()) || _src.rows() == 1); + CV_Assert(isPow2(_src.cols())); + // in case of fixed layout array we cannot fix this on our side, can only fail if false + CV_Assert(_src.step() % 8 == 0); + + Mat src = _src.getMat(); + + _dst.create(_src.rows(), _src.cols(), CV_8UC1); + // in case of fixed layout array we cannot fix this on our side, can only fail if false + CV_Assert(_dst.step() % 8 == 0); + + Mat dst = _dst.getMat(); + + fcvStatus status = fcvIFFTf32((const float*)src.data, src.cols * 2, src.rows, src.step, + dst.data, dst.step); + + if (status != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; + CV_Error( cv::Error::StsInternal, "FastCV error: " + s); + } +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/fill_poly.cpp b/modules/fastcv/src/fill_poly.cpp new file mode 100644 index 00000000000..3bb64ad4594 --- /dev/null +++ b/modules/fastcv/src/fill_poly.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void fillConvexPoly(InputOutputArray _img, InputArray _pts, Scalar color) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_img.empty() && _img.depth() == CV_8U && _img.channels() <= 4); + CV_Assert(_img.cols() % 8 == 0); + CV_Assert(_img.step() % 8 == 0); + + Mat img = _img.getMat(); + + CV_Assert(!_pts.empty() && (_pts.type() == CV_32SC1 || _pts.type() == CV_32SC2)); + CV_Assert(_pts.isContinuous()); + CV_Assert(_pts.total() * _pts.channels() % 2 == 0); + + Mat pts = _pts.getMat(); + uint32_t nPts = pts.total() * pts.channels() / 2; + + Vec4b coloru8 = color; + + fcvFillConvexPolyu8(nPts, (const uint32_t*)pts.data, + img.channels(), coloru8.val, + img.data, img.cols, img.rows, img.step); +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/hough.cpp b/modules/fastcv/src/hough.cpp new file mode 100644 index 00000000000..248f6b3517a --- /dev/null +++ b/modules/fastcv/src/hough.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void houghLines(InputArray _src, OutputArray _lines, double threshold) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + CV_Assert(_src.cols() % 8 == 0); + CV_Assert(_src.step() % 8 == 0); + + Mat src = _src.getMat(); + + const uint32_t maxLines = 16384; + + cv::Mat lines(1, maxLines, CV_32FC4); + + uint32_t nLines = maxLines; + + fcvHoughLineu8(src.data, src.cols, src.rows, src.step, + (float)threshold, maxLines, &nLines, (fcvLine*)lines.data); + + _lines.create(1, nLines, CV_32FC4); + lines(Range::all(), Range(0, nLines)).copyTo(_lines); +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/moments.cpp b/modules/fastcv/src/moments.cpp new file mode 100644 index 00000000000..3a0c4249eef --- /dev/null +++ b/modules/fastcv/src/moments.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +cv::Moments moments(InputArray _src, bool binary) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty()); + int type = _src.type(); + CV_Assert(type == CV_8UC1 || type == CV_32SC1 || type == CV_32FC1); + + Size size = _src.size(); + Mat src = _src.getMat(); + + cv::Moments m; + if( size.width == 0 || size.height == 0 ) + return m; + + fcvMoments* mFCV = new fcvMoments(); + fcvStatus status = FASTCV_SUCCESS; + if(binary) + { + cv::Mat src_binary(size, CV_8UC1); + cv::compare( src, 0, src_binary, cv::CMP_NE ); + fcvImageMomentsu8(src_binary.data, src_binary.cols, + src_binary.rows, src_binary.step, mFCV, binary); + } + else + { + switch(type) + { + case CV_8UC1: + fcvImageMomentsu8(src.data, src.cols, src.rows, + src.step, mFCV, binary); + break; + case CV_32SC1: + fcvImageMomentss32((const int*)src.data, src.cols, src.rows, + src.step, mFCV, binary); + break; + case 
CV_32FC1: + fcvImageMomentsf32((const float*)src.data, src.cols, src.rows, + src.step, mFCV, binary); + break; + } + } + + if (status != FASTCV_SUCCESS) + { + CV_Error( cv::Error::StsError, cv::format("Error occurred!") ); + delete mFCV; + return m; + } + + m.m00 = mFCV->m00; m.m10 = mFCV->m10; m.m01 = mFCV->m01; + m.m20 = mFCV->m20; m.m11 = mFCV->m11; m.m02 = mFCV->m02; + m.m30 = mFCV->m30; m.m21 = mFCV->m21; m.m12 = mFCV->m12; + m.m03 = mFCV->m03; m.mu02 = mFCV->mu02; m.mu03 = mFCV->mu03; + m.mu11 = mFCV->mu11; m.mu12 = mFCV->mu12; m.mu20 = mFCV->mu20; + m.mu21 = mFCV->mu21; m.mu30 = mFCV->mu30; + + float32_t inv_m00 = 1.0/mFCV->m00; + float32_t inv_sqrt_m00 = mFCV->inv_sqrt_m00; + float32_t s2 = inv_m00 * inv_m00, s3 = s2 * inv_sqrt_m00; + + m.nu20 = mFCV->mu20 * s2; m.nu11 = mFCV->mu11 * s2; + m.nu02 = mFCV->mu02 * s2; m.nu30 = mFCV->mu30 * s3; + m.nu21 = mFCV->mu21 * s3; m.nu12 = mFCV->mu12 * s3; + m.nu03 = mFCV->mu03 * s3; + + delete mFCV; + return m; +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/mser.cpp b/modules/fastcv/src/mser.cpp new file mode 100644 index 00000000000..ae8519313be --- /dev/null +++ b/modules/fastcv/src/mser.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +static void runMSER(InputArray _src, std::vector>& contours, std::vector& boundingBoxes, + std::vector& contourData, + bool useBoundingBoxes = true, + bool useContourData = true, + unsigned int numNeighbors = 4, + unsigned int delta = 2, + unsigned int minArea = 30, + unsigned int maxArea = 14400, + float maxVariation = 0.15f, + float minDiversity = 0.2f) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + CV_Assert(_src.cols() > 50); + CV_Assert(_src.rows() > 5); + + Mat src = _src.getMat(); + + CV_Assert(numNeighbors == 4 || numNeighbors == 8); + bool useNN4 = (numNeighbors == 4); + + bool usePointsArray = !useNN4; + + void *mserHandle; + + bool isInitOk = false; + if (useNN4) + { + isInitOk = fcvMserInit(src.cols, src.rows, delta, minArea, maxArea, maxVariation, minDiversity, &mserHandle); + } + else + { + isInitOk = fcvMserNN8Init(src.cols, src.rows, delta, minArea, maxArea, maxVariation, minDiversity, &mserHandle); + } + + if (!isInitOk) + { + CV_Error(cv::Error::StsInternal, "Failed to initialize MSER"); + } + + //bufSize for pts and bboxes + const unsigned int maxContours = 16384; + unsigned int numContours; + std::vector numPointsInContour(maxContours); + + std::vector rectArray; + rectArray.resize(4 * maxContours); // xMin, xMax, yMax, yMin + + unsigned int pointsArraySize = src.total() * 30; // Recommended typical size + std::vector pointsArray; + std::vector contourStartingPoints; + uint32_t pathArraySize = src.total() * 4; // Recommended size + std::vector pathArray; + if (usePointsArray) + { + pointsArray.resize(pointsArraySize); + } + else + { + contourStartingPoints.resize(maxContours); + pathArray.resize(pathArraySize); + } + + std::vector contourVariation(maxContours), contourNodeId(maxContours), contourNodeCounter(maxContours); + std::vector contourPolarity(maxContours); + + int mserRetcode = -1; + if 
(useNN4) + { + mserRetcode = fcvMserExtu8_v3(mserHandle, src.data, src.cols, src.rows, src.step, + maxContours, &numContours, + rectArray.data(), + contourStartingPoints.data(), + numPointsInContour.data(), + pathArraySize, pathArray.data(), + contourVariation.data(), contourPolarity.data(), contourNodeId.data(), contourNodeCounter.data()); + CV_LOG_INFO(NULL, "fcvMserExtu8_v3"); + } + else + { + if (useContourData) + { + mserRetcode = fcvMserExtNN8u8(mserHandle, src.data, src.cols, src.rows, src.step, + maxContours, &numContours, + rectArray.data(), + numPointsInContour.data(), pointsArraySize, pointsArray.data(), + contourVariation.data(), contourPolarity.data(), contourNodeId.data(), contourNodeCounter.data()); + CV_LOG_INFO(NULL, "fcvMserExtNN8u8"); + } + else + { + mserRetcode = fcvMserNN8u8(mserHandle, src.data, src.cols, src.rows, src.step, + maxContours, &numContours, + rectArray.data(), + numPointsInContour.data(), pointsArraySize, pointsArray.data()); + CV_LOG_INFO(NULL, "fcvMserNN8u8"); + } + } + + if (mserRetcode != 1) + { + CV_Error(cv::Error::StsInternal, "Failed to run MSER"); + } + + contours.clear(); + contours.reserve(numContours); + if (useBoundingBoxes) + { + boundingBoxes.clear(); + boundingBoxes.reserve(numContours); + } + if (useContourData) + { + contourData.clear(); + contourData.reserve(numContours); + } + int ptCtr = 0; + for (uint32_t i = 0; i < numContours; i++) + { + std::vector contour; + contour.reserve(numPointsInContour[i]); + for (uint32_t j = 0; j < numPointsInContour[i]; j++) + { + Point pt; + if (usePointsArray) + { + uint32_t idx = (ptCtr + j) * 2; + pt = Point {pointsArray[idx + 0], pointsArray[idx + 1]}; + } + else + { + uint32_t idx = contourStartingPoints[i] + j * 2; + pt = Point {pathArray[idx + 0], pathArray[idx + 1]}; + } + contour.push_back(pt); + } + contours.push_back(contour); + ptCtr += numPointsInContour[i]; + + if (useBoundingBoxes) + { + uint16_t xMin = rectArray[i * 4 + 0]; + uint16_t xMax = rectArray[i * 4 + 
1]; + uint16_t yMax = rectArray[i * 4 + 2]; + uint16_t yMin = rectArray[i * 4 + 3]; + // +1 is because max limit in cv::Rect() is exclusive + cv::Rect bbox(Point {xMin, yMin}, + Point {xMax + 1, yMax + 1}); + boundingBoxes.push_back(bbox); + } + + if (useContourData) + { + ContourData data; + data.variation = contourVariation[i]; + data.polarity = contourPolarity[i]; + data.nodeId = contourNodeId[i]; + data.nodeCounter = contourNodeCounter[i]; + contourData.push_back(data); + } + } + + fcvMserRelease(mserHandle); +} + +void MSER(InputArray _src, std::vector> &contours, + unsigned int numNeighbors, unsigned int delta, unsigned int minArea, unsigned int maxArea, float maxVariation, float minDiversity) +{ + std::vector boundingBoxes; + std::vector contourData; + runMSER(_src, contours, boundingBoxes, contourData, false, false, numNeighbors, + delta, minArea, maxArea, maxVariation, minDiversity); +} + +void MSER(InputArray _src, std::vector>& contours, std::vector& boundingBoxes, + unsigned int numNeighbors, unsigned int delta, unsigned int minArea, unsigned int maxArea, float maxVariation, float minDiversity) +{ + std::vector contourData; + runMSER(_src, contours, boundingBoxes, contourData, true, false, numNeighbors, + delta, minArea, maxArea, maxVariation, minDiversity); +} + +void MSER(InputArray _src, std::vector>& contours, std::vector& boundingBoxes, std::vector& contourData, + unsigned int numNeighbors, unsigned int delta, unsigned int minArea, unsigned int maxArea, float maxVariation, float minDiversity) +{ + runMSER(_src, contours, boundingBoxes, contourData, true, true, numNeighbors, + delta, minArea, maxArea, maxVariation, minDiversity); +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/precomp.hpp b/modules/fastcv/src/precomp.hpp new file mode 100644 index 00000000000..d33cb25bafb --- /dev/null +++ b/modules/fastcv/src/precomp.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_PRECOMP_HPP +#define OPENCV_FASTCV_PRECOMP_HPP + +#include +#include +#include "opencv2/core/private.hpp" +#include "opencv2/core/utils/logger.hpp" + +#include +#include + +#include "fastcv.h" + +namespace cv { +namespace fastcv { + +#define INITIALIZATION_CHECK \ +{ \ + if (!FastCvContext::getContext().isInitialized) \ + { \ + CV_Error(cv::Error::StsBadArg, cv::format("Set mode failed!")); \ + } \ + CV_INSTRUMENT_REGION(); \ +} + +const std::map fcvStatusStrings = +{ + { FASTCV_SUCCESS, "Success"}, + { FASTCV_EFAIL, "General failure"}, + { FASTCV_EUNALIGNPARAM, "Unaligned pointer parameter"}, + { FASTCV_EBADPARAM, "Bad parameters"}, + { FASTCV_EINVALSTATE, "Called at invalid state"}, + { FASTCV_ENORES, "Insufficient resources, memory, thread"}, + { FASTCV_EUNSUPPORTED, "Unsupported feature"}, + { FASTCV_EHWQDSP, "Hardware QDSP failed to respond"}, + { FASTCV_EHWGPU, "Hardware GPU failed to respond"}, +}; + +struct FastCvContext +{ +public: + // initialize at first call + // Defines a static local variable context. Variable is created only once. + static FastCvContext& getContext() + { + static FastCvContext context; + return context; + } + + FastCvContext() + { + if (fcvSetOperationMode(FASTCV_OP_CPU_PERFORMANCE) != 0) + { + CV_LOG_WARNING(NULL, "Failed to switch FastCV operation mode"); + isInitialized = false; + } + else + { + CV_LOG_INFO(NULL, "FastCV Operation Mode Switched"); + isInitialized = true; + } + } + + bool isInitialized; +}; + +} // namespace fastcv +} // namespace cv + +#endif // OPENCV_FASTCV_PRECOMP_HPP diff --git a/modules/fastcv/src/remap.cpp b/modules/fastcv/src/remap.cpp new file mode 100644 index 00000000000..a0b4849ac72 --- /dev/null +++ b/modules/fastcv/src/remap.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +class RemapParallel : public cv::ParallelLoopBody { +public: + RemapParallel(int src_type, const uint8_t* src, unsigned int srcWidth, unsigned int srcHeight, unsigned int srcStride, uint8_t* dst, + unsigned int dstWidth, unsigned int dstHeight, unsigned int dstStride, const float32_t* __restrict mapX, + const float32_t* __restrict mapY, uint32_t mapStride, fcvInterpolationType interpolation, uint8_t borderValue) + : src_type_(src_type), src_(src), srcWidth_(srcWidth), srcHeight_(srcHeight), srcStride_(srcStride), dst_(dst), dstWidth_(dstWidth), + dstHeight_(dstHeight), dstStride_(dstStride), mapX_(mapX), mapY_(mapY), mapStride_(mapStride), + fcvInterpolation_(interpolation), borderValue_(borderValue) {} + + void operator()(const cv::Range& range) const override { + CV_UNUSED(srcHeight_); + CV_UNUSED(dstHeight_); + int rangeHeight = range.end-range.start; + fcvStatus status = FASTCV_SUCCESS; + if(src_type_==CV_8UC1) + { + status = fcvRemapu8_v2(src_ + range.start*srcStride_, srcWidth_, rangeHeight, srcStride_, dst_ + range.start*dstStride_, + srcWidth_, rangeHeight, dstStride_, mapX_, mapY_, mapStride_, fcvInterpolation_, FASTCV_BORDER_CONSTANT, borderValue_); + } + else if(src_type_==CV_8UC4) + { + if(fcvInterpolation_ == FASTCV_INTERPOLATION_TYPE_BILINEAR) + { + fcvRemapRGBA8888BLu8(src_ + range.start*srcStride_, srcWidth_, rangeHeight, srcStride_, dst_ + range.start*dstStride_, dstWidth_, rangeHeight, + dstStride_, mapX_, mapY_, mapStride_); + } + else if(fcvInterpolation_ == FASTCV_INTERPOLATION_TYPE_NEAREST_NEIGHBOR) + { + fcvRemapRGBA8888NNu8(src_ + range.start*srcStride_, srcWidth_, rangeHeight, srcStride_, dst_ + range.start*dstStride_, dstWidth_, rangeHeight, + dstStride_, mapX_, mapY_, mapStride_); + } + } + + if(status!=FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(status) ? 
fcvStatusStrings.at(status) : "unknown"; + CV_Error( cv::Error::StsInternal, "FastCV error: " + s); + } + } + +private: + int src_type_; + const uint8_t* src_; + unsigned int srcWidth_; + unsigned int srcHeight_; + unsigned int srcStride_; + uint8_t* dst_; + unsigned int dstWidth_; + unsigned int dstHeight_; + unsigned int dstStride_; + const float32_t* __restrict mapX_; + const float32_t* __restrict mapY_; + unsigned int mapStride_; + fcvInterpolationType fcvInterpolation_; + uint8_t borderValue_; +}; + +void remap(cv::InputArray _src, cv::OutputArray _dst, cv::InputArray _map1, cv::InputArray _map2, + int interpolation, int borderValue) +{ + INITIALIZATION_CHECK; + + CV_Assert(_src.type() == CV_8UC1); + CV_Assert(_map1.type()==CV_32FC1); + CV_Assert(interpolation == cv::InterpolationFlags::INTER_NEAREST || interpolation == cv::InterpolationFlags::INTER_LINEAR); + CV_Assert(!_map1.empty() && !_map2.empty()); + CV_Assert(_map1.size() == _map2.size()); + CV_Assert(borderValue >= 0 && borderValue < 256); + + Size size = _map1.size(); + int type = _src.type(); + _dst.create( size, type); + + Mat src = _src.getMat(); + Mat map1 = _map1.getMat(); + Mat map2 = _map2.getMat(); + Mat dst = _dst.getMat(); + CV_Assert(map1.step == map2.step); + fcvStatus status = FASTCV_SUCCESS; + fcvInterpolationType fcvInterpolation; + + if(interpolation==cv::InterpolationFlags::INTER_NEAREST) + fcvInterpolation = FASTCV_INTERPOLATION_TYPE_NEAREST_NEIGHBOR; + else + fcvInterpolation = FASTCV_INTERPOLATION_TYPE_BILINEAR; + + + cv::parallel_for_(cv::Range(0, src.rows), RemapParallel(CV_8UC1, src.data, src.cols, src.rows, src.step, dst.data, dst.cols, dst.rows, dst.step, + (float32_t*)map1.data, (float32_t*)map2.data, map1.step, fcvInterpolation, borderValue), (src.cols*src.rows)/(double)(1 << 16)); + + if (status != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(status) ? 
fcvStatusStrings.at(status) : "unknown"; + CV_Error( cv::Error::StsInternal, "FastCV error: " + s); + } +} + +void remapRGBA(cv::InputArray _src, cv::OutputArray _dst, cv::InputArray _map1, cv::InputArray _map2, int interpolation) +{ + INITIALIZATION_CHECK; + + CV_Assert(_src.type() == CV_8UC4); + CV_Assert(_map1.type()==CV_32FC1); + CV_Assert(interpolation == cv::InterpolationFlags::INTER_NEAREST || interpolation == cv::InterpolationFlags::INTER_LINEAR); + CV_Assert(!_map1.empty() && !_map2.empty()); + CV_Assert(_map1.size() == _map2.size()); + + Size size = _map1.size(); + int type = _src.type(); + _dst.create( size, type); + + Mat src = _src.getMat(); + Mat map1 = _map1.getMat(); + Mat map2 = _map2.getMat(); + Mat dst = _dst.getMat(); + CV_Assert(map1.step == map2.step); + fcvStatus status = FASTCV_SUCCESS; + fcvInterpolationType fcvInterpolation; + + if(interpolation==cv::InterpolationFlags::INTER_NEAREST) + fcvInterpolation = FASTCV_INTERPOLATION_TYPE_NEAREST_NEIGHBOR; + else + fcvInterpolation = FASTCV_INTERPOLATION_TYPE_BILINEAR; + + cv::parallel_for_(cv::Range(0, src.rows), RemapParallel(CV_8UC4, src.data, src.cols, src.rows, src.step, dst.data, dst.cols, dst.rows, dst.step, + (float32_t*)map1.data, (float32_t*)map2.data, map1.step, fcvInterpolation, 0), (src.cols*src.rows)/(double)(1 << 16) ); + + if (status != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; + CV_Error( cv::Error::StsInternal, "FastCV error: " + s); + } +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/scale.cpp b/modules/fastcv/src/scale.cpp new file mode 100644 index 00000000000..3e1a3a74b8a --- /dev/null +++ b/modules/fastcv/src/scale.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void resizeDownBy2(cv::InputArray _src, cv::OutputArray _dst) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + + Mat src = _src.getMat(); + CV_Assert((src.cols & 1)==0 && (src.rows & 1)==0); + + int type = _src.type(); + cv::Size dsize(src.cols / 2, src.rows / 2); + + _dst.create(dsize, type); + + Mat dst = _dst.getMat(); + + fcvStatus status = (fcvStatus)fcvScaleDownBy2u8_v2((const uint8_t*)src.data, src.cols, src.rows, src.step, (uint8_t*)dst.data, + src.cols/2); + + if (status != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; + CV_Error( cv::Error::StsInternal, "FastCV error: " + s); + } +} + +void resizeDownBy4(cv::InputArray _src, cv::OutputArray _dst) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + + Mat src = _src.getMat(); + CV_Assert((src.cols & 3)==0 && (src.rows & 3)==0); + + int type = _src.type(); + cv::Size dsize(src.cols / 4, src.rows / 4); + + _dst.create(dsize, type); + + Mat dst = _dst.getMat(); + + fcvStatus status = (fcvStatus)fcvScaleDownBy4u8_v2((const uint8_t*)src.data, src.cols, src.rows, src.step, + (uint8_t*)dst.data, src.cols/4); + + if (status != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; + CV_Error( cv::Error::StsInternal, "FastCV error: " + s); + } +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/shift.cpp b/modules/fastcv/src/shift.cpp new file mode 100644 index 00000000000..6dfd71a6988 --- /dev/null +++ b/modules/fastcv/src/shift.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +int meanShift(InputArray _src, Rect& rect, TermCriteria termCrit) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && (_src.type() == CV_8UC1 || _src.type() == CV_32SC1 || _src.type() == CV_32FC1)); + CV_Assert(_src.cols() % 8 == 0); + CV_Assert(_src.step() % 8 == 0); + + Mat src = _src.getMat(); + + fcvRectangleInt window; + window.x = rect.x; + window.y = rect.y; + window.width = rect.width; + window.height = rect.height; + + fcvTermCriteria criteria; + criteria.epsilon = (termCrit.type & TermCriteria::EPS) ? termCrit.epsilon : 0; + criteria.max_iter = (termCrit.type & TermCriteria::COUNT) ? termCrit.maxCount : 1024; + uint32_t nIterations = 0; + if (src.depth() == CV_8U) + { + nIterations = fcvMeanShiftu8(src.data, src.cols, src.rows, src.step, + &window, criteria); + } + else if (src.depth() == CV_32S) + { + nIterations = fcvMeanShifts32((const int *)src.data, src.cols, src.rows, src.step, + &window, criteria); + } + else if (src.depth() == CV_32F) + { + nIterations = fcvMeanShiftf32((const float*)src.data, src.cols, src.rows, src.step, + &window, criteria); + } + + rect.x = window.x; + rect.y = window.y; + rect.width = window.width; + rect.height = window.height; + + return nIterations; +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/smooth.cpp b/modules/fastcv/src/smooth.cpp new file mode 100644 index 00000000000..74b72db7528 --- /dev/null +++ b/modules/fastcv/src/smooth.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void bilateralRecursive(cv::InputArray _src, cv::OutputArray _dst, float sigmaColor, float sigmaSpace) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + CV_Assert(_src.step() % 8 == 0); + + Size size = _src.size(); + int type = _src.type(); + _dst.create(size, type); + // in case of fixed layout array we cannot fix this on our side, can only fail if false + CV_Assert(_dst.step() % 8 == 0); + + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + + fcvStatus status = fcvBilateralFilterRecursiveu8(src.data, src.cols, src.rows, src.step, + dst.data, dst.step, sigmaColor, sigmaSpace); + if (status != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; + CV_Error( cv::Error::StsInternal, "FastCV error: " + s); + } +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/thresh.cpp b/modules/fastcv/src/thresh.cpp new file mode 100644 index 00000000000..c97a3656039 --- /dev/null +++ b/modules/fastcv/src/thresh.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void thresholdRange(InputArray _src, OutputArray _dst, uint8_t lowThresh, uint8_t highThresh, uint8_t trueValue, uint8_t falseValue) +{ + INITIALIZATION_CHECK; + + CV_Assert(lowThresh <= highThresh); + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + CV_Assert(_src.cols() % 8 == 0); + CV_Assert(_src.step() % 8 == 0); + Mat src = _src.getMat(); + + _dst.create(_src.size(), CV_8UC1); + // in case of fixed layout array we cannot fix this on our side, can only fail if false + CV_Assert(_dst.step() % 8 == 0); + Mat dst = _dst.getMat(); + + fcvStatus status = fcvFilterThresholdRangeu8_v2(src.data, src.cols, src.rows, src.step, + dst.data, dst.step, + lowThresh, highThresh, trueValue, falseValue); + + if (status != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; + CV_Error( cv::Error::StsInternal, "FastCV error: " + s); + } +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/utils.cpp b/modules/fastcv/src/utils.cpp new file mode 100644 index 00000000000..81723b0976c --- /dev/null +++ b/modules/fastcv/src/utils.cpp @@ -0,0 +1,12 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +} // namespace fastcv +} // namespace cv diff --git a/modules/fastcv/test/test_arithm.cpp b/modules/fastcv/test/test_arithm.cpp new file mode 100644 index 00000000000..39979908136 --- /dev/null +++ b/modules/fastcv/test/test_arithm.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef std::tuple MatMulTestParams; +class MatMulTest : public ::testing::TestWithParam {}; + +TEST_P(MatMulTest, accuracy) +{ + auto p = GetParam(); + int rows1 = std::get<0>(p); + int cols1 = std::get<1>(p); + int cols2 = std::get<2>(p); + + RNG& rng = cv::theRNG(); + Mat src1(rows1, cols1, CV_8SC1), src2(cols1, cols2, CV_8SC1); + cvtest::randUni(rng, src1, Scalar::all(-128), Scalar::all(128)); + cvtest::randUni(rng, src2, Scalar::all(-128), Scalar::all(128)); + + Mat dst; + cv::fastcv::matmuls8s32(src1, src2, dst); + Mat fdst; + dst.convertTo(fdst, CV_32F); + + Mat fsrc1, fsrc2; + src1.convertTo(fsrc1, CV_32F); + src2.convertTo(fsrc2, CV_32F); + Mat ref; + cv::gemm(fsrc1, fsrc2, 1.0, noArray(), 0, ref, 0); + + double normInf = cvtest::norm(ref, fdst, cv::NORM_INF); + double normL2 = cvtest::norm(ref, fdst, cv::NORM_L2); + + EXPECT_EQ(normInf, 0); + EXPECT_EQ(normL2, 0); + + if (cvtest::debugLevel > 0 && (normInf > 0 || normL2 > 0)) + { + std::ofstream of(cv::format("out_%d_%d_%d.txt", rows1, cols1, cols2)); + of << ref << std::endl; + of << dst << std::endl; + of.close(); + } +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, MatMulTest, + ::testing::Combine(::testing::Values(8, 16, 128, 256), // rows1 + ::testing::Values(8, 16, 128, 256), // cols1 + ::testing::Values(8, 16, 128, 256))); // cols2 + +}} // namespaces opencv_test, :: diff --git a/modules/fastcv/test/test_bilateral.cpp b/modules/fastcv/test/test_bilateral.cpp new file mode 100644 index 00000000000..4f582c2ed37 --- /dev/null +++ b/modules/fastcv/test/test_bilateral.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef testing::TestWithParam> fcv_bilateralFilterTest; + +TEST_P(fcv_bilateralFilterTest, accuracy) +{ + cv::Size size = get<0>(GetParam()); + int d = get<1>(GetParam()); + double sigmaColor = get<2>(GetParam()); + double sigmaSpace = sigmaColor; + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + cv::Mat dst; + + cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace); + + EXPECT_FALSE(dst.empty()); +} + +INSTANTIATE_TEST_CASE_P(/*nothing*/, fcv_bilateralFilterTest, Combine( + ::testing::Values(Size(8, 8), Size(640, 480), Size(800, 600)), + ::testing::Values(5, 7, 9), + ::testing::Values(1., 10.) +)); + +} +} + diff --git a/modules/fastcv/test/test_cluster_euclidean.cpp b/modules/fastcv/test/test_cluster_euclidean.cpp new file mode 100644 index 00000000000..c108f75489a --- /dev/null +++ b/modules/fastcv/test/test_cluster_euclidean.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +// nPts, nDims, nClusters +typedef std::tuple ClusterEuclideanTestParams; +class ClusterEuclideanTest : public ::testing::TestWithParam {}; + +TEST_P(ClusterEuclideanTest, accuracy) +{ + auto p = GetParam(); + int nPts = std::get<0>(p); + int nDims = std::get<1>(p); + int nClusters = std::get<2>(p); + + Mat points(nPts, nDims, CV_8U); + Mat clusterCenters(nClusters, nDims, CV_32F); + + Mat trueMeans(nClusters, nDims, CV_32F); + Mat stddevs(nClusters, nDims, CV_32F); + std::vector trueClusterSizes(nClusters, 0); + std::vector trueClusterBindings(nPts, 0); + std::vector trueSumDists(nClusters, 0); + + cv::RNG& rng = cv::theRNG(); + for (int i = 0; i < nClusters; i++) + { + Mat mean(1, nDims, CV_64F), stdev(1, nDims, CV_64F); + rng.fill(mean, cv::RNG::UNIFORM, 0, 256); + rng.fill(stdev, cv::RNG::UNIFORM, 5.f, 16); + int lo = i * nPts / nClusters; + int hi = (i + 1) * nPts / nClusters; + + for (int d = 0; d < nDims; d++) + { + rng.fill(points.col(d).rowRange(lo, hi), cv::RNG::NORMAL, + mean.at(d), stdev.at(d)); + } + + float sd = 0; + for (int j = lo; j < hi; j++) + { + Mat pts64f; + points.row(j).convertTo(pts64f, CV_64F); + sd += cv::norm(mean, pts64f, NORM_L2); + trueClusterBindings.at(j) = i; + trueClusterSizes.at(i)++; + } + trueSumDists.at(i) = sd; + + // let's shift initial cluster center a bit + Mat(mean + stdev * 0.5).copyTo(clusterCenters.row(i)); + + mean.copyTo(trueMeans.row(i)); + stdev.copyTo(stddevs.row(i)); + } + + Mat newClusterCenters; + std::vector clusterSizes, clusterBindings; + std::vector clusterSumDists; + cv::fastcv::clusterEuclidean(points, clusterCenters, newClusterCenters, clusterSizes, clusterBindings, clusterSumDists); + + if (cvtest::debugLevel > 0 && nDims == 2) + { + Mat draw(256, 256, CV_8UC3, Scalar(0)); + for (int i = 0; i < nPts; i++) + { + int x = std::rint(points.at(i, 0)); + int y = std::rint(points.at(i, 1)); + 
draw.at(y, x) = Vec3b::all(128); + } + for (int i = 0; i < nClusters; i++) + { + float cx = trueMeans.at(i, 0); + float cy = trueMeans.at(i, 1); + draw.at(cy, cx) = Vec3b(0, 255, 0); + + float sx = stddevs.at(i, 0); + float sy = stddevs.at(i, 1); + cv::ellipse(draw, Point(cx, cy), Size(sx, sy), 0, 0, 360, Scalar(255, 0, 0)); + + float ox = clusterCenters.at(i, 0); + float oy = clusterCenters.at(i, 1); + draw.at(oy, ox) = Vec3b(0, 0, 255); + + float nx = newClusterCenters.at(i, 0); + float ny = newClusterCenters.at(i, 1); + draw.at(ny, nx) = Vec3b(255, 255, 0); + } + cv::imwrite(cv::format("draw_%d_%d_%d.png", nPts, nDims, nClusters), draw); + } + + { + std::vector diffs; + for (int i = 0; i < nClusters; i++) + { + double cs = std::abs((trueClusterSizes[i] - clusterSizes[i]) / double(trueClusterSizes[i])); + diffs.push_back(cs); + } + double normL2 = cv::norm(diffs, NORM_L2) / nClusters; + + EXPECT_LT(normL2, 0.392); + } + + { + Mat bindings8u, trueBindings8u; + Mat(clusterBindings).convertTo(bindings8u, CV_8U); + Mat(trueClusterBindings).convertTo(trueBindings8u, CV_8U); + double normH = cv::norm(bindings8u, trueBindings8u, NORM_HAMMING) / nPts; + EXPECT_LT(normH, 0.658); + } +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, ClusterEuclideanTest, + ::testing::Combine(::testing::Values(100, 1000, 10000), // nPts + ::testing::Values(2, 10, 32), // nDims + ::testing::Values(5, 10, 16))); // nClusters + +}} // namespaces opencv_test, :: \ No newline at end of file diff --git a/modules/fastcv/test/test_fast10.cpp b/modules/fastcv/test/test_fast10.cpp new file mode 100644 index 00000000000..51123a1b3ab --- /dev/null +++ b/modules/fastcv/test/test_fast10.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef std::tuple Fast10TestParams; +class Fast10Test : public ::testing::TestWithParam {}; + +TEST_P(Fast10Test, accuracy) +{ + auto p = GetParam(); + bool useScores = std::get<0>(p); + int barrier = std::get<1>(p); + int border = std::get<2>(p); + bool nmsEnabled = std::get<3>(p); + + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + std::vector coords, scores; + cv::fastcv::FAST10(src, noArray(), coords, useScores ? scores : noArray(), barrier, border, nmsEnabled); + + std::vector ocvKeypoints; + int thresh = barrier; + cv::FAST(src, ocvKeypoints, thresh, nmsEnabled, FastFeatureDetector::DetectorType::TYPE_9_16 ); + + if (useScores) + { + ASSERT_EQ(scores.size() * 2, coords.size()); + } + + Mat ptsMap(src.size(), CV_8U, Scalar(255)); + for(size_t i = 0; i < coords.size() / 2; ++i) + { + ptsMap.at(coords[2*i + 1], coords[2*i + 0]) = 0; + } + Mat distTrans(src.size(), CV_8U); + cv::distanceTransform(ptsMap, distTrans, DIST_L2, DIST_MASK_PRECISE); + + Mat refPtsMap(src.size(), CV_8U, Scalar(255)); + for(size_t i = 0; i < ocvKeypoints.size(); ++i) + { + refPtsMap.at(ocvKeypoints[i].pt) = 0; + } + Mat refDistTrans(src.size(), CV_8U); + cv::distanceTransform(refPtsMap, refDistTrans, DIST_L2, DIST_MASK_PRECISE); + + double normInf = cvtest::norm(refDistTrans, distTrans, cv::NORM_INF); + double normL2 = cvtest::norm(refDistTrans, distTrans, cv::NORM_L2) / src.size().area(); + + EXPECT_LT(normInf, 129.7); + EXPECT_LT(normL2, 0.067); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, Fast10Test, + ::testing::Combine(::testing::Bool(), // useScores + ::testing::Values(10, 30, 50), // barrier + ::testing::Values( 4, 10, 32), // border + ::testing::Bool() // nonmax suppression + )); + +}} // namespaces opencv_test, :: diff --git a/modules/fastcv/test/test_fft.cpp b/modules/fastcv/test/test_fft.cpp new file mode 100644 
index 00000000000..18b53d88ba0 --- /dev/null +++ b/modules/fastcv/test/test_fft.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +class FFTExtTest : public ::testing::TestWithParam {}; + +TEST_P(FFTExtTest, forward) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + Mat srcFloat; + src.convertTo(srcFloat, CV_32F); + + Mat dst, ref; + cv::fastcv::FFT(src, dst); + + cv::dft(srcFloat, ref, DFT_COMPLEX_OUTPUT); + + double normInf = cvtest::norm(dst, ref, cv::NORM_INF); + double normL2 = cvtest::norm(dst, ref, cv::NORM_L2) / dst.size().area(); + + EXPECT_LT(normInf, 19.1); // for 512x512 case + EXPECT_LT(normL2, 18.0 / 256.0 ); +} + +TEST_P(FFTExtTest, inverse) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + //cv::Mat src = imread(cvtest::findDataFile("cv/shared/lena.png"), IMREAD_GRAYSCALE); + + Mat srcFloat; + src.convertTo(srcFloat, CV_32F); + + Mat fwd, back; + cv::fastcv::FFT(src, fwd); + cv::fastcv::IFFT(fwd, back); + Mat backFloat; + back.convertTo(backFloat, CV_32F); + + Mat fwdRef, backRef; + cv::dft(srcFloat, fwdRef, DFT_COMPLEX_OUTPUT); + cv::idft(fwdRef, backRef, DFT_REAL_OUTPUT); + + backRef *= 1./(src.size().area()); + + double normInf = cvtest::norm(backFloat, backRef, cv::NORM_INF); + double normL2 = cvtest::norm(backFloat, backRef, cv::NORM_L2) / src.size().area(); + + EXPECT_LT(normInf, 9.16e-05); + EXPECT_LT(normL2, 1.228e-06); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, FFTExtTest, ::testing::Values(Size(8, 8), Size(128, 128), Size(32, 256), Size(512, 512), + Size(32, 1), Size(512, 1))); + +}} // namespaces opencv_test, :: diff --git a/modules/fastcv/test/test_fill.cpp 
b/modules/fastcv/test/test_fill.cpp new file mode 100644 index 00000000000..31cd6b078dd --- /dev/null +++ b/modules/fastcv/test/test_fill.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef tuple FillConvexTestParams; +class FillConvexTest : public ::testing::TestWithParam {}; + +TEST_P(FillConvexTest, randomDraw) +{ + auto p = GetParam(); + + Size imgSize = std::get<0>(p); + int nPts = std::get<1>(p); + int channels = std::get<2>(p); + + cv::RNG rng = cv::theRNG(); + + std::vector allPts, contour; + for (int i = 0; i < nPts; i++) + { + allPts.push_back(Point(rng() % imgSize.width, rng() % imgSize.height)); + } + cv::convexHull(allPts, contour); + + Scalar color(rng() % 256, rng() % 256, rng() % 256); + + Mat imgRef(imgSize, CV_MAKE_TYPE(CV_8U, channels), Scalar(0)); + Mat imgFast = imgRef.clone(); + + cv::fillConvexPoly(imgRef, contour, color); + cv::fastcv::fillConvexPoly(imgFast, contour, color); + + double normInf = cvtest::norm(imgRef, imgFast, cv::NORM_INF); + double normL2 = cvtest::norm(imgRef, imgFast, cv::NORM_L2); + + EXPECT_EQ(normInf, 0); + EXPECT_EQ(normL2, 0); +} + +TEST_P(FillConvexTest, circle) +{ + auto p = GetParam(); + + Size imgSize = std::get<0>(p); + int nPts = std::get<1>(p); + int channels = std::get<2>(p); + + cv::RNG rng = cv::theRNG(); + + float r = std::min(imgSize.width, imgSize.height) / 2 * 0.9f; + float angle = CV_PI * 2.0f / (float)nPts; + std::vector contour; + for (int i = 0; i < nPts; i++) + { + Point2f pt(r * cos((float)i * angle), + r * sin((float)i * angle)); + contour.push_back({ imgSize.width / 2 + int(pt.x), + imgSize.height / 2 + int(pt.y)}); + } + Scalar color(rng() % 256, rng() % 256, rng() % 256); + + Mat imgRef(imgSize, CV_MAKE_TYPE(CV_8U, channels), Scalar(0)); + Mat imgFast = imgRef.clone(); + + cv::fillConvexPoly(imgRef, contour, color); + 
cv::fastcv::fillConvexPoly(imgFast, contour, color); + + double normInf = cvtest::norm(imgRef, imgFast, cv::NORM_INF); + double normL2 = cvtest::norm(imgRef, imgFast, cv::NORM_L2); + + EXPECT_EQ(normInf, 0); + EXPECT_EQ(normL2, 0); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, FillConvexTest, + ::testing::Combine(testing::Values(Size(640, 480), Size(512, 512), Size(1920, 1080)), // imgSize + testing::Values(4, 64, 1024), // nPts + testing::Values(1, 2, 3, 4))); // channels + +}} // namespaces opencv_test, :: \ No newline at end of file diff --git a/modules/fastcv/test/test_hough.cpp b/modules/fastcv/test/test_hough.cpp new file mode 100644 index 00000000000..31bfca6430c --- /dev/null +++ b/modules/fastcv/test/test_hough.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef std::tuple HoughLinesTestParams; +class HoughLinesTest : public ::testing::TestWithParam {}; + +TEST_P(HoughLinesTest, accuracy) +{ + auto p = GetParam(); + std::string fname = std::get<0>(p); + double threshold = std::get<1>(p); + + cv::Mat src = imread(cvtest::findDataFile(fname), cv::IMREAD_GRAYSCALE); + + // make it aligned by 8 + cv::Mat withBorder; + int bpix = ((src.cols & 0xfffffff8) + 8) - src.cols; + cv::copyMakeBorder(src, withBorder, 0, 0, 0, bpix, BORDER_REFLECT101); + src = withBorder; + + cv::Mat contoured; + cv::Canny(src, contoured, 100, 200); + + std::vector lines; + cv::fastcv::houghLines(contoured, lines, threshold); + + std::vector refLines; + double rho = 1.0, theta = 1.0 * CV_PI / 180.0; + // cloned since image may be modified by the function + cv::HoughLinesP(contoured.clone(), refLines, rho, theta, threshold); + + for (const cv::Vec4f& l : lines) + { + cv::Point2f from(l[0], l[1]), to(l[2], l[3]); + EXPECT_GE(from.x, 0); + EXPECT_GE(from.y, 0); + EXPECT_LE(from.x, src.cols); + EXPECT_LE(from.y, 
src.rows); + EXPECT_GE(to.x, 0); + EXPECT_GE(to.y, 0); + EXPECT_LE(to.x, src.cols); + EXPECT_LE(to.y, src.rows); + } + + auto makeDistTrans = [src](const std::vector& ls) -> cv::Mat + { + Mat lineMap(src.size(), CV_8U, Scalar(255)); + for (const cv::Vec4f& l : ls) + { + cv::Point from(l[0], l[1]), to(l[2], l[3]); + cv::line(lineMap, from, to, Scalar::all(0)); + } + Mat distTrans(src.size(), CV_8U); + cv::distanceTransform(lineMap, distTrans, DIST_L2, DIST_MASK_PRECISE); + return distTrans; + }; + + cv::Mat distTrans = makeDistTrans(lines); + cv::Mat refDistTrans = makeDistTrans(refLines); + + double normInf = cvtest::norm(refDistTrans, distTrans, cv::NORM_INF); + double normL2 = cvtest::norm(refDistTrans, distTrans, cv::NORM_L2) / src.size().area(); + + EXPECT_LT(normInf, 120.0); + EXPECT_LT(normL2, 0.0361); + + if (cvtest::debugLevel > 0) + { + cv::Mat draw; + cvtColor(src, draw, COLOR_GRAY2BGR); + cv::Mat refDraw = draw.clone(); + + for (const cv::Vec4f& l : lines) + { + cv::Point from(l[0], l[1]), to(l[2], l[3]); + cv::line(draw, from, to, Scalar(0, 255, 0)); + } + size_t idx = fname.find_last_of("/\\"); + std::string fout = fname.substr(idx+1, fname.length() - idx - 5); + cv::imwrite(cv::format("line_%s_t%5f_fcv.png", fout.c_str(), threshold), draw); + + for (const cv::Vec4f& l : refLines) + { + cv::Point from(l[0], l[1]), to(l[2], l[3]); + cv::line(refDraw, from, to, Scalar(0, 255, 0)); + } + cv::imwrite(cv::format("line_%s_t%5f_ref.png", fout.c_str(), threshold), refDraw); + } +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, HoughLinesTest, + ::testing::Combine(::testing::Values("cv/shared/pic5.png", + "stitching/a1.png", + "cv/shared/pic5.png", + "cv/shared/pic1.png"), // images + ::testing::Values(0.05, 0.25, 0.5, 0.75) // threshold + )); + +}} // namespaces opencv_test, :: diff --git a/modules/fastcv/test/test_main.cpp b/modules/fastcv/test/test_main.cpp new file mode 100644 index 00000000000..cc60576e96f --- /dev/null +++ 
b/modules/fastcv/test/test_main.cpp @@ -0,0 +1,8 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +CV_TEST_MAIN("") diff --git a/modules/fastcv/test/test_moments.cpp b/modules/fastcv/test/test_moments.cpp new file mode 100644 index 00000000000..1d23156dcf2 --- /dev/null +++ b/modules/fastcv/test/test_moments.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "opencv2/ts.hpp" +#include "opencv2/fastcv/moments.hpp" + +namespace opencv_test { namespace { + +typedef testing::TestWithParam> fcv_momentsTest; + +TEST_P(fcv_momentsTest, accuracy) +{ + const bool binaryImage = get<0>(GetParam()); + const Size srcSize = get<1>(GetParam()); + const MatDepth srcType = get<2>(GetParam()); + Mat src(srcSize, srcType); + + for(int j = 0; j < srcSize.width; ++j) + for(int i = 0; i < srcSize.height; ++i) + { + if(srcType == CV_8UC1) + src.at(i, j) = cv::randu(); + else if(srcType == CV_32SC1) + src.at(i, j) = cv::randu(); + else if(srcType == CV_32FC1) + src.at(i, j) = cv::randu(); + } + + cv::Moments m = cv::fastcv::moments(src, binaryImage); + + int len_m = sizeof(m)/sizeof(m.m00); + EXPECT_FALSE(len_m != 24); +} + +INSTANTIATE_TEST_CASE_P(/*nothing*/, fcv_momentsTest, Combine( + Values(false, true), + Values(TYPICAL_MAT_SIZES), + Values(CV_8UC1, CV_32SC1, CV_32FC1) +)); + +} +} diff --git a/modules/fastcv/test/test_mser.cpp b/modules/fastcv/test/test_mser.cpp new file mode 100644 index 00000000000..ebacbad32f3 --- /dev/null +++ b/modules/fastcv/test/test_mser.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { +// we use such nested structure to combine test values +typedef std::tuple< std::tuple, + int /* numNeighbors */, std::string /*file path*/> MSERTestParams; +class MSERTest : public ::testing::TestWithParam {}; + +// compare results to OpenCV's MSER detector +// by comparing resulting contours +TEST_P(MSERTest, accuracy) +{ + auto p = GetParam(); + bool useBboxes = std::get<0>(std::get<0>(p)); + bool useContourData = std::get<1>(std::get<0>(p)); + int numNeighbors = std::get<1>(p); // 4 or 8 + std::string imgPath = std::get<2>(p); + + cv::Mat src = imread(cvtest::findDataFile(imgPath), cv::IMREAD_GRAYSCALE); + + unsigned int delta = 2; + unsigned int minArea = 256; + unsigned int maxArea = (int)src.total()/4; + float maxVariation = 0.15f; + float minDiversity = 0.2f; + + std::vector> contours; + std::vector bboxes; + std::vector contourData; + if (useBboxes) + { + if (useContourData) + { + cv::fastcv::MSER(src, contours, bboxes, contourData, numNeighbors, + delta, minArea, maxArea, maxVariation, minDiversity); + } + else + { + cv::fastcv::MSER(src, contours, bboxes, numNeighbors, + delta, minArea, maxArea, maxVariation, minDiversity); + } + } + else + { + cv::fastcv::MSER(src, contours, numNeighbors, + delta, minArea, maxArea, maxVariation, minDiversity); + } + + Rect imgRect(0, 0, src.cols, src.rows); + if (useBboxes) + { + ASSERT_EQ(contours.size(), bboxes.size()); + for (size_t i = 0; i < contours.size(); i++) + { + ASSERT_TRUE(imgRect.contains(bboxes[i].tl())); + ASSERT_TRUE(imgRect.contains(bboxes[i].br())); + + for (size_t j = 0; j < contours[i].size(); j++) + { + ASSERT_TRUE(bboxes[i].contains(contours[i][j])); + } + } + } + + if (useContourData) + { + ASSERT_EQ(contours.size(), contourData.size()); + for (size_t i = 0; i < contours.size(); i++) + { + int polarity = contourData[i].polarity; + EXPECT_TRUE(polarity == -1 || polarity == 1); + } + } 
+ + // compare each pair of contours using dist transform of their points + // find pair of contours by similar moments + typedef cv::Matx MomentVec; + + auto calcEstimate = [](const std::vector>& contours, Size srcSize) -> std::vector> + { + std::vector> res; + for (size_t i = 0; i < contours.size(); i++) + { + const std::vector& contour = contours[i]; + Mat ptsMap(srcSize, CV_8U, Scalar(255)); + for(size_t j = 0; j < contour.size(); ++j) + { + ptsMap.at(contour[j].y, contour[j].x) = 0; + } + Mat distTrans(srcSize, CV_8U); + cv::distanceTransform(ptsMap, distTrans, DIST_L2, DIST_MASK_PRECISE); + + cv::Moments m = cv::moments(contour); + double invRows = 1.0 / srcSize.height, invCols = 1.0 / srcSize.width; + double invRows2 = invRows / srcSize.height, invCols2 = invCols / srcSize.width; + double invRows3 = invRows2 / srcSize.height, invCols3 = invCols2 / srcSize.width; + MomentVec mx = { m.m00, m.m10 * invCols, m.m01 * invRows, + m.m20 * invCols2, m.m11 * invCols * invRows, m.m02 * invRows2, + m.m30 * invCols3, + m.m21 * invCols2 * invRows, + m.m12 * invCols * invRows2, + m.m03 * invRows3}; + res.push_back({distTrans, mx}); + } + + return res; + }; + + std::vector> contourEstimate = calcEstimate(contours, src.size()); + + std::vector> ocvContours; + std::vector ocvBboxes; + + cv::Ptr ocvMser = cv::MSER::create(delta, minArea, maxArea, maxVariation, minDiversity); + ocvMser->detectRegions(src, ocvContours, ocvBboxes); + + std::vector> ocvContourEstimate = calcEstimate(ocvContours, src.size()); + + // brute force match by moments comparison + double overallL2Sqr = 0; + int nInliers = 0; + for (size_t i = 0; i < contourEstimate.size(); i++) + { + double minDist = std::numeric_limits::max(); + size_t minIdx = -1; + for (size_t j = 0; j < ocvContourEstimate.size(); j++) + { + double d = cv::norm(contourEstimate[i].second - ocvContourEstimate[j].second); + if (d < minDist) + { + minDist = d; minIdx = j; + } + } + // compare dist transforms of contours + Mat ref = 
ocvContourEstimate[minIdx].first; + Mat fcv = contourEstimate[i].first; + double normL2Sqr = cvtest::norm(ref, fcv, cv::NORM_L2SQR); + double normInf = cvtest::norm(ref, fcv, cv::NORM_INF); + normL2Sqr = normL2Sqr / src.size().area(); + + if (cvtest::debugLevel > 0) + { + Mat draw(src.rows, src.cols*2, CV_8U); + ref.copyTo(draw(Range::all(), Range(0, src.cols))); + fcv.copyTo(draw(Range::all(), Range(src.cols, src.cols*2))); + cv::putText(draw, cv::format("dM: %f L2^2: %f Inf: %f",minDist, normL2Sqr, normInf), Point(0, src.rows), + cv::FONT_HERSHEY_COMPLEX, 1, Scalar::all(128)); + cv::imwrite(cv::format("dist_n%d_c%03d_r%03d.png", numNeighbors, (int)i, (int)minIdx), draw); + } + + if (normInf < 50.0) + { + overallL2Sqr += normL2Sqr; + nInliers++; + } + } + + double overallL2 = std::sqrt(overallL2Sqr); + EXPECT_LT(std::sqrt(overallL2), 11.45); + double ratioInliers = double(nInliers) / contourEstimate.size(); + EXPECT_GT(ratioInliers, 0.363); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, MSERTest, + ::testing::Combine(::testing::Values( // useBboxes useContourData + std::tuple { true, false}, + std::tuple {false, false}, + std::tuple { true, true}), + ::testing::Values(4, 8), // numNeighbors + ::testing::Values("cv/shared/baboon.png", "cv/mser/puzzle.png") + ) + ); +}} // namespaces opencv_test, :: \ No newline at end of file diff --git a/modules/fastcv/test/test_precomp.hpp b/modules/fastcv/test/test_precomp.hpp new file mode 100644 index 00000000000..1b4c23eca30 --- /dev/null +++ b/modules/fastcv/test/test_precomp.hpp @@ -0,0 +1,10 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include +#include +#include + +#include diff --git a/modules/fastcv/test/test_remap.cpp b/modules/fastcv/test/test_remap.cpp new file mode 100644 index 00000000000..6fa5ccdabfd --- /dev/null +++ b/modules/fastcv/test/test_remap.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "opencv2/ts.hpp" +#include "opencv2/fastcv/remap.hpp" + +namespace opencv_test { namespace { + +class RemapTest : public ::testing::TestWithParam> { +protected: + void SetUp() override { + // Generate random source data + Size size = get<2>(GetParam()); + src = Mat(size, get<0>(GetParam())); + randu(src, Scalar::all(0), Scalar::all(255)); // Fill with random values + + ASSERT_FALSE(src.empty()) << "Unable to generate the image!"; + + // Create map matrices + map_x.create(src.size(), CV_32FC1); + map_y.create(src.size(), CV_32FC1); + + // Initialize the map matrices + for (int i = 0; i < src.rows; i++) { + for (int j = 0; j < src.cols; j++) { + map_x.at(i, j) = static_cast(src.cols - j); //Flips the image horizonally + map_y.at(i, j) = static_cast(i); //Keep y coordinate unchanged + } + } + } + + Mat src, map_x, map_y, dst; +}; + +class RemapTestRGBA : public ::testing::TestWithParam> { +protected: + void SetUp() override { + // Generate random source data + Size size = get<2>(GetParam()); + src = Mat(size, get<0>(GetParam())); + randu(src, Scalar::all(0), Scalar::all(255)); // Fill with random values + + ASSERT_FALSE(src.empty()) << "Unable to generate the image!"; + + // Create map matrices + map_x.create(src.size(), CV_32FC1); + map_y.create(src.size(), CV_32FC1); + + // Initialize the map matrices + for (int i = 0; i < src.rows; i++) { + for (int j = 0; j < src.cols; j++) { + map_x.at(i, j) = static_cast(src.cols - j); //Flips the image horizonally + map_y.at(i, j) = static_cast(i); //Keep y coordinate unchanged + } + } + } + + Mat 
src, map_x, map_y, dst; +}; + +TEST_P(RemapTest, accuracy) +{ + int type = get<0>(GetParam()); + int interpolation = get<1>(GetParam()); + + // Convert source image to the specified type + Mat src_converted; + src.convertTo(src_converted, type); + + cv::fastcv::remap(src_converted, dst, map_x, map_y, interpolation); + + // Check if the remapped image is not empty + ASSERT_FALSE(dst.empty()) << "Remapped image is empty!"; + + cv::Mat remapOpenCV; + cv::remap(src_converted, remapOpenCV, map_x, map_y, interpolation); + + cv::Mat diffImage; + cv::absdiff(dst, remapOpenCV, diffImage); + + // Calculate the maximum difference + double maxVal=0.0; + cv::minMaxLoc(diffImage, nullptr, &maxVal); + + // Assert if the difference is acceptable (max difference should be less than 10) + CV_Assert(maxVal < 10 && "Difference between images is too high!"); +} + +TEST_P(RemapTestRGBA, accuracy) +{ + int type = get<0>(GetParam()); + int interpolation = get<1>(GetParam()); + + // Convert source image to the specified type + Mat src_converted; + src.convertTo(src_converted, type); + + cv::fastcv::remapRGBA(src_converted, dst, map_x, map_y, interpolation); + + // Check if the remapped image is not empty + ASSERT_FALSE(dst.empty()) << "Remapped image is empty!"; + + cv::Mat remapOpenCV; + cv::remap(src_converted, remapOpenCV, map_x, map_y, interpolation); + + cv::Mat diffImage; + cv::absdiff(dst, remapOpenCV, diffImage); + + // Calculate the maximum difference + double maxVal=0.0; + cv::minMaxLoc(diffImage, nullptr, &maxVal); + + // Assert if the difference is acceptable (max difference should be less than 10) + CV_Assert(maxVal < 10 && "Difference between images is too high!"); +} + + +INSTANTIATE_TEST_CASE_P( + RemapTests, + RemapTest, + ::testing::Combine( + ::testing::Values(CV_8UC1), + ::testing::Values(INTER_LINEAR, INTER_NEAREST), + ::testing::Values(Size(640, 480), Size(1280, 720), Size(1920, 1080)) + ) +); + +INSTANTIATE_TEST_CASE_P( + RemapTests, + RemapTestRGBA, + 
::testing::Combine( + ::testing::Values(CV_8UC4), + ::testing::Values(INTER_LINEAR, INTER_NEAREST), + ::testing::Values(Size(640, 480), Size(1280, 720), Size(1920, 1080)) + ) +); + +}} // namespaces opencv_test, :: \ No newline at end of file diff --git a/modules/fastcv/test/test_scale.cpp b/modules/fastcv/test/test_scale.cpp new file mode 100644 index 00000000000..394fd907cc9 --- /dev/null +++ b/modules/fastcv/test/test_scale.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "opencv2/ts.hpp" +#include "opencv2/fastcv/scale.hpp" + +namespace opencv_test { namespace { + +class ResizeBy2Test : public ::testing::TestWithParam {}; +class ResizeBy4Test : public ::testing::TestWithParam {}; + +TEST(resizeDownBy2, accuracy) +{ + cv::Mat inputImage = cv::imread(cvtest::findDataFile("cv/shared/box_in_scene.png"), cv::IMREAD_GRAYSCALE); + + Size dsize; + cv::Mat resized_image; + + cv::fastcv::resizeDownBy2(inputImage, resized_image); + + EXPECT_FALSE(resized_image.empty()); + + cv::Mat resizedImageOpenCV; + cv::resize(inputImage, resizedImageOpenCV, cv::Size(inputImage.cols / 2, inputImage.rows / 2), 0, 0, INTER_AREA); + + cv::Mat diffImage; + cv::absdiff(resized_image, resizedImageOpenCV, diffImage); + + // Calculate the maximum difference + double maxVal=0.0; + cv::minMaxLoc(diffImage, nullptr, &maxVal); + + // Assert if the difference is acceptable (max difference should be less than 10) + CV_Assert(maxVal < 10 && "Difference between images is too high!"); +} + +TEST(resizeDownBy4, accuracy) +{ + cv::Mat inputImage = cv::imread(cvtest::findDataFile("cv/shared/box_in_scene.png"), cv::IMREAD_GRAYSCALE); + + Size dsize; + cv::Mat resized_image; + + cv::fastcv::resizeDownBy4(inputImage, resized_image); + + EXPECT_FALSE(resized_image.empty()); + + cv::Mat resizedImageOpenCV; + cv::resize(inputImage, resizedImageOpenCV, cv::Size(inputImage.cols / 4, inputImage.rows / 4), 
0, 0, INTER_AREA); + + cv::Mat diffImage; + cv::absdiff(resized_image, resizedImageOpenCV, diffImage); + + // Calculate the maximum difference + double maxVal=0.0; + cv::minMaxLoc(diffImage, nullptr, &maxVal); + + // Assert if the difference is acceptable (max difference should be less than 10) + CV_Assert(maxVal < 10 && "Difference between images is too high!"); +} + +TEST_P(ResizeBy2Test, ResizeBy2) { + + //Size size = get<0>(GetParam()); + Size size = GetParam(); + cv::Mat inputImage(size, CV_8UC1); + randu(inputImage, Scalar::all(0), Scalar::all(255)); // Fill with random values + + Size dsize; + cv::Mat resized_image; + + // Resize the image by a factor of 2 + cv::fastcv::resizeDownBy2(inputImage, resized_image); + + // Check if the output size is correct + EXPECT_EQ(resized_image.size().width, size.width * 0.5); + EXPECT_EQ(resized_image.size().height, size.height * 0.5); +} + +TEST_P(ResizeBy4Test, ResizeBy2) { + + //Size size = get<0>(GetParam()); + Size size = GetParam(); + cv::Mat inputImage(size, CV_8UC1); + randu(inputImage, Scalar::all(0), Scalar::all(255)); // Fill with random values + + Size dsize; + cv::Mat resized_image; + + // Resize the image by a factor of 2 + cv::fastcv::resizeDownBy4(inputImage, resized_image); + + // Check if the output size is correct + EXPECT_EQ(resized_image.size().width, size.width * 0.25); + EXPECT_EQ(resized_image.size().height, size.height * 0.25); +} + +INSTANTIATE_TEST_CASE_P( + ResizeTests, + ResizeBy2Test, + ::testing::Values(cv::Size(640, 480), cv::Size(1280, 720), cv::Size(1920, 1080) +)); + +INSTANTIATE_TEST_CASE_P( + ResizeTests, + ResizeBy4Test, + ::testing::Values(cv::Size(640, 480), cv::Size(1280, 720), cv::Size(1920, 1080) +)); + + +}} // namespaces opencv_test, :: \ No newline at end of file diff --git a/modules/fastcv/test/test_shift.cpp b/modules/fastcv/test/test_shift.cpp new file mode 100644 index 00000000000..1473f91d553 --- /dev/null +++ b/modules/fastcv/test/test_shift.cpp @@ -0,0 +1,75 @@ +/* + * 
Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef std::tuple MeanShiftTestParams; +class MeanShiftTest : public ::testing::TestWithParam {}; + +TEST_P(MeanShiftTest, accuracy) +{ + auto p = GetParam(); + cv::Size size = std::get<0>(p); + MatType type = std::get<1>(p); + int iters = std::get<2>(p); + float eps = std::get<3>(p); + Size winSize = std::get<4>(p); + + RNG& rng = cv::theRNG(); + + const int nPts = 20; + Mat ptsMap(size, CV_8UC1, Scalar(255)); + for(size_t i = 0; i < nPts; ++i) + { + ptsMap.at(rng() % size.height, rng() % size.width) = 0; + } + Mat distTrans(size, CV_8UC1); + cv::distanceTransform(ptsMap, distTrans, DIST_L2, DIST_MASK_PRECISE); + Mat vsrc = 255 - distTrans; + Mat src; + vsrc.convertTo(src, type); + + Point startPt(rng() % (size.width - winSize.width), + rng() % (size.height - winSize.height)); + Rect startRect(startPt, winSize); + + cv::TermCriteria termCrit( TermCriteria::EPS + TermCriteria::MAX_ITER, iters, eps); + + Rect window = startRect; + cv::fastcv::meanShift(src, window, termCrit); + + Rect windowRef = startRect; + cv::meanShift(vsrc, windowRef, termCrit); + + if (cvtest::debugLevel > 0) + { + Mat draw; + cvtColor(vsrc, draw, COLOR_GRAY2RGB); + cv::rectangle(draw, startRect, Scalar(0, 0, 255)); + cv::rectangle(draw, window, Scalar(255, 255, 0)); + cv::rectangle(draw, windowRef, Scalar(0, 255, 0)); + std::string stype = (type == CV_8U ? "8U" : (type == CV_32S ? "32S" : (type == CV_32F ? 
"F" : "?"))); + cv::imwrite(cv::format("src_%dx%d_%s_%dit_%feps_%dx%d.png", size.width, size.height, stype.c_str(), + iters, eps, winSize.width, winSize.height), + draw); + } + + cv::Point diff = (window.tl() - windowRef.tl()); + double dist = std::sqrt(diff.ddot(diff)); + + EXPECT_LE(dist, 3.0); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, MeanShiftTest, + ::testing::Combine(::testing::Values(Size(128, 128), Size(640, 480), Size(800, 600)), + ::testing::Values(CV_8U, CV_32S, CV_32F), // type + ::testing::Values(2, 10, 100), // nIterations + ::testing::Values(0.01f, 0.1f, 1.f, 10.f), // epsilon + ::testing::Values(Size(8, 8), Size(13, 48), Size(64, 64)) // window size + )); + +}} // namespaces opencv_test, :: diff --git a/modules/fastcv/test/test_smooth.cpp b/modules/fastcv/test/test_smooth.cpp new file mode 100644 index 00000000000..0b73baa5cd5 --- /dev/null +++ b/modules/fastcv/test/test_smooth.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef std::tuple BilateralTestParams; +class BilateralRecursiveTest : public ::testing::TestWithParam {}; + +TEST_P(BilateralRecursiveTest, accuracy) +{ + auto p = GetParam(); + float sigmaColor = std::get<0>(p); + float sigmaSpace = std::get<1>(p); + + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + Mat dst; + cv::fastcv::bilateralRecursive(src, dst, sigmaColor, sigmaSpace); + + // NOTE: test files should be manually loaded to folder on a device, for example like this: + // adb push fastcv/misc/bilateral_recursive/ /sdcard/testdata/fastcv/bilateral/ + cv::Mat ref = imread(cvtest::findDataFile(cv::format("fastcv/bilateral/rec_%2f_%2f.png", sigmaColor, sigmaSpace)), + IMREAD_GRAYSCALE); + + if (cvtest::debugLevel > 0) + { + cv::imwrite(cv::format("rec_%2f_%2f.png", sigmaColor, sigmaSpace), dst); + } + + double normInf = cvtest::norm(dst, ref, cv::NORM_INF); + double normL2 = cvtest::norm(dst, ref, cv::NORM_L2); + + ASSERT_LT(normInf, 1); + ASSERT_LT(normL2, 1.f / src.size().area()); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, BilateralRecursiveTest, + ::testing::Combine(::testing::Values(0.01f, 0.03f, 0.1f, 1.f, 5.f), + ::testing::Values(0.01f, 0.05f, 0.1f, 1.f, 5.f))); + +}} // namespaces opencv_test, :: diff --git a/modules/fastcv/test/test_thresh.cpp b/modules/fastcv/test/test_thresh.cpp new file mode 100644 index 00000000000..b56c784179b --- /dev/null +++ b/modules/fastcv/test/test_thresh.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef std::tuple ThresholdRangeTestParams; +class ThresholdRangeTest : public ::testing::TestWithParam {}; + +TEST_P(ThresholdRangeTest, accuracy) +{ + auto p = GetParam(); + cv::Size size = std::get<0>(p); + int loThresh = std::get<1>(p); + int hiThresh = std::get<2>(p); + int trueValue = std::get<3>(p); + int falseValue = std::get<4>(p); + + int lowThresh = std::min(loThresh, hiThresh); + int highThresh = std::max(loThresh, hiThresh); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + Mat dst; + cv::fastcv::thresholdRange(src, dst, lowThresh, highThresh, trueValue, falseValue); + + Mat inr, ref(src.size(), CV_8UC1); + cv::inRange(src, lowThresh, highThresh, inr); + ref.setTo(trueValue, inr); + ref.setTo(falseValue, ~inr); + + double normInf = cvtest::norm(ref, dst, cv::NORM_INF); + + EXPECT_EQ(normInf, 0); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, ThresholdRangeTest, + ::testing::Combine(::testing::Values(Size(8, 8), Size(640, 480), Size(800, 600)), + ::testing::Values(0, 15, 128, 255), // lowThresh + ::testing::Values(0, 15, 128, 255), // highThresh + ::testing::Values(0, 15, 128, 255), // trueValue + ::testing::Values(0, 15, 128, 255) // falseValue + )); + +}} // namespaces opencv_test, :: From 2e27b94092c442195815f35e591f46803c962ff9 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Mon, 2 Dec 2024 13:40:39 +0300 Subject: [PATCH 03/14] Warnings fix in FastCV module documentation. 
--- modules/fastcv/include/opencv2/fastcv.hpp | 10 +++++----- modules/fastcv/include/opencv2/fastcv/arithm.hpp | 1 + modules/fastcv/include/opencv2/fastcv/scale.hpp | 10 +++++----- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/modules/fastcv/include/opencv2/fastcv.hpp b/modules/fastcv/include/opencv2/fastcv.hpp index 4248a674076..fcf0bf132fb 100644 --- a/modules/fastcv/include/opencv2/fastcv.hpp +++ b/modules/fastcv/include/opencv2/fastcv.hpp @@ -8,10 +8,6 @@ #include -/** - * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions - */ - #include "opencv2/fastcv/arithm.hpp" #include "opencv2/fastcv/bilateralFilter.hpp" #include "opencv2/fastcv/cluster.hpp" @@ -27,6 +23,10 @@ #include "opencv2/fastcv/smooth.hpp" #include "opencv2/fastcv/thresh.hpp" -//! @} +/** + * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions + * @{ + * @} + */ #endif // OPENCV_FASTCV_ARITHM_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/arithm.hpp b/modules/fastcv/include/opencv2/fastcv/arithm.hpp index e90079946be..e479d970b1d 100644 --- a/modules/fastcv/include/opencv2/fastcv/arithm.hpp +++ b/modules/fastcv/include/opencv2/fastcv/arithm.hpp @@ -22,6 +22,7 @@ namespace fastcv { * @param dst Resulting matrix of type CV_32S */ CV_EXPORTS_W void matmuls8s32(InputArray src1, InputArray src2, OutputArray dst); + //! @} } // fastcv:: diff --git a/modules/fastcv/include/opencv2/fastcv/scale.hpp b/modules/fastcv/include/opencv2/fastcv/scale.hpp index e499f6f3b7d..8d7d084ac24 100644 --- a/modules/fastcv/include/opencv2/fastcv/scale.hpp +++ b/modules/fastcv/include/opencv2/fastcv/scale.hpp @@ -16,15 +16,15 @@ namespace fastcv { /** * @brief Down-scale the image by averaging each 2x2 pixel block. 
- * @param src The first input image data, type CV_8UC1, src height must be a multiple of 2 - * @param dst The output image data, type CV_8UC1 + * @param _src The first input image data, type CV_8UC1, src height must be a multiple of 2 + * @param _dst The output image data, type CV_8UC1 */ CV_EXPORTS_W void resizeDownBy2(cv::InputArray _src, cv::OutputArray _dst); /** * @brief Down-scale the image by averaging each 4x4 pixel block. - * @param src The first input image data, type CV_8UC1, src height must be a multiple of 4 - * @param dst The output image data, type CV_8UC1 + * @param _src The first input image data, type CV_8UC1, src height must be a multiple of 4 + * @param _dst The output image data, type CV_8UC1 */ CV_EXPORTS_W void resizeDownBy4(cv::InputArray _src, cv::OutputArray _dst); @@ -33,4 +33,4 @@ CV_EXPORTS_W void resizeDownBy4(cv::InputArray _src, cv::OutputArray _dst); } // fastcv:: } // cv:: -#endif // OPENCV_FASTCV_SCALE_HPP \ No newline at end of file +#endif // OPENCV_FASTCV_SCALE_HPP From 97637fad1a4413fea49e0ba869efbff12cd6f7c2 Mon Sep 17 00:00:00 2001 From: Kumataro Date: Sun, 8 Dec 2024 15:07:24 +0900 Subject: [PATCH 04/14] support C++20 standard - cast to calicurate mixing different enums - initilize member variables --- modules/text/src/erfilter.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/modules/text/src/erfilter.cpp b/modules/text/src/erfilter.cpp index 8af72b43cf5..0a2fb6acbda 100644 --- a/modules/text/src/erfilter.cpp +++ b/modules/text/src/erfilter.cpp @@ -1038,7 +1038,7 @@ double ERClassifierNM1::eval(const ERStat& stat) (float)(1-stat.euler), //number of holes stat.med_crossings); - float votes = boost->predict( sample, noArray(), DTrees::PREDICT_SUM | StatModel::RAW_OUTPUT); + float votes = boost->predict( sample, noArray(), (int)DTrees::PREDICT_SUM | (int)StatModel::RAW_OUTPUT); // Logistic Correction returns a probability value (in the range(0,1)) return (double)1-(double)1/(1+exp(-2*votes)); 
@@ -1070,7 +1070,7 @@ double ERClassifierNM2::eval(const ERStat& stat) stat.med_crossings, stat.hole_area_ratio, stat.convex_hull_ratio, stat.num_inflexion_points); - float votes = boost->predict( sample, noArray(), DTrees::PREDICT_SUM | StatModel::RAW_OUTPUT); + float votes = boost->predict( sample, noArray(), (int)DTrees::PREDICT_SUM | (int)StatModel::RAW_OUTPUT); // Logistic Correction returns a probability value (in the range(0,1)) return (double)1-(double)1/(1+exp(-2*votes)); @@ -2152,6 +2152,11 @@ void MaxMeaningfulClustering::build_merge_info(double *Z, double *X, int N, int { HCluster cluster; cluster.num_elem = (int)Z[i+3]; //number of elements + cluster.nfa = 0; + cluster.dist_ext = 0.0f; + cluster.max_meaningful = false; + cluster.min_nfa_in_branch = 0; + cluster.probability = 0.0; int node1 = (int)Z[i]; int node2 = (int)Z[i+1]; @@ -2611,7 +2616,7 @@ double MaxMeaningfulClustering::probability(vector &cluster) sample.push_back((float)mean[0]); sample.push_back((float)std[0]); - float votes_group = group_boost->predict( Mat(sample), noArray(), DTrees::PREDICT_SUM | StatModel::RAW_OUTPUT); + float votes_group = group_boost->predict( Mat(sample), noArray(), (int)DTrees::PREDICT_SUM | (int)StatModel::RAW_OUTPUT); return (double)1-(double)1/(1+exp(-2*votes_group)); } From c0d10878d4ff2677f82bf70d3b5afa115c4a48cb Mon Sep 17 00:00:00 2001 From: Vincent Rabaud Date: Tue, 10 Dec 2024 11:37:27 +0100 Subject: [PATCH 05/14] Get cudalegacy to compile with clang CUDA and without CUDA --- .../cudalegacy/include/opencv2/cudalegacy.hpp | 2 ++ modules/cudalegacy/src/NCV.cpp | 3 ++ .../cudalegacy/src/cuda/NCVBroxOpticalFlow.cu | 2 -- .../src/cuda/NCVHaarObjectDetection.cu | 4 +-- .../src/cuda/NCVPixelOperations.hpp | 4 +-- modules/cudalegacy/src/cuda/NPP_staging.cu | 32 +++++++++---------- modules/cudalegacy/src/cuda/needle_map.cu | 6 ++-- modules/cudalegacy/src/precomp.hpp | 2 ++ 8 files changed, 29 insertions(+), 26 deletions(-) diff --git 
a/modules/cudalegacy/include/opencv2/cudalegacy.hpp b/modules/cudalegacy/include/opencv2/cudalegacy.hpp index ace8548e35d..8230eaa2171 100644 --- a/modules/cudalegacy/include/opencv2/cudalegacy.hpp +++ b/modules/cudalegacy/include/opencv2/cudalegacy.hpp @@ -44,11 +44,13 @@ #define OPENCV_CUDALEGACY_HPP #include "opencv2/core/cuda.hpp" +#if defined (HAVE_CUDA) && !defined (CUDA_DISABLER) #include "opencv2/cudalegacy/NCV.hpp" #include "opencv2/cudalegacy/NPP_staging.hpp" #include "opencv2/cudalegacy/NCVPyramid.hpp" #include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp" #include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp" +#endif #include "opencv2/video/background_segm.hpp" /** diff --git a/modules/cudalegacy/src/NCV.cpp b/modules/cudalegacy/src/NCV.cpp index ddb7003fad0..bad767d0c35 100644 --- a/modules/cudalegacy/src/NCV.cpp +++ b/modules/cudalegacy/src/NCV.cpp @@ -42,6 +42,8 @@ #include "precomp.hpp" +#if defined (HAVE_CUDA) && !defined (CUDA_DISABLER) + //============================================================================== // // Error handling helpers @@ -886,3 +888,4 @@ NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst, { return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color); } +#endif diff --git a/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu b/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu index 3a527a010c3..d37ed9850f1 100644 --- a/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu +++ b/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu @@ -695,8 +695,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, //prepare image pyramid ImagePyramid pyr(desc.number_of_outer_iterations); - cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); - float scale = 1.0f; //cuda arrays for frames diff --git a/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu b/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu index 9760bcee523..b2a20798b64 100644 --- 
a/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu +++ b/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu @@ -193,7 +193,7 @@ __global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr if (tbDoAtomicCompaction) bInactiveThread = true; else return; } - if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread) + if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread)) { outMaskVal = d_inMask[maskOffset]; y_offs = outMaskVal >> 16; @@ -210,7 +210,7 @@ __global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr if (tbDoAtomicCompaction) bInactiveThread = true; else return; } - if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread) + if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread)) { maskOffset = y_offs * mask2Dstride + x_offs; diff --git a/modules/cudalegacy/src/cuda/NCVPixelOperations.hpp b/modules/cudalegacy/src/cuda/NCVPixelOperations.hpp index 3d570c5faac..fcebf576d98 100644 --- a/modules/cudalegacy/src/cuda/NCVPixelOperations.hpp +++ b/modules/cudalegacy/src/cuda/NCVPixelOperations.hpp @@ -84,7 +84,7 @@ template<> struct TConvVec2Base {typedef Ncv64f TBase;}; template<> struct TConvVec2Base {typedef Ncv64f TBase;}; template<> struct TConvVec2Base {typedef Ncv64f TBase;}; -#define NC(T) (sizeof(T) / sizeof(TConvVec2Base::TBase)) +#define NC(T) (sizeof(T) / sizeof(typename TConvVec2Base::TBase)) template struct TConvBase2Vec; template<> struct TConvBase2Vec {typedef uchar1 TVec;}; @@ -115,7 +115,7 @@ template inline __host__ __device__ void _TDemoteClampNN(Tin &a, N template inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a+0.5f, 0, UINT_MAX);} template inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32f &out) {out = (Ncv32f)a;} -template inline Tout _pixMakeZero(); +template inline __host__ __device__ Tout _pixMakeZero(); template<> inline __host__ __device__ uchar1 _pixMakeZero() {return make_uchar1(0);} 
template<> inline __host__ __device__ uchar3 _pixMakeZero() {return make_uchar3(0,0,0);} template<> inline __host__ __device__ uchar4 _pixMakeZero() {return make_uchar4(0,0,0,0);} diff --git a/modules/cudalegacy/src/cuda/NPP_staging.cu b/modules/cudalegacy/src/cuda/NPP_staging.cu index b7a24ee0360..36df5645a62 100644 --- a/modules/cudalegacy/src/cuda/NPP_staging.cu +++ b/modules/cudalegacy/src/cuda/NPP_staging.cu @@ -85,26 +85,24 @@ const Ncv32u NUM_SCAN_THREADS = 256; const Ncv32u LOG2_NUM_SCAN_THREADS = 8; -template +template struct _scanElemOp { - template - static inline __host__ __device__ T_out scanElemOp(T_in elem) - { - return scanElemOp( elem, Int2Type<(int)tbDoSqr>() ); - } - -private: - - template struct Int2Type { enum { value = v }; }; + static __host__ __device__ T_out scanElemOp(T_in elem); +}; - static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<0>) - { - return (T_out)elem; +template +struct _scanElemOp +{ + static inline __host__ __device__ T_out scanElemOp(T_in elem) { + return (T_out)(elem); } +}; - static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<1>) - { +template +struct _scanElemOp +{ + static inline __host__ __device__ T_out scanElemOp(T_in elem) { return (T_out)(elem*elem); } }; @@ -177,7 +175,7 @@ __global__ void scanRows(cv::cudev::TexturePtr tex8u, T_in *d_src, Ncv32u Ncv32u curElemOffs = offsetX + threadIdx.x; T_out curScanElem; - T_in curElem; + T_in curElem = 0; T_out curElemMod; if (curElemOffs < srcWidth) @@ -185,7 +183,7 @@ __global__ void scanRows(cv::cudev::TexturePtr tex8u, T_in *d_src, Ncv32u //load elements curElem = readElem(tex8u, d_src, texOffs, srcStride, curElemOffs); } - curElemMod = _scanElemOp::scanElemOp(curElem); + curElemMod = _scanElemOp::scanElemOp(curElem); //inclusive scan curScanElem = cv::cudev::blockScanInclusive(curElemMod, shmem, threadIdx.x); diff --git a/modules/cudalegacy/src/cuda/needle_map.cu b/modules/cudalegacy/src/cuda/needle_map.cu index 
a98b17cafed..c5297281025 100644 --- a/modules/cudalegacy/src/cuda/needle_map.cu +++ b/modules/cudalegacy/src/cuda/needle_map.cu @@ -76,19 +76,19 @@ namespace cv { namespace cuda { namespace device // now add the column sums const uint X = threadIdx.x; - if (X | 0xfe == 0xfe) // bit 0 is 0 + if (X | (0xfe == 0xfe)) // bit 0 is 0 { u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 1]; v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1]; } - if (X | 0xfe == 0xfc) // bits 0 & 1 == 0 + if (X | (0xfe == 0xfc)) // bits 0 & 1 == 0 { u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2]; v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2]; } - if (X | 0xf8 == 0xf8) + if (X | (0xf8 == 0xf8)) { u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 4]; v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 4]; diff --git a/modules/cudalegacy/src/precomp.hpp b/modules/cudalegacy/src/precomp.hpp index e87cc8620c9..4524e04b9cb 100644 --- a/modules/cudalegacy/src/precomp.hpp +++ b/modules/cudalegacy/src/precomp.hpp @@ -80,6 +80,8 @@ #endif #include "opencv2/core/private.cuda.hpp" +#if defined (HAVE_CUDA) && !defined (CUDA_DISABLER) #include "opencv2/cudalegacy/private.hpp" +#endif #endif /* __OPENCV_PRECOMP_H__ */ From 8dc76950e689694cdca86a6d7d1b0b08cc23f40a Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Tue, 17 Dec 2024 01:52:10 +0100 Subject: [PATCH 06/14] FastCV bilateral recursive: test files updated --- modules/fastcv/test/test_smooth.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/modules/fastcv/test/test_smooth.cpp b/modules/fastcv/test/test_smooth.cpp index 0b73baa5cd5..47c85152ebf 100644 --- a/modules/fastcv/test/test_smooth.cpp +++ b/modules/fastcv/test/test_smooth.cpp @@ -39,7 +39,14 @@ TEST_P(BilateralRecursiveTest, accuracy) } INSTANTIATE_TEST_CASE_P(FastCV_Extension, BilateralRecursiveTest, - ::testing::Combine(::testing::Values(0.01f, 0.03f, 0.1f, 1.f, 5.f), - ::testing::Values(0.01f, 0.05f, 0.1f, 1.f, 5.f))); + 
::testing::Values( + BilateralTestParams {0.01f, 1.00f}, + BilateralTestParams {0.10f, 0.01f}, + BilateralTestParams {1.00f, 0.01f}, + BilateralTestParams {1.00f, 1.00f}, + BilateralTestParams {5.00f, 0.01f}, + BilateralTestParams {5.00f, 0.10f}, + BilateralTestParams {5.00f, 5.00f} + )); }} // namespaces opencv_test, :: From aad06709916749e509645a460de983332096462b Mon Sep 17 00:00:00 2001 From: Vincent Rabaud Date: Thu, 19 Dec 2024 13:13:29 +0100 Subject: [PATCH 07/14] Get cudalegacy dependents to build without CUDA --- modules/cudalegacy/include/opencv2/cudalegacy/private.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/cudalegacy/include/opencv2/cudalegacy/private.hpp b/modules/cudalegacy/include/opencv2/cudalegacy/private.hpp index 79f9e635bf0..fd3286c17eb 100644 --- a/modules/cudalegacy/include/opencv2/cudalegacy/private.hpp +++ b/modules/cudalegacy/include/opencv2/cudalegacy/private.hpp @@ -50,9 +50,7 @@ #include "opencv2/core/private.cuda.hpp" -#ifndef HAVE_CUDA -# error cudalegacy module requires CUDA -#endif +#ifdef HAVE_CUDA #include "opencv2/cudalegacy.hpp" @@ -93,4 +91,6 @@ namespace cv { namespace cuda //! @endcond +#endif // HAVE_CUDA + #endif // OPENCV_CORE_CUDALEGACY_PRIVATE_HPP From 67815e94c84b9ac168df8b25293378bd5e7578ae Mon Sep 17 00:00:00 2001 From: quic-xuezha Date: Fri, 20 Dec 2024 23:13:09 +0800 Subject: [PATCH 08/14] Merge pull request #3844 from CodeLinaro:xuezha_2ndPost FastCV Extension code for OpenCV 2ndpost-1 #3844 Depends on: [opencv/opencv#26617](https://github.com/opencv/opencv/pull/26617) Requires binary from [opencv/opencv_3rdparty#90](https://github.com/opencv/opencv_3rdparty/pull/90) ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake --- modules/fastcv/README.md | 3 +- modules/fastcv/include/opencv2/fastcv.hpp | 8 +- .../fastcv/include/opencv2/fastcv/arithm.hpp | 3 +- .../fastcv/include/opencv2/fastcv/blur.hpp | 64 +++ .../fastcv/include/opencv2/fastcv/cluster.hpp | 3 +- .../fastcv/include/opencv2/fastcv/draw.hpp | 2 +- .../fastcv/include/opencv2/fastcv/edges.hpp | 53 +++ .../fastcv/include/opencv2/fastcv/fast10.hpp | 7 +- modules/fastcv/include/opencv2/fastcv/fft.hpp | 2 +- .../fastcv/include/opencv2/fastcv/hough.hpp | 23 +- .../include/opencv2/fastcv/ipptransform.hpp | 39 ++ .../fastcv/include/opencv2/fastcv/moments.hpp | 5 +- .../fastcv/include/opencv2/fastcv/mser.hpp | 181 +++++---- .../fastcv/include/opencv2/fastcv/pyramid.hpp | 51 +++ .../fastcv/include/opencv2/fastcv/scale.hpp | 2 + .../fastcv/include/opencv2/fastcv/shift.hpp | 7 +- .../fastcv/include/opencv2/fastcv/smooth.hpp | 1 + .../fastcv/include/opencv2/fastcv/thresh.hpp | 2 +- .../include/opencv2/fastcv/tracking.hpp | 66 ++++ .../fastcv/include/opencv2/fastcv/warp.hpp | 38 ++ modules/fastcv/perf/perf_bilateral.cpp | 21 +- modules/fastcv/perf/perf_blur.cpp | 123 ++++++ modules/fastcv/perf/perf_edges.cpp | 68 ++++ modules/fastcv/perf/perf_fft_dct.cpp | 104 +++++ modules/fastcv/perf/perf_mser.cpp | 21 +- modules/fastcv/perf/perf_pyramid.cpp | 76 ++++ modules/fastcv/perf/perf_tracking.cpp | 98 +++++ modules/fastcv/perf/perf_warp.cpp | 62 +++ modules/fastcv/src/bilateralFilter.cpp | 78 ++-- modules/fastcv/src/blur.cpp | 365 ++++++++++++++++++ 
modules/fastcv/src/edges.cpp | 125 ++++++ modules/fastcv/src/ipptransform.cpp | 48 +++ modules/fastcv/src/mser.cpp | 151 +++++--- modules/fastcv/src/precomp.hpp | 3 + modules/fastcv/src/pyramid.cpp | 183 +++++++++ modules/fastcv/src/remap.cpp | 20 +- modules/fastcv/src/tracking.cpp | 269 +++++++++++++ modules/fastcv/src/warp.cpp | 76 ++++ modules/fastcv/test/test_bilateral.cpp | 12 +- modules/fastcv/test/test_blur.cpp | 129 +++++++ modules/fastcv/test/test_edges.cpp | 74 ++++ modules/fastcv/test/test_fft.cpp | 1 - modules/fastcv/test/test_ipptransform.cpp | 80 ++++ modules/fastcv/test/test_moments.cpp | 5 +- modules/fastcv/test/test_mser.cpp | 20 +- modules/fastcv/test/test_precomp.hpp | 1 + modules/fastcv/test/test_pyramid.cpp | 171 ++++++++ modules/fastcv/test/test_remap.cpp | 15 +- modules/fastcv/test/test_scale.cpp | 27 +- modules/fastcv/test/test_tracking.cpp | 142 +++++++ modules/fastcv/test/test_warp.cpp | 70 ++++ 51 files changed, 2917 insertions(+), 281 deletions(-) create mode 100644 modules/fastcv/include/opencv2/fastcv/blur.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/edges.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/ipptransform.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/pyramid.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/tracking.hpp create mode 100644 modules/fastcv/include/opencv2/fastcv/warp.hpp create mode 100644 modules/fastcv/perf/perf_blur.cpp create mode 100644 modules/fastcv/perf/perf_edges.cpp create mode 100644 modules/fastcv/perf/perf_fft_dct.cpp create mode 100644 modules/fastcv/perf/perf_pyramid.cpp create mode 100644 modules/fastcv/perf/perf_tracking.cpp create mode 100644 modules/fastcv/perf/perf_warp.cpp create mode 100644 modules/fastcv/src/blur.cpp create mode 100644 modules/fastcv/src/edges.cpp create mode 100644 modules/fastcv/src/ipptransform.cpp create mode 100644 modules/fastcv/src/pyramid.cpp create mode 100644 modules/fastcv/src/tracking.cpp create mode 
100644 modules/fastcv/src/warp.cpp create mode 100644 modules/fastcv/test/test_blur.cpp create mode 100644 modules/fastcv/test/test_edges.cpp create mode 100644 modules/fastcv/test/test_ipptransform.cpp create mode 100644 modules/fastcv/test/test_pyramid.cpp create mode 100644 modules/fastcv/test/test_tracking.cpp create mode 100644 modules/fastcv/test/test_warp.cpp diff --git a/modules/fastcv/README.md b/modules/fastcv/README.md index 0c7323c086c..076a4108de0 100644 --- a/modules/fastcv/README.md +++ b/modules/fastcv/README.md @@ -3,5 +3,4 @@ FastCV extension for OpenCV This module provides wrappers for several FastCV functions not covered by the corresponding HAL in OpenCV or have implementation incompatible with OpenCV. Please note that: -1. This module supports ARM architecture only. This means that CMake script aborts configuration under x86 platform even if you don't want to build binaries for your machine and just want to build docs or enable code analysis in your IDE. In that case you should fix CMakeLists.txt file as told inside it. -2. Test data is stored in misc folder. Before running tests on a device you should copy the content of `misc/` folder to `$YOUR_TESTDATA_PATH/fastcv/` folder on a device. +1. This module supports ARM architecture only. This means that CMake script will not configure or build under x86 platform. 
\ No newline at end of file diff --git a/modules/fastcv/include/opencv2/fastcv.hpp b/modules/fastcv/include/opencv2/fastcv.hpp index fcf0bf132fb..6ed8eba4a33 100644 --- a/modules/fastcv/include/opencv2/fastcv.hpp +++ b/modules/fastcv/include/opencv2/fastcv.hpp @@ -10,18 +10,24 @@ #include "opencv2/fastcv/arithm.hpp" #include "opencv2/fastcv/bilateralFilter.hpp" +#include "opencv2/fastcv/blur.hpp" #include "opencv2/fastcv/cluster.hpp" #include "opencv2/fastcv/draw.hpp" +#include "opencv2/fastcv/edges.hpp" #include "opencv2/fastcv/fast10.hpp" #include "opencv2/fastcv/fft.hpp" #include "opencv2/fastcv/hough.hpp" +#include "opencv2/fastcv/ipptransform.hpp" #include "opencv2/fastcv/moments.hpp" #include "opencv2/fastcv/mser.hpp" +#include "opencv2/fastcv/pyramid.hpp" #include "opencv2/fastcv/remap.hpp" #include "opencv2/fastcv/scale.hpp" #include "opencv2/fastcv/shift.hpp" #include "opencv2/fastcv/smooth.hpp" #include "opencv2/fastcv/thresh.hpp" +#include "opencv2/fastcv/tracking.hpp" +#include "opencv2/fastcv/warp.hpp" /** * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions @@ -29,4 +35,4 @@ * @} */ -#endif // OPENCV_FASTCV_ARITHM_HPP +#endif // OPENCV_FASTCV_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/arithm.hpp b/modules/fastcv/include/opencv2/fastcv/arithm.hpp index e479d970b1d..5a0c43b2408 100644 --- a/modules/fastcv/include/opencv2/fastcv/arithm.hpp +++ b/modules/fastcv/include/opencv2/fastcv/arithm.hpp @@ -16,7 +16,8 @@ namespace fastcv { /** * @brief Matrix multiplication of two int8_t type matrices - + * uses signed integer input/output whereas cv::gemm uses floating point input/output + * matmuls8s32 provides enhanced speed on Qualcomm's processors * @param src1 First source matrix of type CV_8S * @param src2 Second source matrix of type CV_8S * @param dst Resulting matrix of type CV_32S diff --git a/modules/fastcv/include/opencv2/fastcv/blur.hpp b/modules/fastcv/include/opencv2/fastcv/blur.hpp new file mode 100644 index 
00000000000..99d1cd3d655 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/blur.hpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_BLUR_HPP +#define OPENCV_FASTCV_BLUR_HPP + +#include + +namespace cv { +namespace fastcv { + +/** + * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions + */ + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Gaussian blur with sigma = 0 and square kernel size. The way of handling borders is different with cv::GaussianBlur, + * leading to slight variations in the output. + * @param _src Intput image with type CV_8UC1 + * @param _dst Output image with type CV_8UC1 + * @param kernel_size Filer kernel size. One of 3, 5, 11 + * @param blur_border If set to true, border is blurred by 0-padding adjacent values.(A variant of the constant border) + * If set to false, borders up to half-kernel width are ignored (e.g. 1 pixel in the 3x3 case). + * + * @sa GaussianBlur + */ +CV_EXPORTS_W void gaussianBlur(InputArray _src, OutputArray _dst, int kernel_size = 3, bool blur_border = true); + +/** + * @brief NxN correlation with non-separable kernel. Borders up to half-kernel width are ignored + * @param _src Intput image with type CV_8UC1 + * @param _dst Output image with type CV_8UC1, CV_16SC1 or CV_32FC1 + * @param ddepth The depth of output image + * @param _kernel Filer kernel data + * + * @sa Filter2D + */ +CV_EXPORTS_W void filter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernel); + +/** + * @brief NxN correlation with separable kernel. If srcImg and dstImg point to the same address and srcStride equals to dstStride, + * it will do in-place. Borders up to half-kernel width are ignored. + * The way of handling overflow is different with OpenCV, this function will do right shift for + * the intermediate results and final result. 
+ * @param _src Intput image with type CV_8UC1 + * @param _dst Output image with type CV_8UC1, CV_16SC1 + * @param ddepth The depth of output image + * @param _kernelX Filer kernel data in x direction + * @param _kernelY Filer kernel data in Y direction (For CV_16SC1, the kernelX and kernelY should be same) + * + * @sa sepFilter2D + */ +CV_EXPORTS_W void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY); +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_BLUR_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/cluster.hpp b/modules/fastcv/include/opencv2/fastcv/cluster.hpp index f90deeae465..46ac7ad103d 100644 --- a/modules/fastcv/include/opencv2/fastcv/cluster.hpp +++ b/modules/fastcv/include/opencv2/fastcv/cluster.hpp @@ -16,7 +16,8 @@ namespace fastcv { /** * @brief Clusterizes N input points in D-dimensional space into K clusters - * + * Accepts 8-bit unsigned integer points + * Provides faster execution time than cv::kmeans on Qualcomm's processors * @param points Points array of type 8u, each row represets a point. * Size is N rows by D columns, can be non-continuous. * @param clusterCenters Initial cluster centers array of type 32f, each row represents a center. diff --git a/modules/fastcv/include/opencv2/fastcv/draw.hpp b/modules/fastcv/include/opencv2/fastcv/draw.hpp index baa2b58c930..1abb5f55080 100644 --- a/modules/fastcv/include/opencv2/fastcv/draw.hpp +++ b/modules/fastcv/include/opencv2/fastcv/draw.hpp @@ -17,7 +17,7 @@ namespace fastcv { /** * @brief Draw convex polygon This function fills the interior of a convex polygon with the specified color. - + Requires the width and stride to be multple of 8. * @param img Image to draw on. Should have up to 4 8-bit channels * @param pts Array of polygon points coordinates. 
Should contain N two-channel or 2*N one-channel 32-bit integer elements * @param color Color of drawn polygon stored as B,G,R and A(if supported) diff --git a/modules/fastcv/include/opencv2/fastcv/edges.hpp b/modules/fastcv/include/opencv2/fastcv/edges.hpp new file mode 100644 index 00000000000..dd2677bf415 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/edges.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_EDGES_HPP +#define OPENCV_EDGES_HPP + +#include "opencv2/core/mat.hpp" + +namespace cv { +namespace fastcv { +/** + * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions + */ + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Creates a 2D gradient image from source luminance data without normalization. + * Calculate X direction 1 order derivative or Y direction 1 order derivative or both at the same time, . + * @param _src Input image with type CV_8UC1 + * @param _dx Buffer to store horizontal gradient. Must be (dxyStride)*(height) bytes in size. + * If NULL, the horizontal gradient will not be calculated. + * @param _dy Buffer to store vertical gradient. Must be (dxyStride)*(height) bytes in size. + * If NULL, the vertical gradient will not be calculated + * @param kernel_size Sobel kernel size, support 3x3, 5x5, 7x7 + * @param borderType Border type, support BORDER_CONSTANT, BORDER_REPLICATE + * @param borderValue Border value for constant border +*/ +CV_EXPORTS_W void sobel(InputArray _src, OutputArray _dx, OutputArray _dy, int kernel_size, int borderType, int borderValue); + +/** + * @brief Creates a 2D gradient image from source luminance data without normalization. + * This function computes central differences on 3x3 neighborhood and then convolves the result with Sobel kernel, + * borders up to half-kernel width are ignored. 
+ * @param _src Input image with type CV_8UC1 + * @param _dst If _dsty is given, buffer to store horizontal gradient, otherwise, output 8-bit image of |dx|+|dy|. + * Size of buffer is (srcwidth)*(srcheight) bytes + * @param _dsty (Optional)Buffer to store vertical gradient. Must be (srcwidth)*(srcheight) in size. + * @param ddepth The depth of output image CV_8SC1,CV_16SC1,CV_32FC1, + * @param normalization If do normalization for the result +*/ +CV_EXPORTS_W void sobel3x3u8(InputArray _src, OutputArray _dst, OutputArray _dsty = noArray(), int ddepth = CV_8U, + bool normalization = false); + +//! @} + +} +} + +#endif diff --git a/modules/fastcv/include/opencv2/fastcv/fast10.hpp b/modules/fastcv/include/opencv2/fastcv/fast10.hpp index 1d97e9d0df7..1dd15ac198c 100644 --- a/modules/fastcv/include/opencv2/fastcv/fast10.hpp +++ b/modules/fastcv/include/opencv2/fastcv/fast10.hpp @@ -15,9 +15,10 @@ namespace fastcv { //! @{ /** - * @brief Extracts FAST corners and scores from the image based on the mask. - The mask specifies pixels to be ignored by the detector - + * @brief Extracts FAST10 corners and scores from the image based on the mask. + * The mask specifies pixels to be ignored by the detector + * designed for corner detection on Qualcomm's processors, provides enhanced speed. + * * @param src 8-bit grayscale image * @param mask Optional mask indicating which pixels should be omited from corner dection. Its size should be k times image width and height, where k = 1/2, 1/4 , 1/8 , 1, 2, 4 and 8 diff --git a/modules/fastcv/include/opencv2/fastcv/fft.hpp b/modules/fastcv/include/opencv2/fastcv/fft.hpp index 88901a6a4f8..1aef585035b 100644 --- a/modules/fastcv/include/opencv2/fastcv/fft.hpp +++ b/modules/fastcv/include/opencv2/fastcv/fft.hpp @@ -18,7 +18,7 @@ namespace fastcv { * @brief Computes the 1D or 2D Fast Fourier Transform of a real valued matrix. For the 2D case, the width and height of the input and output matrix must be powers of 2. 
For the 1D case, the height of the matrices must be 1, while the width must be a power of 2. - + Accepts 8-bit unsigned integer array, whereas cv::dft accepts floating-point or complex array. * @param src Input array of CV_8UC1. The dimensions of the matrix must be powers of 2 for the 2D case, and in the 1D case, the height must be 1, while the width must be a power of 2. * @param dst The computed FFT matrix of type CV_32FC2. The FFT Re and Im coefficients are stored in different channels. diff --git a/modules/fastcv/include/opencv2/fastcv/hough.hpp b/modules/fastcv/include/opencv2/fastcv/hough.hpp index 74f78a10841..e43323903cb 100644 --- a/modules/fastcv/include/opencv2/fastcv/hough.hpp +++ b/modules/fastcv/include/opencv2/fastcv/hough.hpp @@ -16,7 +16,7 @@ namespace fastcv { /** * @brief Performs Hough Line detection - * + * * @param src Input 8-bit image containing binary contour. Width and step should be divisible by 8 * @param lines Output array containing detected lines in a form of (x1, y1, x2, y2) where all numbers are 32-bit floats * @param threshold Controls the minimal length of a detected line. Value must be between 0.0 and 1.0 @@ -25,6 +25,27 @@ namespace fastcv { */ CV_EXPORTS_W void houghLines(InputArray src, OutputArray lines, double threshold = 0.25); + +/** + * @brief Finds circles in a grayscale image using Hough transform. + * The radius of circle varies from 0 to max(srcWidth, srcHeight). + * + * @param src Input 8-bit image containing binary contour. Step should be divisible by 8, data start should be 128-bit aligned + * @param circles Output array containing detected circles in a form (x, y, r) where all numbers are 32-bit integers + * @param minDist Minimum distance between the centers of the detected circles + * @param cannyThreshold The higher threshold of the two passed to the Canny() edge detector + * (the lower one is twice smaller). Default is 100. 
+ * @param accThreshold The accumulator threshold for the circle centers at the detection + * stage. The smaller it is, the more false circles may be detected. + * Circles, corresponding to the larger accumulator values, will be + * returned first. Default is 100. + * @param minRadius Minimum circle radius, default is 0 + * @param maxRadius Maximum circle radius, default is 0 + */ +CV_EXPORTS_W void houghCircles(InputArray src, OutputArray circles, uint32_t minDist, + uint32_t cannyThreshold = 100, uint32_t accThreshold = 100, + uint32_t minRadius = 0, uint32_t maxRadius = 0); + //! @} } // fastcv:: diff --git a/modules/fastcv/include/opencv2/fastcv/ipptransform.hpp b/modules/fastcv/include/opencv2/fastcv/ipptransform.hpp new file mode 100644 index 00000000000..42c8c94ea78 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/ipptransform.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_IPPTRANSFORM_HPP +#define OPENCV_FASTCV_IPPTRANSFORM_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief This function performs 8x8 forward discrete Cosine transform on input image + * accepts input of type 8-bit unsigned integer and produces output of type 16-bit signed integer + * provides faster execution time than cv::dct on Qualcomm's processor + * @param src Input image of type CV_8UC1 + * @param dst Output image of type CV_16SC1 + */ +CV_EXPORTS_W void DCT(InputArray src, OutputArray dst); + +/** + * @brief This function performs 8x8 inverse discrete Cosine transform on input image + * provides faster execution time than cv::dct in inverse case on Qualcomm's processor + * @param src Input image of type CV_16SC1 + * @param dst Output image of type CV_8UC1 + */ +CV_EXPORTS_W void IDCT(InputArray src, OutputArray dst); + +//! 
@} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_IPPTRANSFORM_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/moments.hpp b/modules/fastcv/include/opencv2/fastcv/moments.hpp index 3cffa62f767..90034548571 100644 --- a/modules/fastcv/include/opencv2/fastcv/moments.hpp +++ b/modules/fastcv/include/opencv2/fastcv/moments.hpp @@ -17,8 +17,9 @@ namespace fastcv { /** * @brief Calculates all of the moments up to the third order of the image pixels' intensities The results are returned in the structure cv::Moments. - * @param _src Input image with type CV_8UC1, CV_32SC1, CV_32FC1 - * @param binary If 1, binary image (0x00-black, oxff-white); if 0, grayscale image + * @param _src Input image with type CV_8UC1, CV_32SC1, CV_32FC1 + * @param binary If true, assumes the image to be binary (0x00 for black, 0xff for white), otherwise assumes the image to be + * grayscale. */ CV_EXPORTS cv::Moments moments(InputArray _src, bool binary); diff --git a/modules/fastcv/include/opencv2/fastcv/mser.hpp b/modules/fastcv/include/opencv2/fastcv/mser.hpp index 78282b66fdd..bfa898544f5 100644 --- a/modules/fastcv/include/opencv2/fastcv/mser.hpp +++ b/modules/fastcv/include/opencv2/fastcv/mser.hpp @@ -15,107 +15,98 @@ namespace fastcv { //! @{ /** - * @brief Structure containing additional information about found contour + * @brief MSER blob detector for grayscale images * */ -struct ContourData +class CV_EXPORTS_W FCVMSER { - uint32_t variation; //!< Variation of a contour from previous grey level - int32_t polarity; //!< Polarity for a contour. This value is 1 if this is a MSER+ region, -1 if this is a MSER- region. - uint32_t nodeId; //!< Node ID for a contour - uint32_t nodeCounter; //!< Node counter for a contour -}; +public: -/** - * @brief This is an overload for MSER() function - * - * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. - Pixels at the image boundary are not processed. 
If boundary pixels are important - for a particular application, please consider padding the input image with dummy - pixels of one pixel wide. - * @param contours Array containing found contours - * @param numNeighbors Number of neighbors in contours, can be 4 or 8 - * @param delta Delta to be used in MSER algorithm (the difference in grayscale values - within which the region is stable ). - Typical value range [0.8 8], typical value 2 - * @param minArea Minimum area (number of pixels) of a mser contour. - Typical value range [10 50], typical value 30 - * @param maxArea Maximum area (number of pixels) of a mser contour. - Typical value 14400 or 0.25*width*height - * @param maxVariation Maximum variation in grayscale between 2 levels allowed. - Typical value range [0.1 1.0], typical value 0.15 - * @param minDiversity Minimum diversity in grayscale between 2 levels allowed. - Typical value range [0.1 1.0], typical value 0.2 - */ -CV_EXPORTS void MSER(InputArray src, std::vector>& contours, - unsigned int numNeighbors = 4, - unsigned int delta = 2, - unsigned int minArea = 30, - unsigned int maxArea = 14400, - float maxVariation = 0.15f, - float minDiversity = 0.2f); + /** + * @brief Structure containing additional information about found contour + * + */ + struct ContourData + { + uint32_t variation; //!< Variation of a contour from previous grey level + int32_t polarity; //!< Polarity for a contour. This value is 1 if this is a MSER+ region, -1 if this is a MSER- region. + uint32_t nodeId; //!< Node ID for a contour + uint32_t nodeCounter; //!< Node counter for a contour + }; -/** - * @brief This is an overload for MSER() function - * - * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. - Pixels at the image boundary are not processed. If boundary pixels are important - for a particular application, please consider padding the input image with dummy - pixels of one pixel wide. 
- * @param contours Array containing found contours - * @param boundingBoxes Array containing bounding boxes of found contours - * @param numNeighbors Number of neighbors in contours, can be 4 or 8 - * @param delta Delta to be used in MSER algorithm (the difference in grayscale values - within which the region is stable ). - Typical value range [0.8 8], typical value 2 - * @param minArea Minimum area (number of pixels) of a mser contour. - Typical value range [10 50], typical value 30 - * @param maxArea Maximum area (number of pixels) of a mser contour. - Typical value 14400 or 0.25*width*height - * @param maxVariation Maximum variation in grayscale between 2 levels allowed. - Typical value range [0.1 1.0], typical value 0.15 - * @param minDiversity Minimum diversity in grayscale between 2 levels allowed. - Typical value range [0.1 1.0], typical value 0.2 - */ -CV_EXPORTS void MSER(InputArray src, std::vector>& contours, std::vector& boundingBoxes, - unsigned int numNeighbors = 4, - unsigned int delta = 2, - unsigned int minArea = 30, - unsigned int maxArea = 14400, - float maxVariation = 0.15f, - float minDiversity = 0.2f); + /** + * @brief Creates MSER detector + * + * @param imgSize Image size. Image width has to be greater than 50, and image height has to be greater than 5. + * @param numNeighbors Number of neighbors in contours, can be 4 or 8 + * @param delta Delta to be used in MSER algorithm (the difference in grayscale values + within which the region is stable ). + Typical value range [0.8 8], typical value 2 + * @param minArea Minimum area (number of pixels) of a mser contour. + Typical value range [10 50], typical value 30 + * @param maxArea Maximum area (number of pixels) of a mser contour. + Typical value 14400 or 0.25*width*height + * @param maxVariation Maximum variation in grayscale between 2 levels allowed. + Typical value range [0.1 1.0], typical value 0.15 + * @param minDiversity Minimum diversity in grayscale between 2 levels allowed. 
+ Typical value range [0.1 1.0], typical value 0.2 + * @return Feature detector object ready for detection + */ + CV_WRAP static Ptr create( cv::Size imgSize, + uint32_t numNeighbors = 4, + uint32_t delta = 2, + uint32_t minArea = 30, + uint32_t maxArea = 14400, + float maxVariation = 0.15f, + float minDiversity = 0.2f); -/** - * @brief Runs MSER blob detector on the grayscale image - * - * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. - Pixels at the image boundary are not processed. If boundary pixels are important - for a particular application, please consider padding the input image with dummy - pixels of one pixel wide. - * @param contours Array containing found contours - * @param boundingBoxes Array containing bounding boxes of found contours - * @param contourData Array containing additional information about found contours - * @param numNeighbors Number of neighbors in contours, can be 4 or 8 - * @param delta Delta to be used in MSER algorithm (the difference in grayscale values - within which the region is stable ). - Typical value range [0.8 8], typical value 2 - * @param minArea Minimum area (number of pixels) of a mser contour. - Typical value range [10 50], typical value 30 - * @param maxArea Maximum area (number of pixels) of a mser contour. - Typical value 14400 or 0.25*width*height - * @param maxVariation Maximum variation in grayscale between 2 levels allowed. - Typical value range [0.1 1.0], typical value 0.15 - * @param minDiversity Minimum diversity in grayscale between 2 levels allowed. 
- Typical value range [0.1 1.0], typical value 0.2 - */ -CV_EXPORTS void MSER(InputArray src, std::vector>& contours, std::vector& boundingBoxes, - std::vector& contourData, - unsigned int numNeighbors = 4, - unsigned int delta = 2, - unsigned int minArea = 30, - unsigned int maxArea = 14400, - float maxVariation = 0.15f, - float minDiversity = 0.2f); + /** + * @brief This is an overload for detect() function + * + * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. + Pixels at the image boundary are not processed. If boundary pixels are important + for a particular application, please consider padding the input image with dummy + pixels of one pixel wide. + * @param contours Array containing found contours + */ + CV_WRAP virtual void detect(InputArray src, std::vector>& contours) = 0; + + /** + * @brief This is an overload for detect() function + * + * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. + Pixels at the image boundary are not processed. If boundary pixels are important + for a particular application, please consider padding the input image with dummy + pixels of one pixel wide. + * @param contours Array containing found contours + * @param boundingBoxes Array containing bounding boxes of found contours + */ + CV_WRAP virtual void detect(InputArray src, std::vector>& contours, std::vector& boundingBoxes) = 0; + + /** + * @brief Runs MSER blob detector on the grayscale image + * + * @param src Source image of type CV_8UC1. Image width has to be greater than 50, and image height has to be greater than 5. + Pixels at the image boundary are not processed. If boundary pixels are important + for a particular application, please consider padding the input image with dummy + pixels of one pixel wide. 
+ * @param contours Array containing found contours + * @param boundingBoxes Array containing bounding boxes of found contours + * @param contourData Array containing additional information about found contours + */ + virtual void detect(InputArray src, std::vector>& contours, std::vector& boundingBoxes, + std::vector& contourData) = 0; + + CV_WRAP virtual cv::Size getImgSize() = 0; + CV_WRAP virtual uint32_t getNumNeighbors() = 0; + CV_WRAP virtual uint32_t getDelta() = 0; + CV_WRAP virtual uint32_t getMinArea() = 0; + CV_WRAP virtual uint32_t getMaxArea() = 0; + CV_WRAP virtual float getMaxVariation() = 0; + CV_WRAP virtual float getMinDiversity() = 0; + + virtual ~FCVMSER() {} +}; //! @} diff --git a/modules/fastcv/include/opencv2/fastcv/pyramid.hpp b/modules/fastcv/include/opencv2/fastcv/pyramid.hpp new file mode 100644 index 00000000000..6c20a21ab78 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/pyramid.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_PYRAMID_HPP +#define OPENCV_FASTCV_PYRAMID_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Creates a gradient pyramid from an image pyramid + * + * @param pyr Input pyramid of 1-channel 8-bit images. Only continuous images are supported. + * @param dx Horizontal Sobel gradient pyramid of the same size as pyr + * @param dy Verical Sobel gradient pyramid of the same size as pyr + * @param outType Type of output data, can be CV_8S, CV_16S or CV_32F + */ +CV_EXPORTS_W void sobelPyramid(InputArrayOfArrays pyr, OutputArrayOfArrays dx, OutputArrayOfArrays dy, int outType = CV_8S); + +/** + * @brief Builds an image pyramid of float32 arising from a single + original image - that are successively downscaled w.r.t. the + pre-set levels. This API supports both ORB scaling and scale down by half. 
+ * + * @param src Input single-channel image of type 8U or 32F + * @param pyr Output array containing nLevels downscaled image copies + * @param nLevels Number of pyramid levels to produce + * @param scaleBy2 to scale images 2x down or by a factor of 1/(2)^(1/4) which is approximated as 0.8408964 (ORB downscaling), + * ORB scaling is not supported for float point images + * @param borderType how to process border, the options are BORDER_REFLECT (maps to FASTCV_BORDER_REFLECT), + * BORDER_REFLECT_101 (maps to FASTCV_BORDER_REFLECT_V2) and BORDER_REPLICATE (maps to FASTCV_BORDER_REPLICATE). + * Other border types are mapped to FASTCV_BORDER_UNDEFINED(border pixels are ignored). Currently, borders only + * supported for downscaling by half, ignored for ORB scaling. Also ignored for float point images + * @param borderValue what value should be used to fill border, ignored for float point images + */ +CV_EXPORTS_W void buildPyramid(InputArray src, OutputArrayOfArrays pyr, int nLevels, bool scaleBy2 = true, + int borderType = cv::BORDER_REFLECT, uint8_t borderValue = 0); + +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_PYRAMID_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/scale.hpp b/modules/fastcv/include/opencv2/fastcv/scale.hpp index 8d7d084ac24..276b2304050 100644 --- a/modules/fastcv/include/opencv2/fastcv/scale.hpp +++ b/modules/fastcv/include/opencv2/fastcv/scale.hpp @@ -16,6 +16,7 @@ namespace fastcv { /** * @brief Down-scale the image by averaging each 2x2 pixel block. + * This function is not bit-exact with cv::resize but provides faster execution time on Qualcomm's processor. * @param _src The first input image data, type CV_8UC1, src height must be a multiple of 2 * @param _dst The output image data, type CV_8UC1 */ @@ -23,6 +24,7 @@ CV_EXPORTS_W void resizeDownBy2(cv::InputArray _src, cv::OutputArray _dst); /** * @brief Down-scale the image by averaging each 4x4 pixel block. 
+ * This function is not bit-exact with cv::resize but provides faster execution time on Qualcomm's processor. * @param _src The first input image data, type CV_8UC1, src height must be a multiple of 4 * @param _dst The output image data, type CV_8UC1 */ diff --git a/modules/fastcv/include/opencv2/fastcv/shift.hpp b/modules/fastcv/include/opencv2/fastcv/shift.hpp index a545789f199..3ca2c22f2fc 100644 --- a/modules/fastcv/include/opencv2/fastcv/shift.hpp +++ b/modules/fastcv/include/opencv2/fastcv/shift.hpp @@ -18,9 +18,12 @@ namespace fastcv { * @brief Applies the meanshift procedure and obtains the final converged position. This function applies the meanshift procedure to an original image (usually a probability image) and obtains the final converged position. The converged position search will stop either it has reached - the required accuracy or the maximum number of iterations. + the required accuracy or the maximum number of iterations. Moments used in the algorithm are calculated + in floating point. + This function isn't bit-exact with cv::meanShift but provides improved latency on Snapdragon processors. 
- * @param src 8-bit grayscale image which is usually a probability image computed based on object histogram + * @param src 8-bit, 32-bit int or 32-bit float grayscale image which is usually a probability image + * computed based on object histogram * @param rect Initial search window position which also returns the final converged window position * @param termCrit The criteria used to finish the MeanShift which consists of two termination criteria: * 1) epsilon: required accuracy; 2) max_iter: maximum number of iterations diff --git a/modules/fastcv/include/opencv2/fastcv/smooth.hpp b/modules/fastcv/include/opencv2/fastcv/smooth.hpp index a3cee45a3ce..2127ae5a23d 100644 --- a/modules/fastcv/include/opencv2/fastcv/smooth.hpp +++ b/modules/fastcv/include/opencv2/fastcv/smooth.hpp @@ -20,6 +20,7 @@ namespace fastcv { Different from traditional bilateral filtering, here the smoothing is actually performed in gradient domain. The algorithm claims that it's more efficient than the original bilateral filtering in both image quality and computation. See algorithm description in the paper Recursive Bilateral Filtering, ECCV2012 by Prof Yang Qingxiong +This function isn't bit-exact with cv::bilateralFilter but provides improved latency on Snapdragon processors. * @param src Input image, should have one CV_8U channel * @param dst Output array having one CV_8U channel * @param sigmaColor Sigma in the color space, the bigger the value the more color difference is smoothed by the algorithm diff --git a/modules/fastcv/include/opencv2/fastcv/thresh.hpp b/modules/fastcv/include/opencv2/fastcv/thresh.hpp index 878761d75d5..418f98a012d 100644 --- a/modules/fastcv/include/opencv2/fastcv/thresh.hpp +++ b/modules/fastcv/include/opencv2/fastcv/thresh.hpp @@ -17,7 +17,7 @@ namespace fastcv { /** * @brief Binarizes a grayscale image based on a pair of threshold values. 
The binarized image will be in the two values * selected by user - + * this function provides improved latency on Snapdragon processor. * @param src 8-bit grayscale image * @param dst Output image of the same size and type as input image, can be the same as input image * @param lowThresh The lower threshold value for binarization diff --git a/modules/fastcv/include/opencv2/fastcv/tracking.hpp b/modules/fastcv/include/opencv2/fastcv/tracking.hpp new file mode 100644 index 00000000000..9cca92c1239 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/tracking.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_TRACKING_HPP +#define OPENCV_FASTCV_TRACKING_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Calculates sparse optical flow using Lucas-Kanade algorithm + * accepts 8-bit unsigned integer image + * Provides faster execution time on Qualcomm's processor + * @param src Input single-channel image of type 8U, initial motion frame + * @param dst Input single-channel image of type 8U, final motion frame, should have the same size and stride as initial frame + * @param srcPyr Pyramid built from intial motion frame + * @param dstPyr Pyramid built from final motion frame + * @param ptsIn Array of initial subpixel coordinates of starting points, should contain 32F 2D elements + * @param ptsOut Output array of calculated final points, should contain 32F 2D elements + * @param ptsEst Input array of estimations for final points, should contain 32F 2D elements, can be empty + * @param statusVec Output array of int32 values indicating status of each feature, can be empty + * @param winSize Size of window for optical flow searching. Width and height ust be odd numbers. 
Suggested values are 5, 7 or 9 + * @param termCriteria Termination criteria containing max number of iterations, max epsilon and stop condition + */ +void trackOpticalFlowLK(InputArray src, InputArray dst, + InputArrayOfArrays srcPyr, InputArrayOfArrays dstPyr, + InputArray ptsIn, OutputArray ptsOut, InputArray ptsEst, + OutputArray statusVec, cv::Size winSize = {7, 7}, + cv::TermCriteria termCriteria = {cv::TermCriteria::MAX_ITER | cv::TermCriteria::EPS, + /* maxIterations */ 7, + /* maxEpsilon */ 0.03f * 0.03f}); + +/** + * @brief Overload for v1 of the LK tracking function + * + * @param src Input single-channel image of type 8U, initial motion frame + * @param dst Input single-channel image of type 8U, final motion frame, should have the same size and stride as initial frame + * @param srcPyr Pyramid built from intial motion frame + * @param dstPyr Pyramid built from final motion frame + * @param srcDxPyr Pyramid of Sobel derivative by X of srcPyr + * @param srcDyPyr Pyramid of Sobel derivative by Y of srcPyr + * @param ptsIn Array of initial subpixel coordinates of starting points, should contain 32F 2D elements + * @param ptsOut Output array of calculated final points, should contain 32F 2D elements + * @param statusVec Output array of int32 values indicating status of each feature, can be empty + * @param winSize Size of window for optical flow searching. Width and height ust be odd numbers. Suggested values are 5, 7 or 9 + * @param maxIterations Maximum number of iterations to try + */ +void trackOpticalFlowLK(InputArray src, InputArray dst, + InputArrayOfArrays srcPyr, InputArrayOfArrays dstPyr, + InputArrayOfArrays srcDxPyr, InputArrayOfArrays srcDyPyr, + InputArray ptsIn, OutputArray ptsOut, + OutputArray statusVec, cv::Size winSize = {7, 7}, int maxIterations = 7); + +//! 
@} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_TRACKING_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/warp.hpp b/modules/fastcv/include/opencv2/fastcv/warp.hpp new file mode 100644 index 00000000000..8f58cd36577 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/warp.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_WARP_HPP +#define OPENCV_WARP_HPP + +#include +namespace cv { +namespace fastcv { + +/** + * @defgroup fastcv Module-wrapper for FastCV hardware accelerated functions +*/ + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Perspective warp two images using the same transformation. Bi-linear interpolation is used where applicable. + * For example, to warp a grayscale image and an alpha image at the same time, or warp two color channels. + * @param _src1 First input 8-bit image. Size of buffer is src1Stride*srcHeight bytes. + * @param _src2 Second input 8-bit image. Size of buffer is src2Stride*srcHeight bytes. + * @param _dst1 First warped output image (correspond to src1). Size of buffer is dst1Stride*dstHeight bytes, type CV_8UC1 + * @param _dst2 Second warped output image (correspond to src2). Size of buffer is dst2Stride*dstHeight bytes, type CV_8UC1 + * @param _M0 The 3x3 perspective transformation matrix (inversed map) + * @param dsize The output image size +*/ +CV_EXPORTS_W void warpPerspective2Plane(InputArray _src1, InputArray _src2, OutputArray _dst1, OutputArray _dst2, + InputArray _M0, Size dsize); + +//! 
@} + +} +} + +#endif \ No newline at end of file diff --git a/modules/fastcv/perf/perf_bilateral.cpp b/modules/fastcv/perf/perf_bilateral.cpp index bb985da391d..63323d459cc 100644 --- a/modules/fastcv/perf/perf_bilateral.cpp +++ b/modules/fastcv/perf/perf_bilateral.cpp @@ -7,10 +7,10 @@ namespace opencv_test { -typedef std::tuple BilateralPerfParams; -typedef perf::TestBaseWithParam BilateralPerfTest; +typedef std::tuple BilateralRecursivePerfParams; +typedef perf::TestBaseWithParam BilateralRecursivePerfTest; -PERF_TEST_P(BilateralPerfTest, run, +PERF_TEST_P(BilateralRecursivePerfTest, run, ::testing::Combine(::testing::Values(0.01f, 0.03f, 0.1f, 1.f, 5.f), ::testing::Values(0.01f, 0.05f, 0.1f, 1.f, 5.f)) ) @@ -32,14 +32,15 @@ PERF_TEST_P(BilateralPerfTest, run, SANITY_CHECK_NOTHING(); } -typedef std::tuple BilateralPerfParams2; -typedef perf::TestBaseWithParam BilateralPerfTest2; +typedef std::tuple BilateralPerfParams; +typedef perf::TestBaseWithParam BilateralPerfTest; -PERF_TEST_P(BilateralPerfTest2, run, + +PERF_TEST_P(BilateralPerfTest, run, ::testing::Combine(::testing::Values(0.01f, 0.03f, 0.1f, 1.f, 5.f), ::testing::Values(0.01f, 0.05f, 0.1f, 1.f, 5.f), - ::testing::Values(Size(8, 8), Size(640, 480), Size(800, 600)), + ::testing::Values(Size(8, 8), Size(640, 480), Size(800, 600)), ::testing::Values(5, 7, 9)) ) { @@ -47,17 +48,17 @@ PERF_TEST_P(BilateralPerfTest2, run, float sigmaColor = std::get<0>(p); float sigmaSpace = std::get<1>(p); cv::Size size = std::get<2>(p); - int d = get<3>(p); + int d = get<3>(p); RNG& rng = cv::theRNG(); Mat src(size, CV_8UC1); cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); Mat dst; - for (; next(); ) + while (next()) { startTimer(); - cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace); + cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace); stopTimer(); } diff --git a/modules/fastcv/perf/perf_blur.cpp b/modules/fastcv/perf/perf_blur.cpp new file mode 100644 index 
00000000000..bca8f80974a --- /dev/null +++ b/modules/fastcv/perf/perf_blur.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef perf::TestBaseWithParam> GaussianBlurPerfTest; + +PERF_TEST_P(GaussianBlurPerfTest, run, + ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size + ::testing::Values(CV_8U,CV_16S,CV_32S), // image depth + ::testing::Values(3, 5), // kernel size + ::testing::Values(true,false) // blur border + ) + ) +{ + cv::Size srcSize = get<0>(GetParam()); + int depth = get<1>(GetParam()); + int ksize = get<2>(GetParam()); + bool border = get<3>(GetParam()); + + // For some cases FastCV not support, so skip them + if((ksize!=5) && (depth!=CV_8U)) + throw ::perf::TestBase::PerfSkipTestException(); + + cv::Mat src(srcSize, depth); + cv::Mat dst; + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + + while (next()) + { + startTimer(); + cv::fastcv::gaussianBlur(src, dst, ksize, border); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +typedef perf::TestBaseWithParam> Filter2DPerfTest; + +PERF_TEST_P(Filter2DPerfTest, run, + ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size + ::testing::Values(CV_8U,CV_16S,CV_32F), // dst image depth + ::testing::Values(3, 5, 7, 9, 11) // kernel size + ) + ) +{ + cv::Size srcSize = get<0>(GetParam()); + int ddepth = get<1>(GetParam()); + int ksize = get<2>(GetParam()); + + cv::Mat src(srcSize, CV_8U); + cv::Mat kernel; + cv::Mat dst; + + switch (ddepth) + { + case CV_8U: + case CV_16S: + { + kernel.create(ksize,ksize,CV_8S); + break; + } + case CV_32F: + { + kernel.create(ksize,ksize,CV_32F); + break; + } + default: + break; + } + + cv::randu(src, 0, 256); + cv::randu(kernel, INT8_MIN, INT8_MAX); + RNG& rng = cv::theRNG(); + 
cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + + while (next()) + { + startTimer(); + cv::fastcv::filter2D(src, dst, ddepth, kernel); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +typedef perf::TestBaseWithParam> SepFilter2DPerfTest; + +PERF_TEST_P(SepFilter2DPerfTest, run, + ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size + ::testing::Values(CV_8U,CV_16S), // dst image depth + ::testing::Values(3, 5, 7, 9, 11, 13, 15, 17) // kernel size + ) + ) +{ + cv::Size srcSize = get<0>(GetParam()); + int ddepth = get<1>(GetParam()); + int ksize = get<2>(GetParam()); + + cv::Mat src(srcSize, ddepth); + cv::Mat kernel(1, ksize, ddepth); + cv::Mat dst; + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + cvtest::randUni(rng, kernel, Scalar::all(INT8_MIN), Scalar::all(INT8_MAX)); + + while (next()) + { + startTimer(); + cv::fastcv::sepFilter2D(src, dst, ddepth, kernel, kernel); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace \ No newline at end of file diff --git a/modules/fastcv/perf/perf_edges.cpp b/modules/fastcv/perf/perf_edges.cpp new file mode 100644 index 00000000000..74ffa552124 --- /dev/null +++ b/modules/fastcv/perf/perf_edges.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef perf::TestBaseWithParam> SobelPerfTest; + +PERF_TEST_P(SobelPerfTest, run, + ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size + ::testing::Values(3,5,7), // kernel size + ::testing::Values(BORDER_CONSTANT, BORDER_REPLICATE), // border type + ::testing::Values(0) // border value + ) + ) +{ + Size srcSize = get<0>(GetParam()); + int ksize = get<1>(GetParam()); + int border = get<2>(GetParam()); + int borderValue = get<3>(GetParam()); + + cv::Mat dx, dy, src(srcSize, CV_8U); + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + + while (next()) + { + startTimer(); + cv::fastcv::sobel(src,dx,dy,ksize,border,borderValue); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +typedef perf::TestBaseWithParam> Sobel3x3u8PerfTest; + +PERF_TEST_P(Sobel3x3u8PerfTest, run, + ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size + ::testing::Values(CV_8S, CV_16S, CV_32F), // image depth + ::testing::Values(0, 1) // normalization + ) + ) +{ + Size srcSize = get<0>(GetParam()); + int ddepth = get<1>(GetParam()); + int normalization = get<2>(GetParam()); + + cv::Mat dx, dy, src(srcSize, CV_8U); + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + + if((normalization ==0) && (ddepth == CV_8S)) + throw ::perf::TestBase::PerfSkipTestException(); + + while (next()) + { + startTimer(); + cv::fastcv::sobel3x3u8(src, dx, dy, ddepth, normalization); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} +} //namespace \ No newline at end of file diff --git a/modules/fastcv/perf/perf_fft_dct.cpp b/modules/fastcv/perf/perf_fft_dct.cpp new file mode 100644 index 00000000000..30e4e68ce62 --- /dev/null +++ b/modules/fastcv/perf/perf_fft_dct.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. 
All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef perf::TestBaseWithParam FFTExtPerfTest; + +PERF_TEST_P_(FFTExtPerfTest, forward) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + Mat dst; + + while(next()) + { + startTimer(); + cv::fastcv::FFT(src, dst); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P_(FFTExtPerfTest, inverse) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + Mat fwd, back; + cv::fastcv::FFT(src, fwd); + + while(next()) + { + startTimer(); + cv::fastcv::IFFT(fwd, back); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, FFTExtPerfTest, + ::testing::Values(Size(8, 8), Size(128, 128), Size(32, 256), Size(512, 512), + Size(32, 1), Size(512, 1))); + +/// DCT /// + +typedef perf::TestBaseWithParam DCTExtPerfTest; + +PERF_TEST_P_(DCTExtPerfTest, forward) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + Mat dst, ref; + + while(next()) + { + startTimer(); + cv::fastcv::DCT(src, dst); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P_(DCTExtPerfTest, inverse) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + Mat fwd, back; + cv::fastcv::DCT(src, fwd); + + while(next()) + { + startTimer(); + cv::fastcv::IDCT(fwd, back); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, DCTExtPerfTest, + ::testing::Values(Size(8, 8), Size(128, 128), Size(32, 256), Size(512, 512))); +} // namespace diff --git a/modules/fastcv/perf/perf_mser.cpp 
b/modules/fastcv/perf/perf_mser.cpp index 4e1a6ce80af..7232cd47cb4 100644 --- a/modules/fastcv/perf/perf_mser.cpp +++ b/modules/fastcv/perf/perf_mser.cpp @@ -30,36 +30,37 @@ PERF_TEST_P(MSERPerfTest, run, cv::Mat src = imread(cvtest::findDataFile(imgPath), cv::IMREAD_GRAYSCALE); - unsigned int delta = 2; - unsigned int minArea = 256; - unsigned int maxArea = (int)src.total()/4; + uint32_t delta = 2; + uint32_t minArea = 256; + uint32_t maxArea = (int)src.total()/4; float maxVariation = 0.15f; float minDiversity = 0.2f; + cv::Ptr mser; + mser = cv::fastcv::FCVMSER::create(src.size(), numNeighbors, delta, minArea, maxArea, + maxVariation, minDiversity); + while(next()) { std::vector> contours; std::vector bboxes; - std::vector contourData; + std::vector contourData; startTimer(); if (useBboxes) { if (useContourData) { - cv::fastcv::MSER(src, contours, bboxes, contourData, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + mser->detect(src, contours, bboxes, contourData); } else { - cv::fastcv::MSER(src, contours, bboxes, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + mser->detect(src, contours, bboxes); } } else { - cv::fastcv::MSER(src, contours, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + mser->detect(src, contours); } stopTimer(); } diff --git a/modules/fastcv/perf/perf_pyramid.cpp b/modules/fastcv/perf/perf_pyramid.cpp new file mode 100644 index 00000000000..27c0fae8d59 --- /dev/null +++ b/modules/fastcv/perf/perf_pyramid.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef std::tuple PyramidTestParams; +class PyramidTest : public ::perf::TestBaseWithParam { }; + +PERF_TEST_P(PyramidTest, checkAllVersions, // version, useFloat, nLevels + ::testing::Values( + PyramidTestParams { true, 2, true}, PyramidTestParams { true, 3, true}, PyramidTestParams { true, 4, true}, + PyramidTestParams {false, 2, true}, PyramidTestParams {false, 3, true}, PyramidTestParams {false, 4, true}, + PyramidTestParams {false, 2, false}, PyramidTestParams {false, 3, false}, PyramidTestParams {false, 4, false} + )) +{ + auto par = GetParam(); + + bool useFloat = std::get<0>(par); + int nLevels = std::get<1>(par); + bool scaleBy2 = std::get<2>(par); + + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + if (useFloat) + { + cv::Mat f; + src.convertTo(f, CV_32F); + src = f; + } + + while(next()) + { + std::vector pyr; + startTimer(); + cv::fastcv::buildPyramid(src, pyr, nLevels, scaleBy2); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + + +typedef std::tuple SobelPyramidTestParams; +class SobelPyramidTest : public ::perf::TestBaseWithParam {}; + +PERF_TEST_P(SobelPyramidTest, checkAllTypes, + ::testing::Combine(::testing::Values(CV_8S, CV_16S, CV_32F), + ::testing::Values(3, 6))) +{ + auto p = GetParam(); + int type = std::get<0>(p); + size_t nLevels = std::get<1>(p); + + // NOTE: test files should be manually loaded to folder on a device, for example like this: + // adb push fastcv/misc/bilateral_recursive/ /sdcard/testdata/fastcv/bilateral/ + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + std::vector pyr; + cv::fastcv::buildPyramid(src, pyr, nLevels); + + while(next()) + { + std::vector pyrDx, pyrDy; + startTimer(); + cv::fastcv::sobelPyramid(pyr, pyrDx, pyrDy, type); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace diff --git 
a/modules/fastcv/perf/perf_tracking.cpp b/modules/fastcv/perf/perf_tracking.cpp new file mode 100644 index 00000000000..fc5d10eccdf --- /dev/null +++ b/modules/fastcv/perf/perf_tracking.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef std::tuple TrackingTestParams; +class TrackingTest : public ::perf::TestBaseWithParam {}; + +PERF_TEST_P(TrackingTest, checkAllVersions, + ::testing::Combine(::testing::Values(5, 7, 9), // window size + ::testing::Bool(), // useSobelPyramid + ::testing::Bool() // useInitialEstimate + )) +{ + auto par = GetParam(); + + int winSz = std::get<0>(par); + bool useSobelPyramid = std::get<1>(par); + bool useInitialEstimate = std::get<2>(par); + + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + double ang = 5.0 * CV_PI / 180.0; + cv::Matx33d tr = { + cos(ang), -sin(ang), 1, + sin(ang), cos(ang), 2, + 0, 0, 1 + }; + cv::Matx33d orig { + 1, 0, -(double)src.cols / 2, + 0, 1, -(double)src.rows / 2, + 0, 0, 1 + }; + cv::Matx33d back { + 1, 0, (double)src.cols / 2, + 0, 1, (double)src.rows / 2, + 0, 0, 1 + }; + cv::Matx23d trans = (back * tr * orig).get_minor<2, 3>(0, 0); + + cv::Mat dst; + cv::warpAffine(src, dst, trans, src.size()); + + int nLevels = 4; + std::vector srcPyr, dstPyr; + + cv::buildPyramid(src, srcPyr, nLevels - 1); + cv::buildPyramid(dst, dstPyr, nLevels - 1); + + cv::Matx23f transf = trans; + int nPts = 32; + std::vector ptsIn, ptsEst, ptsExpected; + for (int i = 0; i < nPts; i++) + { + cv::Point2f p { (((float)cv::theRNG())*0.5f + 0.25f) * src.cols, + (((float)cv::theRNG())*0.5f + 0.25f) * src.rows }; + ptsIn.push_back(p); + ptsExpected.push_back(transf * cv::Vec3f(p.x, p.y, 1.0)); + ptsEst.push_back(p); + } + + cv::TermCriteria termCrit; + termCrit.type = cv::TermCriteria::COUNT | cv::TermCriteria::EPS; + 
termCrit.maxCount = 7; + termCrit.epsilon = 0.03f * 0.03f; + + std::vector srcDxPyr, srcDyPyr; + if (useSobelPyramid) + { + cv::fastcv::sobelPyramid(srcPyr, srcDxPyr, srcDyPyr, CV_8S); + } + + while(next()) + { + std::vector statusVec(nPts); + std::vector ptsOut(nPts); + startTimer(); + if (useSobelPyramid) + { + cv::fastcv::trackOpticalFlowLK(src, dst, srcPyr, dstPyr, srcDxPyr, srcDyPyr, + ptsIn, ptsOut, statusVec, {winSz, winSz}); + } + else + { + cv::fastcv::trackOpticalFlowLK(src, dst, srcPyr, dstPyr, ptsIn, ptsOut, (useInitialEstimate ? ptsEst : noArray()), + statusVec, {winSz, winSz}, termCrit); + } + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace diff --git a/modules/fastcv/perf/perf_warp.cpp b/modules/fastcv/perf/perf_warp.cpp new file mode 100644 index 00000000000..231056aef56 --- /dev/null +++ b/modules/fastcv/perf/perf_warp.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef perf::TestBaseWithParam WarpPerspective2PlanePerfTest; + +PERF_TEST_P(WarpPerspective2PlanePerfTest, run, + ::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p)) +{ + cv::Size dstSize = GetParam(); + cv::Mat img = imread(cvtest::findDataFile("cv/shared/baboon.png")); + Mat src(img.rows, img.cols, CV_8UC1); + cvtColor(img,src,cv::COLOR_BGR2GRAY); + cv::Mat dst1, dst2, mat; + mat.create(3,3,CV_32FC1); + dst1.create(dstSize,CV_8UC1); + dst2.create(dstSize,CV_8UC1); + + RNG& rng = cv::theRNG(); + Point2f s[4], d[4]; + + s[0] = Point2f(0,0); + d[0] = Point2f(0,0); + s[1] = Point2f(src.cols-1.f,0); + d[1] = Point2f(dst1.cols-1.f,0); + s[2] = Point2f(src.cols-1.f,src.rows-1.f); + d[2] = Point2f(dst1.cols-1.f,dst1.rows-1.f); + s[3] = Point2f(0,src.rows-1.f); + d[3] = Point2f(0,dst1.rows-1.f); + + float buffer[16]; + Mat tmp( 1, 16, CV_32FC1, buffer ); + rng.fill( tmp, 1, Scalar::all(0.), Scalar::all(0.1) 
); + + for(int i = 0; i < 4; i++ ) + { + s[i].x += buffer[i*4]*src.cols/2; + s[i].y += buffer[i*4+1]*src.rows/2; + d[i].x += buffer[i*4+2]*dst1.cols/2; + d[i].y += buffer[i*4+3]*dst1.rows/2; + } + + cv::getPerspectiveTransform( s, d ).convertTo( mat, mat.depth() ); + // Invert the perspective matrix + invert(mat,mat); + + while (next()) + { + startTimer(); + cv::fastcv::warpPerspective2Plane(src, src, dst1, dst2, mat, dstSize); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} //namespace \ No newline at end of file diff --git a/modules/fastcv/src/bilateralFilter.cpp b/modules/fastcv/src/bilateralFilter.cpp index 1cd0ece6b14..a0995347b24 100644 --- a/modules/fastcv/src/bilateralFilter.cpp +++ b/modules/fastcv/src/bilateralFilter.cpp @@ -12,54 +12,45 @@ class FcvFilterLoop_Invoker : public cv::ParallelLoopBody { public: - FcvFilterLoop_Invoker(cv::Mat src_, size_t src_step_, cv::Mat dst_, size_t dst_step_, int width_, int height_, int bdr_, int knl_, float32_t sigma_color_, float32_t sigma_space_) : + FcvFilterLoop_Invoker(cv::Mat src_, size_t src_step_, cv::Mat dst_, size_t dst_step_, int width_, int height_, + int bdr_, int knl_, float32_t sigma_color_, float32_t sigma_space_) : cv::ParallelLoopBody(), src_step(src_step_), dst_step(dst_step_), width(width_), height(height_), bdr(bdr_), knl(knl_), sigma_color(sigma_color_), sigma_space(sigma_space_), src(src_), dst(dst_) - { - } + { } virtual void operator()(const cv::Range& range) const CV_OVERRIDE { - - fcvStatus status = FASTCV_SUCCESS; - int height_ = range.end - range.start; + int height_ = range.end - range.start; int width_ = width; cv::Mat src_; int n = knl/2; - if(range.start == 0 && range.end == height) - { - src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); - cv::copyMakeBorder(src, src_, n, n, n, n, bdr); - } - else if(range.start == 0) - { - src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); - cv::copyMakeBorder(src(cv::Rect(0, 0, width_, height_ + n)), src_, n, 0, n, n, bdr); - } - else 
if(range.end == (height)) + src_ = cv::Mat(height_ + 2 * n, width_ + 2 * n, CV_8U); + if (range.start == 0 && range.end == height) { - src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); - cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + n)), src_, 0, n, n, n, bdr); - } - else - { - src_ = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); - cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + 2*n)), src_, 0, 0, n, n, bdr); - } - + cv::copyMakeBorder(src, src_, n, n, n, n, bdr); + } + else if (range.start == 0) + { + cv::copyMakeBorder(src(cv::Rect(0, 0, width_, height_ + n)), src_, n, 0, n, n, bdr); + } + else if (range.end == (height)) + { + cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + n)), src_, 0, n, n, n, bdr); + } + else + { + cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + 2 * n)), src_, 0, 0, n, n, bdr); + } cv::Mat dst_padded = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); - if(knl == 5) - status = fcvBilateralFilter5x5u8_v3(src_.data, width_ + 2*n, height_ + 2*n, width_ + 2*n, - dst_padded.data, width_ + 2*n, sigma_color, sigma_space, 0); - else if(knl == 7) - status = fcvBilateralFilter7x7u8_v3(src_.data, width_ + 2*n, height_ + 2*n, width_ + 2*n, - dst_padded.data, width_ + 2*n, sigma_color, sigma_space, 0); - else if(knl == 9) - status = fcvBilateralFilter9x9u8_v3(src_.data, width_ + 2*n, height_ + 2*n, width_ + 2*n, - dst_padded.data, width_ + 2*n, sigma_color, sigma_space, 0); + auto func = (knl == 5) ? fcvBilateralFilter5x5u8_v3 : + (knl == 7) ? fcvBilateralFilter7x7u8_v3 : + (knl == 9) ? 
fcvBilateralFilter9x9u8_v3 : + nullptr; + func(src_.data, width_ + 2 * n, height_ + 2 * n, width_ + 2 * n, + dst_padded.data, width_ + 2 * n, sigma_color, sigma_space, 0); cv::Mat dst_temp1 = dst_padded(cv::Rect(n, n, width_, height_)); cv::Mat dst_temp2 = dst(cv::Rect(0, range.start, width_, height_)); @@ -97,20 +88,21 @@ void bilateralFilter( InputArray _src, OutputArray _dst, int d, Size size = _src.size(); _dst.create( size, type ); Mat src = _src.getMat(); - Mat dst = _dst.getMat(); + Mat dst = _dst.getMat(); + + CV_Assert(src.data != dst.data); if( sigmaColor <= 0 ) + { sigmaColor = 1; + } if( sigmaSpace <= 0 ) + { sigmaSpace = 1; + } - int nStripes = 1; - if(src.rows/20 == 0) - nStripes = 1; - else - nStripes = (src.rows/20); - - cv::parallel_for_(cv::Range(0, src.rows), + int nStripes = (src.rows / 20 == 0) ? 1 : (src.rows / 20); + cv::parallel_for_(cv::Range(0, src.rows), FcvFilterLoop_Invoker(src, src.step, dst, dst.step, src.cols, src.rows, borderType, d, sigmaColor, sigmaSpace), nStripes); } diff --git a/modules/fastcv/src/blur.cpp b/modules/fastcv/src/blur.cpp new file mode 100644 index 00000000000..66058a37b5a --- /dev/null +++ b/modules/fastcv/src/blur.cpp @@ -0,0 +1,365 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +class FcvGaussianBlurLoop_Invoker : public ParallelLoopBody +{ + public: + + FcvGaussianBlurLoop_Invoker(const Mat& _src, Mat& _dst, int _ksize, fcvBorderType _fcvBorder, int _fcvBorderValue) : + ParallelLoopBody(), src(_src),dst(_dst), ksize(_ksize), fcvBorder(_fcvBorder), fcvBorderValue(_fcvBorderValue) + { + width = src.cols; + height = src.rows; + halfKsize = ksize / 2; + fcvFuncType = FCV_MAKETYPE(ksize, src.depth()); + } + + virtual void operator()(const Range& range) const CV_OVERRIDE + { + int topLines = 0; + int rangeHeight = range.end-range.start; + int paddedHeight = rangeHeight; + + if(range.start != 0) + { + topLines += halfKsize; + paddedHeight += halfKsize; + } + + if(range.end != height) + { + paddedHeight += halfKsize; + } + + const Mat srcPadded = src(Rect(0, range.start - topLines, width, paddedHeight)); + Mat dstPadded = Mat(paddedHeight, width, dst.depth()); + + if (fcvFuncType == FCV_MAKETYPE(3,CV_8U)) + fcvFilterGaussian3x3u8_v4(srcPadded.data, width, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step, fcvBorder, 0); + else if (fcvFuncType == FCV_MAKETYPE(5,CV_8U)) + fcvFilterGaussian5x5u8_v3(srcPadded.data, width, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step, fcvBorder, 0); + else if (fcvFuncType == FCV_MAKETYPE(5,CV_16S)) + fcvFilterGaussian5x5s16_v3((int16_t*)srcPadded.data, width, paddedHeight, srcPadded.step, (int16_t*)dstPadded.data, + dstPadded.step, fcvBorder, 0); + else if (fcvFuncType == FCV_MAKETYPE(5,CV_32S)) + fcvFilterGaussian5x5s32_v3((int32_t*)srcPadded.data, width, paddedHeight, srcPadded.step, (int32_t*)dstPadded.data, + dstPadded.step, fcvBorder, 0); + else if (fcvFuncType == FCV_MAKETYPE(11,CV_8U)) + fcvFilterGaussian11x11u8_v2(srcPadded.data, width, rangeHeight, srcPadded.step, dstPadded.data, dstPadded.step, fcvBorder); + + // Only copy center part back to output image and ignore the 
padded lines + Mat temp1 = dstPadded(Rect(0, topLines, width, rangeHeight)); + Mat temp2 = dst(Rect(0, range.start, width, rangeHeight)); + temp1.copyTo(temp2); + } + + private: + const Mat& src; + Mat& dst; + int width; + int height; + const int ksize; + int halfKsize; + int fcvFuncType; + fcvBorderType fcvBorder; + int fcvBorderValue; + + FcvGaussianBlurLoop_Invoker(const FcvGaussianBlurLoop_Invoker &); // = delete; + const FcvGaussianBlurLoop_Invoker& operator= (const FcvGaussianBlurLoop_Invoker &); // = delete; +}; + +void gaussianBlur(InputArray _src, OutputArray _dst, int kernel_size, bool blur_border) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && CV_MAT_CN(_src.type()) == 1); + + Size size = _src.size(); + int type = _src.type(); + _dst.create( size, type ); + + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + + int nThreads = getNumThreads(); + int nStripes = (nThreads > 1) ? ((src.rows > 60) ? 3 * nThreads : 1) : 1; + + fcvBorderType fcvBorder = blur_border ? FASTCV_BORDER_ZERO_PADDING : FASTCV_BORDER_UNDEFINED; + + if (((type == CV_8UC1) && ((kernel_size == 3) || (kernel_size == 5) || (kernel_size == 11))) || + ((type == CV_16SC1) && (kernel_size == 5)) || + ((type == CV_32SC1) && (kernel_size == 5))) + { + parallel_for_(Range(0, src.rows), FcvGaussianBlurLoop_Invoker(src, dst, kernel_size, fcvBorder, 0), nStripes); + } + else + CV_Error(cv::Error::StsBadArg, cv::format("Src type %d, kernel size %d is not supported", type, kernel_size)); +} + +class FcvFilter2DLoop_Invoker : public ParallelLoopBody +{ + public: + + FcvFilter2DLoop_Invoker(const Mat& _src, Mat& _dst, const Mat& _kernel) : + ParallelLoopBody(), src(_src), dst(_dst), kernel(_kernel) + { + width = src.cols; + height = src.rows; + ksize = kernel.size().width; + halfKsize = ksize/2; + } + + virtual void operator()(const Range& range) const CV_OVERRIDE + { + int topLines = 0; + int rangeHeight = range.end-range.start; + int paddedHeight = rangeHeight; + + if(range.start >= 
halfKsize) + { + topLines += halfKsize; + paddedHeight += halfKsize; + } + + if(range.end <= height-halfKsize) + { + paddedHeight += halfKsize; + } + + const Mat srcPadded = src(Rect(0, range.start - topLines, width, paddedHeight)); + Mat dstPadded = Mat(paddedHeight, width, dst.depth()); + + if (dst.depth() == CV_8U) + fcvFilterCorrNxNu8((int8_t*)kernel.data, ksize, 0, srcPadded.data, width, paddedHeight, srcPadded.step, + dstPadded.data, dstPadded.step); + else if (dst.depth() == CV_16S) + fcvFilterCorrNxNu8s16((int8_t*)kernel.data, ksize, 0, srcPadded.data, width, paddedHeight, srcPadded.step, + (int16_t*)dstPadded.data, dstPadded.step); + else if (dst.depth() == CV_32F) + fcvFilterCorrNxNu8f32((float32_t*)kernel.data, ksize, srcPadded.data, width, paddedHeight, srcPadded.step, + (float32_t*)dstPadded.data, dstPadded.step); + + // Only copy center part back to output image and ignore the padded lines + Mat temp1 = dstPadded(Rect(0, topLines, width, rangeHeight)); + Mat temp2 = dst(Rect(0, range.start, width, rangeHeight)); + temp1.copyTo(temp2); + } + + private: + const Mat& src; + Mat& dst; + const Mat& kernel; + int width; + int height; + int ksize; + int halfKsize; + + FcvFilter2DLoop_Invoker(const FcvFilter2DLoop_Invoker &); // = delete; + const FcvFilter2DLoop_Invoker& operator= (const FcvFilter2DLoop_Invoker &); // = delete; +}; + +void filter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernel) +{ + INITIALIZATION_CHECK; + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + + Mat kernel = _kernel.getMat(); + Size ksize = kernel.size(); + CV_Assert(ksize.width == ksize.height); + CV_Assert(ksize.width % 2 == 1); + + _dst.create(_src.size(), ddepth); + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + + int nThreads = getNumThreads(); + int nStripes = (nThreads > 1) ? ((src.rows > 60) ? 
3 * nThreads : 1) : 1; + + switch (ddepth) + { + case CV_8U: + case CV_16S: + { + CV_Assert(CV_MAT_DEPTH(kernel.type()) == CV_8S); + parallel_for_(Range(0, src.rows), FcvFilter2DLoop_Invoker(src, dst, kernel), nStripes); + break; + } + case CV_32F: + { + CV_Assert(CV_MAT_DEPTH(kernel.type()) == CV_32F); + parallel_for_(Range(0, src.rows), FcvFilter2DLoop_Invoker(src, dst, kernel), nStripes); + break; + } + default: + { + CV_Error(cv::Error::StsBadArg, cv::format("Kernel Size:%d, Dst type:%s is not supported", ksize.width, + depthToString(ddepth))); + break; + } + } +} + +class FcvSepFilter2DLoop_Invoker : public ParallelLoopBody +{ + public: + + FcvSepFilter2DLoop_Invoker(const Mat& _src, Mat& _dst, const Mat& _kernelX, const Mat& _kernelY) : + ParallelLoopBody(), src(_src), dst(_dst), kernelX(_kernelX), kernelY(_kernelY) + { + width = src.cols; + height = src.rows; + kernelXSize = kernelX.size().width; + kernelYSize = kernelY.size().width; + halfKsize = kernelXSize/2; + } + + virtual void operator()(const Range& range) const CV_OVERRIDE + { + int topLines = 0; + int rangeHeight = range.end-range.start; + int paddedHeight = rangeHeight; + + if(range.start >= halfKsize) + { + topLines += halfKsize; + paddedHeight += halfKsize; + } + + if(range.end <= height-halfKsize) + { + paddedHeight += halfKsize; + } + + const Mat srcPadded = src(Rect(0, range.start - topLines, width, paddedHeight)); + Mat dstPadded = Mat(paddedHeight, width, dst.depth()); + + switch (dst.depth()) + { + case CV_8U: + { + fcvFilterCorrSepMxNu8((int8_t*)kernelX.data, kernelXSize, (int8_t*)kernelY.data, kernelYSize, 0, srcPadded.data, + width, paddedHeight, srcPadded.step, dstPadded.data, dstPadded.step); + break; + } + case CV_16S: + { + std::vector tmpImage(width * (paddedHeight + kernelXSize - 1)); + switch (kernelXSize) + { + case 9: + { + fcvFilterCorrSep9x9s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight, + srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, 
dstPadded.step); + break; + } + case 11: + { + fcvFilterCorrSep11x11s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight, + srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step); + break; + } + case 13: + { + fcvFilterCorrSep13x13s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight, + srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step); + break; + } + case 15: + { + fcvFilterCorrSep15x15s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight, + srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step); + break; + } + case 17: + { + fcvFilterCorrSep17x17s16_v2((int16_t*)kernelX.data, (int16_t*)srcPadded.data, width, paddedHeight, + srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step); + break; + } + + default: + { + fcvFilterCorrSepNxNs16((int16_t*)kernelX.data, kernelXSize, (int16_t*)srcPadded.data, width, paddedHeight, + srcPadded.step, tmpImage.data(), (int16_t*)dstPadded.data, dstPadded.step); + break; + } + } + break; + } + default: + { + CV_Error(cv::Error::StsBadArg, cv::format("Dst type:%s is not supported", depthToString(dst.depth()))); + break; + } + } + + // Only copy center part back to output image and ignore the padded lines + Mat temp1 = dstPadded(Rect(0, topLines, width, rangeHeight)); + Mat temp2 = dst(Rect(0, range.start, width, rangeHeight)); + temp1.copyTo(temp2); + } + + private: + const Mat& src; + Mat& dst; + int width; + int height; + const Mat& kernelX; + const Mat& kernelY; + int kernelXSize; + int kernelYSize; + int halfKsize; + + FcvSepFilter2DLoop_Invoker(const FcvSepFilter2DLoop_Invoker &); // = delete; + const FcvSepFilter2DLoop_Invoker& operator= (const FcvSepFilter2DLoop_Invoker &); // = delete; +}; + +void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY) +{ + INITIALIZATION_CHECK; + CV_Assert(!_src.empty() && (_src.type() == CV_8UC1 
|| _src.type() == CV_16SC1)); + _dst.create(_src.size(), ddepth); + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + Mat kernelX = _kernelX.getMat(); + Mat kernelY = _kernelY.getMat(); + + int nThreads = getNumThreads(); + int nStripes = (nThreads > 1) ? ((src.rows > 60) ? 3 * nThreads : 1) : 1; + + switch (ddepth) + { + case CV_8U: + { + cv::parallel_for_(cv::Range(0, src.rows), FcvSepFilter2DLoop_Invoker(src, dst, kernelX, kernelY), nStripes); + break; + } + case CV_16S: + { + CV_Assert(CV_MAT_DEPTH(src.type()) == CV_16S); + CV_Assert(kernelX.size() == kernelY.size()); + // kernalX and kernelY shhould be same. + Mat diff; + absdiff(kernelX, kernelY, diff); + CV_Assert(countNonZero(diff) == 0); + + cv::parallel_for_(cv::Range(0, src.rows), FcvSepFilter2DLoop_Invoker(src, dst, kernelX, kernelY), nStripes); + break; + } + default: + { + CV_Error(cv::Error::StsBadArg, cv::format("Dst type:%s is not supported", depthToString(ddepth))); + break; + } + } +} + +} // fastcv:: +} // cv:: \ No newline at end of file diff --git a/modules/fastcv/src/edges.cpp b/modules/fastcv/src/edges.cpp new file mode 100644 index 00000000000..ad90b9e71ee --- /dev/null +++ b/modules/fastcv/src/edges.cpp @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void sobel3x3u8(cv::InputArray _src, cv::OutputArray _dst, cv::OutputArray _dsty, int ddepth, bool normalization) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + + Size size = _src.size(); + _dst.create(size, ddepth); + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + if (_dsty.needed()) + { + _dsty.create(size, ddepth); + Mat dsty = _dsty.getMat(); + + switch(ddepth) + { + case CV_8S: + if (normalization) + fcvImageGradientSobelPlanars8_v2(src.data, src.cols, src.rows, src.step, (int8_t*)dst.data, + (int8_t*)dsty.data, dst.step); + else + CV_Error(cv::Error::StsBadArg, + cv::format("Depth: %d should do normalization, make sure the normalization parameter is true", ddepth)); + break; + case CV_16S: + if (normalization) + fcvImageGradientSobelPlanars16_v2(src.data, src.cols, src.rows, src.step, (int16_t*)dst.data, + (int16_t*)dsty.data, dst.step); + else + fcvImageGradientSobelPlanars16_v3(src.data, src.cols, src.rows, src.step, (int16_t*)dst.data, + (int16_t*)dsty.data, dst.step); + break; + case CV_32F: + if (normalization) + fcvImageGradientSobelPlanarf32_v2(src.data, src.cols, src.rows, src.step, (float32_t*)dst.data, + (float32_t*)dsty.data, dst.step); + else + fcvImageGradientSobelPlanarf32_v3(src.data, src.cols, src.rows, src.step, (float32_t*)dst.data, + (float32_t*)dsty.data, dst.step); + break; + default: + CV_Error(cv::Error::StsBadArg, cv::format("depth: %d is not supported", ddepth)); + break; + } + } + else + { + fcvFilterSobel3x3u8_v2(src.data, src.cols, src.rows, src.step, dst.data, dst.step); + } +} + +void sobel(cv::InputArray _src, cv::OutputArray _dx, cv::OutputArray _dy, int kernel_size, int borderType, int borderValue) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + Size size = _src.size(); + _dx.create( size, CV_16SC1); + _dy.create( size, CV_16SC1); + + 
Mat src = _src.getMat(); + Mat dx = _dx.getMat(); + Mat dy = _dy.getMat(); + fcvStatus status = FASTCV_SUCCESS; + + fcvBorderType fcvBorder; + + switch (borderType) + { + case cv::BorderTypes::BORDER_CONSTANT: + { + fcvBorder = fcvBorderType::FASTCV_BORDER_CONSTANT; + break; + } + case cv::BorderTypes::BORDER_REPLICATE: + { + fcvBorder = fcvBorderType::FASTCV_BORDER_REPLICATE; + break; + } + default: + { + CV_Error(cv::Error::StsBadArg, cv::format("Border type: %d is not supported", borderType)); + break; + } + } + + switch (kernel_size) + { + case 3: + status = fcvFilterSobel3x3u8s16(src.data, src.cols, src.rows, src.step, (int16_t*)dx.data, (int16_t*)dy.data, + dx.step, fcvBorder, borderValue); + break; + case 5: + status = fcvFilterSobel5x5u8s16(src.data, src.cols, src.rows, src.step, (int16_t*)dx.data, (int16_t*)dy.data, + dx.step, fcvBorder, borderValue); + break; + case 7: + status = fcvFilterSobel7x7u8s16(src.data, src.cols, src.rows, src.step, (int16_t*)dx.data, (int16_t*)dy.data, + dx.step, fcvBorder, borderValue); + break; + default: + CV_Error(cv::Error::StsBadArg, cv::format("Kernel size %d is not supported", kernel_size)); + break; + } + + if (status != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; + CV_Error( cv::Error::StsInternal, "FastCV error: " + s); + } +} + +} // fastcv:: +} // cv:: \ No newline at end of file diff --git a/modules/fastcv/src/ipptransform.cpp b/modules/fastcv/src/ipptransform.cpp new file mode 100644 index 00000000000..d5bfb259074 --- /dev/null +++ b/modules/fastcv/src/ipptransform.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void DCT(InputArray _src, OutputArray _dst) +{ + INITIALIZATION_CHECK; + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + CV_Assert(_src.cols() % 8 == 0); + CV_Assert(_src.step() % 8 == 0); + + Mat src = _src.getMat(); + + _dst.create(_src.rows(), _src.cols(), CV_16SC1); + // in case of fixed layout array we cannot fix this on our side, can only fail if false + CV_Assert(_dst.step() % 8 == 0); + + Mat dst = _dst.getMat(); + + fcvDCTu8(src.data, src.cols, src.rows, src.step, (short*)dst.data, dst.step); +} + +void IDCT(InputArray _src, OutputArray _dst) +{ + INITIALIZATION_CHECK; + CV_Assert(!_src.empty() && _src.type() == CV_16SC1); + CV_Assert(_src.cols() % 8 == 0); + CV_Assert(_src.step() % 8 == 0); + + Mat src = _src.getMat(); + + _dst.create(_src.rows(), _src.cols(), CV_8UC1); + // in case of fixed layout array we cannot fix this on our side, can only fail if false + CV_Assert(_dst.step() % 8 == 0); + + Mat dst = _dst.getMat(); + + fcvIDCTs16((const short*)src.data, src.cols, src.rows, src.step, dst.data, dst.step); +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/mser.cpp b/modules/fastcv/src/mser.cpp index ae8519313be..6919099a482 100644 --- a/modules/fastcv/src/mser.cpp +++ b/modules/fastcv/src/mser.cpp @@ -8,56 +8,109 @@ namespace cv { namespace fastcv { -static void runMSER(InputArray _src, std::vector>& contours, std::vector& boundingBoxes, - std::vector& contourData, - bool useBoundingBoxes = true, - bool useContourData = true, - unsigned int numNeighbors = 4, - unsigned int delta = 2, - unsigned int minArea = 30, - unsigned int maxArea = 14400, - float maxVariation = 0.15f, - float minDiversity = 0.2f) +class MSER_Impl CV_FINAL : public cv::fastcv::FCVMSER { - INITIALIZATION_CHECK; +public: + explicit MSER_Impl(cv::Size imgSize, + uint32_t numNeighbors, + uint32_t delta, + uint32_t minArea, + uint32_t maxArea, + float 
maxVariation, + float minDiversity); - CV_Assert(!_src.empty() && _src.type() == CV_8UC1); - CV_Assert(_src.cols() > 50); - CV_Assert(_src.rows() > 5); + ~MSER_Impl() CV_OVERRIDE; - Mat src = _src.getMat(); + cv::Size getImgSize() CV_OVERRIDE { return imgSize; }; + uint32_t getNumNeighbors() CV_OVERRIDE { return numNeighbors; }; + uint32_t getDelta() CV_OVERRIDE { return delta; }; + uint32_t getMinArea() CV_OVERRIDE { return minArea; }; + uint32_t getMaxArea() CV_OVERRIDE { return maxArea; }; + float getMaxVariation() CV_OVERRIDE { return maxVariation; }; + float getMinDiversity() CV_OVERRIDE { return minDiversity; }; - CV_Assert(numNeighbors == 4 || numNeighbors == 8); - bool useNN4 = (numNeighbors == 4); + void detect(InputArray src, std::vector>& contours) CV_OVERRIDE; + void detect(InputArray src, std::vector>& contours, std::vector& boundingBoxes) CV_OVERRIDE; + void detect(InputArray src, std::vector>& contours, std::vector& boundingBoxes, + std::vector& contourData) CV_OVERRIDE; - bool usePointsArray = !useNN4; + void detectRegions(InputArray src, + std::vector>& contours, + std::vector& boundingBoxes, + std::vector& contourData, + bool useBoundingBoxes = true, + bool useContourData = true); + + cv::Size imgSize; + uint32_t numNeighbors; + uint32_t delta; + uint32_t minArea; + uint32_t maxArea; + float maxVariation; + float minDiversity; void *mserHandle; +}; - bool isInitOk = false; - if (useNN4) - { - isInitOk = fcvMserInit(src.cols, src.rows, delta, minArea, maxArea, maxVariation, minDiversity, &mserHandle); - } - else - { - isInitOk = fcvMserNN8Init(src.cols, src.rows, delta, minArea, maxArea, maxVariation, minDiversity, &mserHandle); - } - if (!isInitOk) +MSER_Impl::MSER_Impl(cv::Size _imgSize, + uint32_t _numNeighbors, + uint32_t _delta, + uint32_t _minArea, + uint32_t _maxArea, + float _maxVariation, + float _minDiversity) +{ + CV_Assert(_imgSize.width > 50); + CV_Assert(_imgSize.height > 5); + + CV_Assert(_numNeighbors == 4 || _numNeighbors == 8); + 
+ INITIALIZATION_CHECK; + + this->imgSize = _imgSize; + this->numNeighbors = _numNeighbors; + this->delta = _delta; + this->minArea = _minArea; + this->maxArea = _maxArea; + this->maxVariation = _maxVariation; + this->minDiversity = _minDiversity; + + auto initFunc = (this->numNeighbors == 4) ? fcvMserInit : fcvMserNN8Init; + + if (!initFunc(this->imgSize.width, this->imgSize.height, this->delta, this->minArea, this->maxArea, + this->maxVariation, this->minDiversity, &this->mserHandle)) { CV_Error(cv::Error::StsInternal, "Failed to initialize MSER"); } +} + + +MSER_Impl::~MSER_Impl() +{ + fcvMserRelease(mserHandle); +} + + +void MSER_Impl::detectRegions(InputArray _src, std::vector>& contours, std::vector& boundingBoxes, + std::vector& contourData, bool useBoundingBoxes, bool useContourData) +{ + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + CV_Assert(_src.size() == this->imgSize); + + Mat src = _src.getMat(); + + bool usePointsArray = (this->numNeighbors == 8); //bufSize for pts and bboxes - const unsigned int maxContours = 16384; - unsigned int numContours; + const uint32_t maxContours = 16384; + uint32_t numContours; std::vector numPointsInContour(maxContours); std::vector rectArray; rectArray.resize(4 * maxContours); // xMin, xMax, yMax, yMin - unsigned int pointsArraySize = src.total() * 30; // Recommended typical size + uint32_t pointsArraySize = src.total() * 30; // Recommended typical size std::vector pointsArray; std::vector contourStartingPoints; uint32_t pathArraySize = src.total() * 4; // Recommended size @@ -76,7 +129,7 @@ static void runMSER(InputArray _src, std::vector>& contours, std::vector contourPolarity(maxContours); int mserRetcode = -1; - if (useNN4) + if (this->numNeighbors == 4) { mserRetcode = fcvMserExtu8_v3(mserHandle, src.data, src.cols, src.rows, src.step, maxContours, &numContours, @@ -170,33 +223,37 @@ static void runMSER(InputArray _src, std::vector>& contours, contourData.push_back(data); } } - - fcvMserRelease(mserHandle); 
} -void MSER(InputArray _src, std::vector> &contours, - unsigned int numNeighbors, unsigned int delta, unsigned int minArea, unsigned int maxArea, float maxVariation, float minDiversity) +void MSER_Impl::detect(InputArray src, std::vector> &contours) { std::vector boundingBoxes; std::vector contourData; - runMSER(_src, contours, boundingBoxes, contourData, false, false, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + this->detectRegions(src, contours, boundingBoxes, contourData, /*useBoundingBoxes*/ false, /*useContourData*/ false); } -void MSER(InputArray _src, std::vector>& contours, std::vector& boundingBoxes, - unsigned int numNeighbors, unsigned int delta, unsigned int minArea, unsigned int maxArea, float maxVariation, float minDiversity) +void MSER_Impl::detect(InputArray src, std::vector>& contours, std::vector& boundingBoxes) { std::vector contourData; - runMSER(_src, contours, boundingBoxes, contourData, true, false, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + this->detectRegions(src, contours, boundingBoxes, contourData, /*useBoundingBoxes*/ true, /*useContourData*/ false); +} + +void MSER_Impl::detect(InputArray src, std::vector>& contours, std::vector& boundingBoxes, + std::vector& contourData) +{ + this->detectRegions(src, contours, boundingBoxes, contourData, /*useBoundingBoxes*/ true, /*useContourData*/ true); } -void MSER(InputArray _src, std::vector>& contours, std::vector& boundingBoxes, std::vector& contourData, - unsigned int numNeighbors, unsigned int delta, unsigned int minArea, unsigned int maxArea, float maxVariation, float minDiversity) +Ptr FCVMSER::create(cv::Size imgSize, + uint32_t numNeighbors, + uint32_t delta, + uint32_t minArea, + uint32_t maxArea, + float maxVariation, + float minDiversity) { - runMSER(_src, contours, boundingBoxes, contourData, true, true, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + return makePtr(imgSize, numNeighbors, delta, minArea, 
maxArea, maxVariation, minDiversity); } } // fastcv:: -} // cv:: +} // cv:: \ No newline at end of file diff --git a/modules/fastcv/src/precomp.hpp b/modules/fastcv/src/precomp.hpp index d33cb25bafb..c2929d76cc1 100644 --- a/modules/fastcv/src/precomp.hpp +++ b/modules/fastcv/src/precomp.hpp @@ -28,6 +28,9 @@ namespace fastcv { CV_INSTRUMENT_REGION(); \ } +#define FCV_KernelSize_SHIFT 3 +#define FCV_MAKETYPE(ksize,depth) ((ksize< fcvStatusStrings = { { FASTCV_SUCCESS, "Success"}, diff --git a/modules/fastcv/src/pyramid.cpp b/modules/fastcv/src/pyramid.cpp new file mode 100644 index 00000000000..806c8e9970f --- /dev/null +++ b/modules/fastcv/src/pyramid.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +void sobelPyramid(InputArrayOfArrays _pyr, OutputArrayOfArrays _dx, OutputArrayOfArrays _dy, int outType) +{ + INITIALIZATION_CHECK; + + CV_Assert(_pyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _pyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _pyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + CV_Assert(_dx.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _dx.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _dx.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + CV_Assert(_dy.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _dy.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _dy.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + + std::vector pyr; + _pyr.getMatVector(pyr); + size_t nLevels = pyr.size(); + + CV_Assert(!pyr.empty()); + + // this should be smaller I guess + CV_Assert(nLevels > 0 && nLevels < 16); + + for (size_t i = 0; i < nLevels; i++) + { + // fcvPyramidLeved does not support other cases + CV_Assert(pyr[i].isContinuous()); + CV_Assert(pyr[i].type() == CV_8UC1); + } + + CV_Assert(outType == CV_8S || outType == CV_16S || outType == CV_32F); + + 
std::vector lpyr; + for (size_t i = 0; i < nLevels; i++) + { + fcvPyramidLevel lev; + lev.width = pyr[i].cols; + lev.height = pyr[i].rows; + lev.ptr = pyr[i].data; + lpyr.push_back(lev); + } + + std::vector ldx(nLevels), ldy(nLevels); + int pyrElemSz = (outType == CV_8S ) ? 1 : + (outType == CV_16S) ? 2 : + (outType == CV_32F) ? 4 : 0; + int retCodex = fcvPyramidAllocate(ldx.data(), pyr[0].cols, pyr[0].rows, pyrElemSz, nLevels, 1); + if (retCodex != 0) + { + CV_Error(cv::Error::StsInternal, cv::format("fcvPyramidAllocate returned code %d", retCodex)); + } + int retCodey = fcvPyramidAllocate(ldy.data(), pyr[0].cols, pyr[0].rows, pyrElemSz, nLevels, 1); + if (retCodey != 0) + { + CV_Error(cv::Error::StsInternal, cv::format("fcvPyramidAllocate returned code %d", retCodey)); + } + + int returnCode = -1; + switch (outType) + { + case CV_8S: returnCode = fcvPyramidSobelGradientCreatei8 (lpyr.data(), ldx.data(), ldy.data(), nLevels); + break; + case CV_16S: returnCode = fcvPyramidSobelGradientCreatei16(lpyr.data(), ldx.data(), ldy.data(), nLevels); + break; + case CV_32F: returnCode = fcvPyramidSobelGradientCreatef32(lpyr.data(), ldx.data(), ldy.data(), nLevels); + break; + default: + break; + } + + if (returnCode != 0) + { + CV_Error(cv::Error::StsInternal, cv::format("FastCV returned code %d", returnCode)); + } + + // resize arrays of Mats + _dx.create(1, nLevels, /* type does not matter here */ -1, -1); + _dy.create(1, nLevels, /* type does not matter here */ -1, -1); + + for (size_t i = 0; i < nLevels; i++) + { + cv::Mat dx((int)ldx[i].height, (int)ldx[i].width, outType, (uchar*)ldx[i].ptr); + _dx.create(pyr[i].size(), outType, i); + dx.copyTo(_dx.getMat(i)); + + cv::Mat dy((int)ldy[i].height, (int)ldy[i].width, outType, (uchar*)ldy[i].ptr); + _dy.create(pyr[i].size(), outType, i); + dy.copyTo(_dy.getMat(i)); + } + + fcvPyramidDelete(ldx.data(), nLevels, 0); + fcvPyramidDelete(ldy.data(), nLevels, 0); +} + + +void buildPyramid(InputArray _src, OutputArrayOfArrays 
_pyr, int nLevels, bool scaleBy2, int borderType, uint8_t borderValue) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty() && (_src.type() == CV_8UC1 || _src.type() == CV_32FC1)); + CV_Assert(_src.step() % 8 == 0); + + cv::Mat src = _src.getMat(); + bool useFloat = src.depth() == CV_32F; + int bytesPerPixel = useFloat ? 4 : 1; + + CV_Assert(_pyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _pyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _pyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + + // this should be smaller I guess + CV_Assert(nLevels > 0 && nLevels < 16); + + if (useFloat && !scaleBy2) + { + CV_Error( cv::Error::StsBadArg, "ORB scale is not supported for float images (fcvPyramidCreatef32_v2)"); + } + + fcvPyramidScale scaleOption = scaleBy2 ? FASTCV_PYRAMID_SCALE_HALF : FASTCV_PYRAMID_SCALE_ORB; + fcvBorderType borderOption; + switch (borderType) + { + case cv::BORDER_REFLECT: borderOption = FASTCV_BORDER_REFLECT; break; + case cv::BORDER_REFLECT_101: borderOption = FASTCV_BORDER_REFLECT_V2; break; + case cv::BORDER_REPLICATE: borderOption = FASTCV_BORDER_REPLICATE; break; + default: borderOption = FASTCV_BORDER_UNDEFINED; break; + } + + std::vector lpyrSrc2(nLevels); + + int alignment = 8; + if (useFloat) + { + // use version 2 + CV_Assert(fcvPyramidAllocate_v2(lpyrSrc2.data(), src.cols, src.rows, src.step, bytesPerPixel, nLevels, 0) == 0); + CV_Assert(fcvPyramidCreatef32_v2((const float*)src.data, src.cols, src.rows, src.step, nLevels, lpyrSrc2.data()) == 0); + } + else + { + // use version 4 + fcvStatus statusAlloc = fcvPyramidAllocate_v3(lpyrSrc2.data(), src.cols, src.rows, src.step, + bytesPerPixel, alignment, nLevels, scaleOption, 0); + if (statusAlloc != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(statusAlloc) ? 
fcvStatusStrings.at(statusAlloc) : "unknown"; + CV_Error( cv::Error::StsInternal, "fcvPyramidAllocate_v3 error: " + s); + } + + fcvStatus statusPyr = fcvPyramidCreateu8_v4(src.data, src.cols, src.rows, src.step, nLevels, scaleOption, + lpyrSrc2.data(), borderOption, borderValue); + if (statusPyr != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(statusPyr) ? fcvStatusStrings.at(statusPyr) : "unknown"; + CV_Error( cv::Error::StsInternal, "fcvPyramidCreateu8_v4 error: " + s); + } + } + + // create vector + _pyr.create(nLevels, 1, src.type(), -1); + for (int i = 0; i < nLevels; i++) + { + cv::Mat m = cv::Mat((uint32_t)lpyrSrc2[i].height, (uint32_t)lpyrSrc2[i].width, + src.type(), (void*)lpyrSrc2[i].ptr, (size_t)lpyrSrc2[i].stride); + + _pyr.create(m.size(), m.type(), i); + m.copyTo(_pyr.getMat(i)); + } + + fcvPyramidDelete_v2(lpyrSrc2.data(), nLevels, 1); +} + +} // namespace fastcv +} // namespace cv diff --git a/modules/fastcv/src/remap.cpp b/modules/fastcv/src/remap.cpp index a0b4849ac72..933bfdc4273 100644 --- a/modules/fastcv/src/remap.cpp +++ b/modules/fastcv/src/remap.cpp @@ -10,8 +10,8 @@ namespace fastcv { class RemapParallel : public cv::ParallelLoopBody { public: - RemapParallel(int src_type, const uint8_t* src, unsigned int srcWidth, unsigned int srcHeight, unsigned int srcStride, uint8_t* dst, - unsigned int dstWidth, unsigned int dstHeight, unsigned int dstStride, const float32_t* __restrict mapX, + RemapParallel(int src_type, const uint8_t* src, uint32_t srcWidth, uint32_t srcHeight, uint32_t srcStride, uint8_t* dst, + uint32_t dstWidth, uint32_t dstHeight, uint32_t dstStride, const float32_t* __restrict mapX, const float32_t* __restrict mapY, uint32_t mapStride, fcvInterpolationType interpolation, uint8_t borderValue) : src_type_(src_type), src_(src), srcWidth_(srcWidth), srcHeight_(srcHeight), srcStride_(srcStride), dst_(dst), dstWidth_(dstWidth), dstHeight_(dstHeight), dstStride_(dstStride), mapX_(mapX), mapY_(mapY), 
mapStride_(mapStride), @@ -43,7 +43,7 @@ class RemapParallel : public cv::ParallelLoopBody { if(status!=FASTCV_SUCCESS) { - std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; + std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; CV_Error( cv::Error::StsInternal, "FastCV error: " + s); } } @@ -51,16 +51,16 @@ class RemapParallel : public cv::ParallelLoopBody { private: int src_type_; const uint8_t* src_; - unsigned int srcWidth_; - unsigned int srcHeight_; - unsigned int srcStride_; + uint32_t srcWidth_; + uint32_t srcHeight_; + uint32_t srcStride_; uint8_t* dst_; - unsigned int dstWidth_; - unsigned int dstHeight_; - unsigned int dstStride_; + uint32_t dstWidth_; + uint32_t dstHeight_; + uint32_t dstStride_; const float32_t* __restrict mapX_; const float32_t* __restrict mapY_; - unsigned int mapStride_; + uint32_t mapStride_; fcvInterpolationType fcvInterpolation_; uint8_t borderValue_; }; diff --git a/modules/fastcv/src/tracking.cpp b/modules/fastcv/src/tracking.cpp new file mode 100644 index 00000000000..778c73c323e --- /dev/null +++ b/modules/fastcv/src/tracking.cpp @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +static void trackOpticalFlowLKInternal(InputArray _src, InputArray _dst, + InputArrayOfArrays _srcPyr, InputArrayOfArrays _dstPyr, + InputArrayOfArrays _srcDxPyr, InputArrayOfArrays _srcDyPyr, + InputArray _ptsIn, OutputArray _ptsOut, InputArray _ptsEst, + OutputArray _statusVec, cv::Size winSize, + cv::TermCriteria termCriteria) +{ + INITIALIZATION_CHECK; + + CV_Assert(winSize.width % 2 == 1 && winSize.height % 2 == 1); + + CV_Assert(!_src.empty() && _src.type() == CV_8UC1); + CV_Assert(!_dst.empty() && _dst.type() == CV_8UC1); + CV_Assert(_src.size() == _dst.size()); + CV_Assert(_src.step() % 8 == 0); + CV_Assert(_dst.step() == _src.step()); + + cv::Mat src = _src.getMat(), dst = _dst.getMat(); + + CV_Assert(_srcPyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _srcPyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _srcPyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + CV_Assert(_dstPyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _dstPyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _dstPyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + CV_Assert(_srcPyr.size() == _dstPyr.size()); + + int nLevels = _srcPyr.size().area(); + + std::vector srcPyr, dstPyr; + _srcPyr.getMatVector(srcPyr); + _dstPyr.getMatVector(dstPyr); + + cv::Size imSz = src.size(); + for (int i = 0; i < nLevels; i++) + { + const cv::Mat& s = srcPyr[i]; + const cv::Mat& d = dstPyr[i]; + + CV_Assert(!s.empty() && s.type() == CV_8UC1); + CV_Assert(!d.empty() && d.type() == CV_8UC1); + CV_Assert(s.size() == imSz); + CV_Assert(d.size() == imSz); + + imSz.width /= 2; imSz.height /= 2; + } + + bool useDxDy = !_srcDxPyr.empty() && !_srcDyPyr.empty(); + int version = useDxDy ? 
1 : 3; + + std::vector srcDxPyr, srcDyPyr; + if (version == 1) + { + CV_Assert(_srcDxPyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _srcDxPyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _srcDxPyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + CV_Assert(_srcDyPyr.kind() == _InputArray::KindFlag::STD_ARRAY_MAT || + _srcDyPyr.kind() == _InputArray::KindFlag::STD_VECTOR_MAT || + _srcDyPyr.kind() == _InputArray::KindFlag::STD_VECTOR_UMAT); + + CV_Assert(_srcDxPyr.size() == _srcDyPyr.size()); + _srcDxPyr.getMatVector(srcDxPyr); + _srcDyPyr.getMatVector(srcDyPyr); + + imSz = src.size(); + for (int i = 0; i < nLevels; i++) + { + const cv::Mat& dx = srcDxPyr[i]; + const cv::Mat& dy = srcDyPyr[i]; + + CV_Assert(!dx.empty() && dx.type() == CV_8SC1); + CV_Assert(!dy.empty() && dy.type() == CV_8SC1); + CV_Assert(dx.size() == imSz); + CV_Assert(dy.size() == imSz); + + imSz.width /= 2; imSz.height /= 2; + } + } + + std::vector lpyrSrc1, lpyrDst1, lpyrDxSrc, lpyrDySrc; + std::vector lpyrSrc2, lpyrDst2; + for (int i = 0; i < nLevels; i++) + { + fcvPyramidLevel lsrc1, ldst1; + fcvPyramidLevel_v2 lsrc2, ldst2; + lsrc1.width = srcPyr[i].cols; + lsrc1.height = srcPyr[i].rows; + lsrc1.ptr = srcPyr[i].data; + + lsrc2.width = srcPyr[i].cols; + lsrc2.height = srcPyr[i].rows; + lsrc2.stride = srcPyr[i].step; + lsrc2.ptr = srcPyr[i].data; + + ldst1.width = dstPyr[i].cols; + ldst1.height = dstPyr[i].rows; + ldst1.ptr = dstPyr[i].data; + ldst2.width = dstPyr[i].cols; + ldst2.height = dstPyr[i].rows; + ldst2.stride = dstPyr[i].step; + ldst2.ptr = dstPyr[i].data; + lpyrSrc1.push_back(lsrc1); lpyrDst1.push_back(ldst1); + lpyrSrc2.push_back(lsrc2); lpyrDst2.push_back(ldst2); + + if (version == 1) + { + fcvPyramidLevel ldx, ldy; + CV_Assert(srcDxPyr[i].isContinuous()); + ldx.width = srcDxPyr[i].cols; + ldx.height = srcDxPyr[i].rows; + ldx.ptr = srcDxPyr[i].data; + CV_Assert(srcDyPyr[i].isContinuous()); + ldy.width = srcDyPyr[i].cols; + ldy.height = srcDyPyr[i].rows; + 
ldy.ptr = srcDyPyr[i].data; + lpyrDxSrc.push_back(ldx); lpyrDySrc.push_back(ldy); + } + } + + CV_Assert(!_ptsIn.empty() && (_ptsIn.type() == CV_32FC1 || _ptsIn.type() == CV_32FC2)); + CV_Assert(_ptsIn.isContinuous()); + CV_Assert(_ptsIn.total() * _ptsIn.channels() % 2 == 0); + + cv::Mat ptsIn = _ptsIn.getMat(); + int nPts = ptsIn.total() * ptsIn.channels() / 2; + + bool useInitialEstimate; + cv::Mat ptsEst; + const float32_t* ptsEstData; + if (!_ptsEst.empty()) + { + CV_Assert(_ptsEst.type() == CV_32FC1 || _ptsEst.type() == CV_32FC2); + CV_Assert(_ptsEst.isContinuous()); + int estElems = _ptsEst.total() * _ptsEst.channels(); + CV_Assert(estElems % 2 == 0); + CV_Assert(estElems / 2 == nPts); + + ptsEst = _ptsEst.getMat(); + ptsEstData = (const float32_t*)ptsEst.data; + useInitialEstimate = true; + } + else + { + useInitialEstimate = false; + ptsEstData = (const float32_t*)ptsIn.data; + } + + CV_Assert(_ptsOut.needed()); + _ptsOut.create(1, nPts, CV_32FC2); + cv::Mat ptsOut = _ptsOut.getMat(); + + cv::Mat statusVec; + if (!_statusVec.empty()) + { + _statusVec.create(1, nPts, CV_32SC1); + statusVec = _statusVec.getMat(); + } + else + { + statusVec = cv::Mat(1, nPts, CV_32SC1); + } + + fcvTerminationCriteria termCrit; + if (termCriteria.type & cv::TermCriteria::COUNT) + { + if (termCriteria.type & cv::TermCriteria::EPS) + { + termCrit = FASTCV_TERM_CRITERIA_BOTH; + } + else + { + termCrit = FASTCV_TERM_CRITERIA_ITERATIONS; + } + } + else + { + if (termCriteria.type & cv::TermCriteria::EPS) + { + termCrit = FASTCV_TERM_CRITERIA_EPSILON; + } + else + { + CV_Error(cv::Error::StsBadArg, "Incorrect termination criteria"); + } + } + int maxIterations = termCriteria.maxCount; + double maxEpsilon = termCriteria.epsilon; + + fcvStatus status = FASTCV_SUCCESS; + + if (version == 3) + { + status = fcvTrackLKOpticalFlowu8_v3(src.data, dst.data, src.cols, src.rows, src.step, + lpyrSrc2.data(), lpyrDst2.data(), + (const float32_t*)ptsIn.data, + ptsEstData, + (float32_t*)ptsOut.data, 
+ (int32_t*)statusVec.data, + nPts, + winSize.width, winSize.height, + nLevels, + termCrit, maxIterations, maxEpsilon, + useInitialEstimate); + } + else // if (version == 1) + { + CV_Assert(src.isContinuous() && dst.isContinuous()); + // Obsolete parameters, set to 0 + float maxResidue = 0, minDisplacement = 0, minEigenvalue = 0; + int lightingNormalized = 0; + fcvTrackLKOpticalFlowu8(src.data, dst.data, src.cols, src.rows, + lpyrSrc1.data(), lpyrDst1.data(), + lpyrDxSrc.data(), lpyrDySrc.data(), + (const float32_t*)ptsIn.data, + (float32_t*)ptsOut.data, + (int32_t*)statusVec.data, + nPts, + winSize.width, winSize.height, + maxIterations, + nLevels, + maxResidue, minDisplacement, minEigenvalue, lightingNormalized); + } + + if (status != FASTCV_SUCCESS) + { + std::string s = fcvStatusStrings.count(status) ? fcvStatusStrings.at(status) : "unknown"; + CV_Error( cv::Error::StsInternal, "FastCV error: " + s); + } +} + + +void trackOpticalFlowLK(InputArray _src, InputArray _dst, + InputArrayOfArrays _srcPyr, InputArrayOfArrays _dstPyr, + InputArray _ptsIn, OutputArray _ptsOut, InputArray _ptsEst, + OutputArray _statusVec, cv::Size winSize, + cv::TermCriteria termCriteria) +{ + trackOpticalFlowLKInternal(_src, _dst, _srcPyr, _dstPyr, noArray(), noArray(), + _ptsIn, _ptsOut, _ptsEst, + _statusVec, winSize, + termCriteria); +} + +void trackOpticalFlowLK(InputArray _src, InputArray _dst, + InputArrayOfArrays _srcPyr, InputArrayOfArrays _dstPyr, + InputArrayOfArrays _srcDxPyr, InputArrayOfArrays _srcDyPyr, + InputArray _ptsIn, OutputArray _ptsOut, + OutputArray _statusVec, cv::Size winSize, int maxIterations) +{ + trackOpticalFlowLKInternal(_src, _dst, _srcPyr, _dstPyr, + _srcDxPyr, _srcDyPyr, + _ptsIn, _ptsOut, cv::noArray(), + _statusVec, winSize, + {cv::TermCriteria::MAX_ITER | cv::TermCriteria::EPS, + maxIterations, /* maxEpsilon */ 0.03f * 0.03f}); +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/warp.cpp b/modules/fastcv/src/warp.cpp new file mode 100644 
index 00000000000..01f83bdf510 --- /dev/null +++ b/modules/fastcv/src/warp.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +class FcvWarpPerspectiveLoop_Invoker : public cv::ParallelLoopBody +{ + public: + + FcvWarpPerspectiveLoop_Invoker(InputArray _src1, InputArray _src2, OutputArray _dst1, OutputArray _dst2, InputArray _M0, + Size _dsize) : cv::ParallelLoopBody() + { + src1 = _src1.getMat(); + src2 = _src2.getMat(); + dsize = _dsize; + + _dst1.create(dsize, src1.type()); + _dst2.create(dsize, src2.type()); + dst1 = _dst1.getMat(); + dst2 = _dst2.getMat(); + + M = _M0.getMat(); + } + + virtual void operator()(const cv::Range& range) const CV_OVERRIDE + { + uchar* dst1_ptr = dst1.data + range.start*dst1.step; + uchar* dst2_ptr = dst2.data + range.start*dst2.step; + int rangeHeight = range.end - range.start; + + float rangeMatrix[9]; + rangeMatrix[0] = M.at(0,0); + rangeMatrix[1] = M.at(0,1); + rangeMatrix[2] = M.at(0,2)+range.start*M.at(0,1); + rangeMatrix[3] = M.at(1,0); + rangeMatrix[4] = M.at(1,1); + rangeMatrix[5] = M.at(1,2)+range.start*M.at(1,1); + rangeMatrix[6] = M.at(2,0); + rangeMatrix[7] = M.at(2,1); + rangeMatrix[8] = M.at(2,2)+range.start*M.at(2,1); + + fcv2PlaneWarpPerspectiveu8(src1.data, src2.data, src1.cols, src1.rows, src1.step, src2.step, dst1_ptr, dst2_ptr, + dsize.width, rangeHeight, dst1.step, dst2.step, rangeMatrix); + } + + private: + Mat src1; + Mat src2; + Mat dst1; + Mat dst2; + Mat M; + Size dsize; + + FcvWarpPerspectiveLoop_Invoker(const FcvWarpPerspectiveLoop_Invoker &); // = delete; + const FcvWarpPerspectiveLoop_Invoker& operator= (const FcvWarpPerspectiveLoop_Invoker &); // = delete; +}; + +void warpPerspective2Plane(InputArray _src1, InputArray _src2, OutputArray _dst1, OutputArray _dst2, InputArray _M0, + Size dsize) +{ + INITIALIZATION_CHECK; + 
CV_Assert(!_src1.empty() && _src1.type() == CV_8UC1); + CV_Assert(!_src2.empty() && _src2.type() == CV_8UC1); + CV_Assert(!_M0.empty()); + + cv::parallel_for_(cv::Range(0, dsize.height), + FcvWarpPerspectiveLoop_Invoker(_src1, _src2, _dst1, _dst2, _M0, dsize), 1); +} + +} // fastcv:: +} // cv:: \ No newline at end of file diff --git a/modules/fastcv/test/test_bilateral.cpp b/modules/fastcv/test/test_bilateral.cpp index 4f582c2ed37..5c883801a92 100644 --- a/modules/fastcv/test/test_bilateral.cpp +++ b/modules/fastcv/test/test_bilateral.cpp @@ -10,20 +10,20 @@ namespace opencv_test { namespace { typedef testing::TestWithParam> fcv_bilateralFilterTest; TEST_P(fcv_bilateralFilterTest, accuracy) -{ +{ cv::Size size = get<0>(GetParam()); int d = get<1>(GetParam()); double sigmaColor = get<2>(GetParam()); - double sigmaSpace = sigmaColor; - - RNG& rng = cv::theRNG(); + double sigmaSpace = sigmaColor; + + RNG& rng = cv::theRNG(); Mat src(size, CV_8UC1); cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); cv::Mat dst; - cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace); - + cv::fastcv::bilateralFilter(src, dst, d, sigmaColor, sigmaSpace); + EXPECT_FALSE(dst.empty()); } diff --git a/modules/fastcv/test/test_blur.cpp b/modules/fastcv/test/test_blur.cpp new file mode 100644 index 00000000000..1dde0261f28 --- /dev/null +++ b/modules/fastcv/test/test_blur.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef testing::TestWithParam> GaussianBlurTest; + +TEST_P(GaussianBlurTest, accuracy) +{ + cv::Size srcSize = get<0>(GetParam()); + int depth = get<1>(GetParam()); + int ksize = get<2>(GetParam()); + bool border = get<3>(GetParam()); + + // For some cases FastCV not support, so skip them + if((ksize!=5) && (depth!=CV_8U)) + return; + + cv::Mat src(srcSize, depth); + cv::Mat dst,ref; + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + + cv::fastcv::gaussianBlur(src, dst, ksize, border); + + if(depth == CV_32S) + src.convertTo(src, CV_32F); + cv::GaussianBlur(src,ref,Size(ksize,ksize),0,0,border); + ref.convertTo(ref,depth); + + cv::Mat difference; + cv::absdiff(dst, ref, difference); + + int num_diff_pixels = cv::countNonZero(difference); + + EXPECT_LT(num_diff_pixels, (src.rows+src.cols)*ksize); +} + +typedef testing::TestWithParam> Filter2DTest; + +TEST_P(Filter2DTest, accuracy) +{ + Size srcSize = get<0>(GetParam()); + int ddepth = get<1>(GetParam()); + int ksize = get<2>(GetParam()); + + cv::Mat src(srcSize, CV_8U); + cv::Mat kernel; + cv::Mat dst, ref; + + switch (ddepth) + { + case CV_8U: + case CV_16S: + { + kernel.create(ksize,ksize,CV_8S); + break; + } + case CV_32F: + { + kernel.create(ksize,ksize,CV_32F); + break; + } + default: + return; + } + + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + cvtest::randUni(rng, kernel, Scalar::all(INT8_MIN), Scalar::all(INT8_MAX)); + + cv::fastcv::filter2D(src, dst, ddepth, kernel); + cv::filter2D(src, ref, ddepth, kernel); + + cv::Mat difference; + dst.convertTo(dst, CV_8U); + ref.convertTo(ref, CV_8U); + cv::absdiff(dst, ref, difference); + + int num_diff_pixels = cv::countNonZero(difference); + EXPECT_LT(num_diff_pixels, (src.rows+src.cols)*ksize); +} + +typedef testing::TestWithParam> SepFilter2DTest; + 
+TEST_P(SepFilter2DTest, accuracy) +{ + Size srcSize = get<0>(GetParam()); + int ksize = get<1>(GetParam()); + + cv::Mat src(srcSize, CV_8U); + cv::Mat kernel(1,ksize,CV_8S); + cv::Mat dst,ref; + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + cvtest::randUni(rng, kernel, Scalar::all(INT8_MIN), Scalar::all(INT8_MAX)); + + cv::fastcv::sepFilter2D(src, dst, CV_8U, kernel, kernel); + cv::sepFilter2D(src,ref,CV_8U,kernel,kernel); + + cv::Mat difference; + cv::absdiff(dst, ref, difference); + int num_diff_pixels = cv::countNonZero(difference); + EXPECT_LT(num_diff_pixels, (src.rows+src.cols)*ksize); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, GaussianBlurTest, Combine( +/*image size*/ ::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), +/*image depth*/ ::testing::Values(CV_8U,CV_16S,CV_32S), +/*kernel size*/ ::testing::Values(3, 5), +/*blur border*/ ::testing::Values(true,false) +)); + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, Filter2DTest, Combine( +/*image sie*/ Values(perf::szVGA, perf::sz720p, perf::sz1080p), +/*dst depth*/ Values(CV_8U,CV_16S,CV_32F), +/*kernel size*/ Values(3, 5, 7, 9, 11) +)); + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, SepFilter2DTest, Combine( +/*image size*/ Values(perf::szVGA, perf::sz720p, perf::sz1080p), +/*kernel size*/ Values(3, 5, 7, 9, 11) +)); + +}} // namespaces opencv_test, :: \ No newline at end of file diff --git a/modules/fastcv/test/test_edges.cpp b/modules/fastcv/test/test_edges.cpp new file mode 100644 index 00000000000..e1e1576ef15 --- /dev/null +++ b/modules/fastcv/test/test_edges.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef testing::TestWithParam> Sobel; +typedef testing::TestWithParam> Sobel3x3u8; + +TEST_P(Sobel,accuracy) +{ + Size srcSize = get<0>(GetParam()); + int ksize = get<1>(GetParam()); + int border = get<2>(GetParam()); + int borderValue = get<3>(GetParam()); + + cv::Mat dx, dy, src(srcSize, CV_8U), refx, refy; + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + cv::fastcv::sobel(src, dx, dy, ksize, border, borderValue); + + cv::Sobel(src, refx, CV_16S, 1, 0, ksize, 1.0, 0.0, border); + cv::Sobel(src, refy, CV_16S, 0, 1, ksize, 1.0, 0.0, border); + + cv::Mat difference_x, difference_y; + cv::absdiff(dx, refx, difference_x); + cv::absdiff(dy, refy, difference_y); + + int num_diff_pixels_x = cv::countNonZero(difference_x); + int num_diff_pixels_y = cv::countNonZero(difference_y); + EXPECT_LT(num_diff_pixels_x, src.size().area()*0.1); + EXPECT_LT(num_diff_pixels_y, src.size().area()*0.1); +} + +TEST_P(Sobel3x3u8,accuracy) +{ + Size srcSize = get<0>(GetParam()); + int ddepth = get<1>(GetParam()); + + cv::Mat dx, dy, src(srcSize, CV_8U), refx, refy; + RNG& rng = cv::theRNG(); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + + cv::fastcv::sobel3x3u8(src, dx, dy, ddepth, 0); + cv::Sobel(src, refx, ddepth, 1, 0); + cv::Sobel(src, refy, ddepth, 0, 1); + + cv::Mat difference_x, difference_y; + cv::absdiff(dx, refx, difference_x); + cv::absdiff(dy, refy, difference_y); + + int num_diff_pixels_x = cv::countNonZero(difference_x); + int num_diff_pixels_y = cv::countNonZero(difference_y); + EXPECT_LT(num_diff_pixels_x, src.size().area()*0.1); + EXPECT_LT(num_diff_pixels_y, src.size().area()*0.1); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, Sobel, Combine( +/*image size*/ Values(perf::szVGA, perf::sz720p, perf::sz1080p), +/*kernel size*/ Values(3,5,7), +/*border*/ Values(BORDER_CONSTANT, BORDER_REPLICATE), 
+/*border value*/ Values(0) +)); + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, Sobel3x3u8, Combine( +/*image size*/ Values(perf::szVGA, perf::sz720p, perf::sz1080p), +/*dst depth*/ Values(CV_16S, CV_32F) +)); + +} +} diff --git a/modules/fastcv/test/test_fft.cpp b/modules/fastcv/test/test_fft.cpp index 18b53d88ba0..ef70f8e12f5 100644 --- a/modules/fastcv/test/test_fft.cpp +++ b/modules/fastcv/test/test_fft.cpp @@ -39,7 +39,6 @@ TEST_P(FFTExtTest, inverse) RNG& rng = cv::theRNG(); Mat src(size, CV_8UC1); cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); - //cv::Mat src = imread(cvtest::findDataFile("cv/shared/lena.png"), IMREAD_GRAYSCALE); Mat srcFloat; src.convertTo(srcFloat, CV_32F); diff --git a/modules/fastcv/test/test_ipptransform.cpp b/modules/fastcv/test/test_ipptransform.cpp new file mode 100644 index 00000000000..66ff8cbd59d --- /dev/null +++ b/modules/fastcv/test/test_ipptransform.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +class DCTExtTest : public ::testing::TestWithParam {}; + +TEST_P(DCTExtTest, forward) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + Mat srcFloat; + src.convertTo(srcFloat, CV_32F); + + Mat dst, ref; + cv::fastcv::DCT(src, dst); + + cv::dct(srcFloat, ref); + + Mat dstFloat; + ref.convertTo(dstFloat, CV_32F); + + double normInf = cvtest::norm(dstFloat, ref, cv::NORM_INF); + double normL2 = cvtest::norm(dstFloat, ref, cv::NORM_L2) / dst.size().area(); + + if (cvtest::debugLevel > 0) + { + std::cout << "dst:" << std::endl << dst << std::endl; + std::cout << "ref:" << std::endl << ref << std::endl; + } + + EXPECT_EQ(normInf, 0); + EXPECT_EQ(normL2, 0); +} + +TEST_P(DCTExtTest, inverse) +{ + Size size = GetParam(); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + + Mat srcFloat; + src.convertTo(srcFloat, CV_32F); + + Mat fwd, back; + cv::fastcv::DCT(src, fwd); + cv::fastcv::IDCT(fwd, back); + Mat backFloat; + back.convertTo(backFloat, CV_32F); + + Mat fwdRef, backRef; + cv::dct(srcFloat, fwdRef); + cv::idct(fwdRef, backRef); + + double normInf = cvtest::norm(backFloat, backRef, cv::NORM_INF); + double normL2 = cvtest::norm(backFloat, backRef, cv::NORM_L2) / src.size().area(); + + if (cvtest::debugLevel > 0) + { + std::cout << "src:" << std::endl << src << std::endl; + std::cout << "back:" << std::endl << back << std::endl; + std::cout << "backRef:" << std::endl << backRef << std::endl; + } + + EXPECT_LE(normInf, 7.00005); + EXPECT_LT(normL2, 0.13); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, DCTExtTest, ::testing::Values(Size(8, 8), Size(128, 128), Size(32, 256), Size(512, 512))); + +}} // namespaces opencv_test, :: diff --git a/modules/fastcv/test/test_moments.cpp 
b/modules/fastcv/test/test_moments.cpp index 1d23156dcf2..d4ef89f98db 100644 --- a/modules/fastcv/test/test_moments.cpp +++ b/modules/fastcv/test/test_moments.cpp @@ -3,8 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "opencv2/ts.hpp" -#include "opencv2/fastcv/moments.hpp" +#include "test_precomp.hpp" namespace opencv_test { namespace { @@ -37,7 +36,7 @@ TEST_P(fcv_momentsTest, accuracy) INSTANTIATE_TEST_CASE_P(/*nothing*/, fcv_momentsTest, Combine( Values(false, true), Values(TYPICAL_MAT_SIZES), - Values(CV_8UC1, CV_32SC1, CV_32FC1) + Values(CV_8UC1, CV_32SC1, CV_32FC1) )); } diff --git a/modules/fastcv/test/test_mser.cpp b/modules/fastcv/test/test_mser.cpp index ebacbad32f3..29cae5808a7 100644 --- a/modules/fastcv/test/test_mser.cpp +++ b/modules/fastcv/test/test_mser.cpp @@ -23,32 +23,32 @@ TEST_P(MSERTest, accuracy) cv::Mat src = imread(cvtest::findDataFile(imgPath), cv::IMREAD_GRAYSCALE); - unsigned int delta = 2; - unsigned int minArea = 256; - unsigned int maxArea = (int)src.total()/4; + uint32_t delta = 2; + uint32_t minArea = 256; + uint32_t maxArea = (int)src.total()/4; float maxVariation = 0.15f; float minDiversity = 0.2f; std::vector> contours; std::vector bboxes; - std::vector contourData; + std::vector contourData; + cv::Ptr mser; + mser = cv::fastcv::FCVMSER::create(src.size(), numNeighbors, delta, minArea, maxArea, + maxVariation, minDiversity); if (useBboxes) { if (useContourData) { - cv::fastcv::MSER(src, contours, bboxes, contourData, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + mser->detect(src, contours, bboxes, contourData); } else { - cv::fastcv::MSER(src, contours, bboxes, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + mser->detect(src, contours, bboxes); } } else { - cv::fastcv::MSER(src, contours, numNeighbors, - delta, minArea, maxArea, maxVariation, minDiversity); + mser->detect(src, contours); } Rect imgRect(0, 0, src.cols, src.rows); diff --git 
a/modules/fastcv/test/test_precomp.hpp b/modules/fastcv/test/test_precomp.hpp index 1b4c23eca30..7ff8ed78049 100644 --- a/modules/fastcv/test/test_precomp.hpp +++ b/modules/fastcv/test/test_precomp.hpp @@ -4,6 +4,7 @@ */ #include +#include #include #include diff --git a/modules/fastcv/test/test_pyramid.cpp b/modules/fastcv/test/test_pyramid.cpp new file mode 100644 index 00000000000..29acf9ab9a7 --- /dev/null +++ b/modules/fastcv/test/test_pyramid.cpp @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef std::tuple PyramidTestParams; +class PyramidTest : public ::testing::TestWithParam { }; + +TEST_P(PyramidTest, accuracy) +{ + auto par = GetParam(); + + bool useFloat = std::get<0>(par); + int nLevels = std::get<1>(par); + bool scaleBy2 = std::get<2>(par); + + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + if (useFloat) + { + cv::Mat f; + src.convertTo(f, CV_32F); + src = f; + } + + std::vector pyr; + cv::fastcv::buildPyramid(src, pyr, nLevels, scaleBy2); + + ASSERT_EQ(pyr.size(), (size_t)nLevels); + + std::vector refPyr; + if (scaleBy2) + { + cv::buildPyramid(src, refPyr, nLevels - 1); + } + else // ORB downscaling + { + for (int i = 0; i < nLevels; i++) + { + // we don't know how exactly the bit-accurate size is calculated + cv::Mat level; + cv::resize(src, level, pyr[i].size(), 0, 0, cv::INTER_AREA); + refPyr.push_back(level); + } + } + + for (int i = 0; i < nLevels; i++) + { + cv::Mat ref = refPyr[i]; + cv::Mat m = pyr[i]; + ASSERT_EQ(m.size(), ref.size()); + double l2diff = cv::norm(m, ref, cv::NORM_L2); + double linfdiff = cv::norm(m, ref, cv::NORM_INF); + + double l2Thresh = scaleBy2 ? 178.0 : 5216.0; + double linfThresh = scaleBy2 ? 
16.0 : 116.0; + EXPECT_LE(l2diff, l2Thresh); + EXPECT_LE(linfdiff, linfThresh); + } + + if (cvtest::debugLevel > 0) + { + for (int i = 0; i < nLevels; i++) + { + char tchar = useFloat ? 'f' : 'i'; + std::string scaleStr = scaleBy2 ? "x2" : "xORB"; + cv::imwrite(cv::format("pyr_diff_%c_%d_%s_l%d.png", tchar, nLevels, scaleStr.c_str(), i), cv::abs(pyr[i] - refPyr[i])); + } + } +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, PyramidTest, + // useFloat, nLevels, scaleBy2 + ::testing::Values( + PyramidTestParams { true, 2, true}, PyramidTestParams { true, 3, true}, PyramidTestParams { true, 4, true}, + PyramidTestParams {false, 2, true}, PyramidTestParams {false, 3, true}, PyramidTestParams {false, 4, true}, + PyramidTestParams {false, 2, false}, PyramidTestParams {false, 3, false}, PyramidTestParams {false, 4, false} + )); + +typedef std::tuple SobelPyramidTestParams; +class SobelPyramidTest : public ::testing::TestWithParam {}; + +TEST_P(SobelPyramidTest, accuracy) +{ + auto p = GetParam(); + int type = std::get<0>(p); + size_t nLevels = std::get<1>(p); + + // NOTE: test files should be manually loaded to folder on a device, for example like this: + // adb push fastcv/misc/bilateral_recursive/ /sdcard/testdata/fastcv/bilateral/ + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + std::vector pyr; + cv::fastcv::buildPyramid(src, pyr, nLevels); + + std::vector pyrDx, pyrDy; + cv::fastcv::sobelPyramid(pyr, pyrDx, pyrDy, type); + + ASSERT_EQ(pyrDx.size(), nLevels); + ASSERT_EQ(pyrDy.size(), nLevels); + + for (size_t i = 0; i < nLevels; i++) + { + ASSERT_EQ(pyrDx[i].type(), type); + ASSERT_EQ(pyrDx[i].size(), pyr[i].size()); + ASSERT_EQ(pyrDy[i].type(), type); + ASSERT_EQ(pyrDy[i].size(), pyr[i].size()); + } + + std::vector refPyrDx(nLevels), refPyrDy(nLevels); + for (size_t i = 0; i < nLevels; i++) + { + int stype = (type == CV_8S) ? 
CV_16S : type; + cv::Mat dx, dy; + cv::Sobel(pyr[i], dx, stype, 1, 0); + cv::Sobel(pyr[i], dy, stype, 0, 1); + dx.convertTo(refPyrDx[i], type, 1.0/8.0, 0.0); + dy.convertTo(refPyrDy[i], type, 1.0/8.0, 0.0); + } + + for (size_t i = 0; i < nLevels; i++) + { + cv::Mat ref, dst; + double normInf, normL2; + ref = refPyrDx[i]; + dst = pyrDx[i]; + normInf = cvtest::norm(dst, ref, cv::NORM_INF); + normL2 = cvtest::norm(dst, ref, cv::NORM_L2) / dst.total(); + + EXPECT_LE(normInf, 76.1); + EXPECT_LT(normL2, 0.4); + + ref = refPyrDy[i]; + dst = pyrDy[i]; + normInf = cvtest::norm(dst, ref, cv::NORM_INF); + normL2 = cvtest::norm(dst, ref, cv::NORM_L2) / dst.total(); + + EXPECT_LE(normInf, 66.6); + EXPECT_LT(normL2, 0.4); + } + + if (cvtest::debugLevel > 0) + { + std::map typeToString = + { + {CV_8U, "8u"}, {CV_8S, "8s"}, {CV_16U, "16u"}, {CV_16S, "16s"}, + {CV_32S, "32s"}, {CV_32F, "32f"}, {CV_64F, "64f"}, {CV_16F, "16f"}, + }; + + for (size_t i = 0; i < nLevels; i++) + { + cv::imwrite(cv::format("pyr_l%zu.png", i), pyr[i]); + cv::imwrite(cv::format("pyr_sobel_x_t%s_l%zu.png", typeToString.at(type).c_str(), i), pyrDx[i] + 128); + cv::imwrite(cv::format("pyr_sobel_y_t%s_l%zu.png", typeToString.at(type).c_str(), i), pyrDy[i] + 128); + + cv::imwrite(cv::format("ref_pyr_sobel_x_t%s_l%zu.png", typeToString.at(type).c_str(), i), refPyrDx[i] + 128); + cv::imwrite(cv::format("ref_pyr_sobel_y_t%s_l%zu.png", typeToString.at(type).c_str(), i), refPyrDy[i] + 128); + } + } +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, SobelPyramidTest, ::testing::Combine( + ::testing::Values(CV_8S, CV_16S, CV_32F), // depth + ::testing::Values(3, 6))); // nLevels + + +}} // namespaces opencv_test, :: diff --git a/modules/fastcv/test/test_remap.cpp b/modules/fastcv/test/test_remap.cpp index 6fa5ccdabfd..28501534a5d 100644 --- a/modules/fastcv/test/test_remap.cpp +++ b/modules/fastcv/test/test_remap.cpp @@ -3,8 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "opencv2/ts.hpp" -#include 
"opencv2/fastcv/remap.hpp" +#include "test_precomp.hpp" namespace opencv_test { namespace { @@ -77,12 +76,8 @@ TEST_P(RemapTest, accuracy) cv::Mat remapOpenCV; cv::remap(src_converted, remapOpenCV, map_x, map_y, interpolation); - cv::Mat diffImage; - cv::absdiff(dst, remapOpenCV, diffImage); - // Calculate the maximum difference - double maxVal=0.0; - cv::minMaxLoc(diffImage, nullptr, &maxVal); + double maxVal = cv::norm(dst, remapOpenCV, cv::NORM_INF); // Assert if the difference is acceptable (max difference should be less than 10) CV_Assert(maxVal < 10 && "Difference between images is too high!"); @@ -105,12 +100,8 @@ TEST_P(RemapTestRGBA, accuracy) cv::Mat remapOpenCV; cv::remap(src_converted, remapOpenCV, map_x, map_y, interpolation); - cv::Mat diffImage; - cv::absdiff(dst, remapOpenCV, diffImage); - // Calculate the maximum difference - double maxVal=0.0; - cv::minMaxLoc(diffImage, nullptr, &maxVal); + double maxVal = cv::norm(dst, remapOpenCV, cv::NORM_INF); // Assert if the difference is acceptable (max difference should be less than 10) CV_Assert(maxVal < 10 && "Difference between images is too high!"); diff --git a/modules/fastcv/test/test_scale.cpp b/modules/fastcv/test/test_scale.cpp index 394fd907cc9..b8e84218ed8 100644 --- a/modules/fastcv/test/test_scale.cpp +++ b/modules/fastcv/test/test_scale.cpp @@ -3,8 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "opencv2/ts.hpp" -#include "opencv2/fastcv/scale.hpp" +#include "test_precomp.hpp" namespace opencv_test { namespace { @@ -25,12 +24,8 @@ TEST(resizeDownBy2, accuracy) cv::Mat resizedImageOpenCV; cv::resize(inputImage, resizedImageOpenCV, cv::Size(inputImage.cols / 2, inputImage.rows / 2), 0, 0, INTER_AREA); - cv::Mat diffImage; - cv::absdiff(resized_image, resizedImageOpenCV, diffImage); - // Calculate the maximum difference - double maxVal=0.0; - cv::minMaxLoc(diffImage, nullptr, &maxVal); + double maxVal = cv::norm(resized_image, resizedImageOpenCV, cv::NORM_INF); // Assert if the 
difference is acceptable (max difference should be less than 10) CV_Assert(maxVal < 10 && "Difference between images is too high!"); @@ -50,12 +45,8 @@ TEST(resizeDownBy4, accuracy) cv::Mat resizedImageOpenCV; cv::resize(inputImage, resizedImageOpenCV, cv::Size(inputImage.cols / 4, inputImage.rows / 4), 0, 0, INTER_AREA); - cv::Mat diffImage; - cv::absdiff(resized_image, resizedImageOpenCV, diffImage); - // Calculate the maximum difference - double maxVal=0.0; - cv::minMaxLoc(diffImage, nullptr, &maxVal); + double maxVal = cv::norm(resized_image, resizedImageOpenCV, cv::NORM_INF); // Assert if the difference is acceptable (max difference should be less than 10) CV_Assert(maxVal < 10 && "Difference between images is too high!"); @@ -79,7 +70,7 @@ TEST_P(ResizeBy2Test, ResizeBy2) { EXPECT_EQ(resized_image.size().height, size.height * 0.5); } -TEST_P(ResizeBy4Test, ResizeBy2) { +TEST_P(ResizeBy4Test, ResizeBy4) { //Size size = get<0>(GetParam()); Size size = GetParam(); @@ -89,7 +80,7 @@ TEST_P(ResizeBy4Test, ResizeBy2) { Size dsize; cv::Mat resized_image; - // Resize the image by a factor of 2 + // Resize the image by a factor of 4 cv::fastcv::resizeDownBy4(inputImage, resized_image); // Check if the output size is correct @@ -98,14 +89,14 @@ TEST_P(ResizeBy4Test, ResizeBy2) { } INSTANTIATE_TEST_CASE_P( - ResizeTests, - ResizeBy2Test, + ResizeTests, + ResizeBy2Test, ::testing::Values(cv::Size(640, 480), cv::Size(1280, 720), cv::Size(1920, 1080) )); INSTANTIATE_TEST_CASE_P( - ResizeTests, - ResizeBy4Test, + ResizeTests, + ResizeBy4Test, ::testing::Values(cv::Size(640, 480), cv::Size(1280, 720), cv::Size(1920, 1080) )); diff --git a/modules/fastcv/test/test_tracking.cpp b/modules/fastcv/test/test_tracking.cpp new file mode 100644 index 00000000000..7833c71b1ec --- /dev/null +++ b/modules/fastcv/test/test_tracking.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef std::tuple TrackingTestParams; +class TrackingTest : public ::testing::TestWithParam {}; + +TEST_P(TrackingTest, accuracy) +{ + auto par = GetParam(); + + int winSz = std::get<0>(par); + bool useSobelPyramid = std::get<1>(par); + bool useFastCvPyramids = std::get<2>(par); + bool useInitialEstimate = std::get<3>(par); + + cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE); + + double ang = 5.0 * CV_PI / 180.0; + cv::Matx33d tr = { + cos(ang), -sin(ang), 1, + sin(ang), cos(ang), 2, + 0, 0, 1 + }; + cv::Matx33d orig { + 1, 0, -(double)src.cols / 2, + 0, 1, -(double)src.rows / 2, + 0, 0, 1 + }; + cv::Matx33d back { + 1, 0, (double)src.cols / 2, + 0, 1, (double)src.rows / 2, + 0, 0, 1 + }; + cv::Matx23d trans = (back * tr * orig).get_minor<2, 3>(0, 0); + + cv::Mat dst; + cv::warpAffine(src, dst, trans, src.size()); + + int nLevels = 4; + std::vector srcPyr, dstPyr; + + if (useFastCvPyramids) + { + cv::fastcv::buildPyramid(src, srcPyr, nLevels); + cv::fastcv::buildPyramid(dst, dstPyr, nLevels); + } + else + { + cv::buildPyramid(src, srcPyr, nLevels - 1); + cv::buildPyramid(dst, dstPyr, nLevels - 1); + } + + cv::Matx23f transf = trans; + int nPts = 32; + std::vector ptsIn, ptsOut, ptsEst, ptsExpected; + for (int i = 0; i < nPts; i++) + { + cv::Point2f p { (((float)cv::theRNG())*0.5f + 0.25f) * src.cols, + (((float)cv::theRNG())*0.5f + 0.25f) * src.rows }; + ptsIn.push_back(p); + ptsExpected.push_back(transf * cv::Vec3f(p.x, p.y, 1.0)); + ptsOut.push_back({ }); + ptsEst.push_back(p); + } + + std::vector statusVec(nPts); + + cv::TermCriteria termCrit; + termCrit.type = cv::TermCriteria::COUNT | cv::TermCriteria::EPS; + termCrit.maxCount = 7; + termCrit.epsilon = 0.03f * 0.03f; + + if (useSobelPyramid) + { + std::vector srcDxPyr, srcDyPyr; + cv::fastcv::sobelPyramid(srcPyr, srcDxPyr, srcDyPyr, CV_8S); + 
cv::fastcv::trackOpticalFlowLK(src, dst, srcPyr, dstPyr, srcDxPyr, srcDyPyr, + ptsIn, ptsOut, statusVec, {winSz, winSz}); + } + else + { + cv::fastcv::trackOpticalFlowLK(src, dst, srcPyr, dstPyr, ptsIn, ptsOut, (useInitialEstimate ? ptsEst : noArray()), + statusVec, {winSz, winSz}, termCrit); + } + + std::vector ocvPtsOut; + std::vector ocvStatusVec; + std::vector ocvErrVec; + cv::calcOpticalFlowPyrLK(src, dst, ptsIn, ocvPtsOut, ocvStatusVec, ocvErrVec, {winSz, winSz}, nLevels - 1, termCrit); + + cv::Mat refStatusVec(nPts, 1, CV_32S, Scalar::all(1)); + cv::Mat ocvStatusVecInt; + cv::Mat(ocvStatusVec).convertTo(ocvStatusVecInt, CV_32S); + + double statusNormOcv = cv::norm(ocvStatusVecInt, refStatusVec, NORM_INF); + double statusNorm = cv::norm(cv::Mat(statusVec), refStatusVec, NORM_INF); + + EXPECT_EQ(statusNormOcv, 0); + EXPECT_EQ(statusNorm, 0); + + double diffNormOcv = cv::norm(ocvPtsOut, ptsExpected, NORM_L2); + double diffNorm = cv::norm(ptsOut, ptsExpected, NORM_L2); + + EXPECT_LT(diffNormOcv, 31.92); + EXPECT_LT(diffNorm, 6.69); + + if (cvtest::debugLevel > 0) + { + auto drawPts = [ptsIn, dst](const std::vector& ptsRes, const std::string fname) + { + cv::Mat draw = dst.clone(); + for (size_t i = 0; i < ptsIn.size(); i++) + { + cv::line(draw, ptsIn[i], ptsRes[i], Scalar::all(255)); + cv::circle(draw, ptsIn[i], 1, Scalar::all(255)); + cv::circle(draw, ptsRes[i], 3, Scalar::all(255)); + } + cv::imwrite(fname, draw); + }; + + drawPts(ptsOut, "track_w"+std::to_string(winSz)+"_warped.png"); + drawPts(ocvPtsOut, "track_ocv_warped.png"); + + std::cout << "status vec:" << std::endl << cv::Mat(statusVec).t() << std::endl; + std::cout << "status vec ocv:" << std::endl << cv::Mat(ocvStatusVec).t() << std::endl; + } +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, TrackingTest, + ::testing::Combine(::testing::Values(5, 7, 9), // window size + ::testing::Bool(), // useSobelPyramid + ::testing::Bool(), // useFastCvPyramids + ::testing::Bool() // useInitialEstimate + )); + 
+}} // namespaces opencv_test, :: diff --git a/modules/fastcv/test/test_warp.cpp b/modules/fastcv/test/test_warp.cpp new file mode 100644 index 00000000000..240262f93ca --- /dev/null +++ b/modules/fastcv/test/test_warp.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +typedef testing::TestWithParam WarpPerspective2Plane; + +TEST_P(WarpPerspective2Plane, accuracy) +{ + cv::Size dstSize = GetParam(); + cv::Mat img = imread(cvtest::findDataFile("cv/shared/baboon.png")); + Mat src(img.rows, img.cols, CV_8UC1); + cvtColor(img,src,cv::COLOR_BGR2GRAY); + cv::Mat dst1, dst2, mat, ref1, ref2; + mat.create(3,3,CV_32FC1); + dst1.create(dstSize,CV_8UC1); + dst2.create(dstSize,CV_8UC1); + + RNG rng = RNG((uint64)-1); + Point2f s[4], d[4]; + + s[0] = Point2f(0,0); + d[0] = Point2f(0,0); + s[1] = Point2f(src.cols-1.f,0); + d[1] = Point2f(dst1.cols-1.f,0); + s[2] = Point2f(src.cols-1.f,src.rows-1.f); + d[2] = Point2f(dst1.cols-1.f,dst1.rows-1.f); + s[3] = Point2f(0,src.rows-1.f); + d[3] = Point2f(0,dst1.rows-1.f); + + float buffer[16]; + Mat tmp( 1, 16, CV_32FC1, buffer ); + rng.fill( tmp, 1, Scalar::all(0.), Scalar::all(0.1) ); + + for(int i = 0; i < 4; i++ ) + { + s[i].x += buffer[i*4]*src.cols/2; + s[i].y += buffer[i*4+1]*src.rows/2; + d[i].x += buffer[i*4+2]*dst1.cols/2; + d[i].y += buffer[i*4+3]*dst1.rows/2; + } + + cv::getPerspectiveTransform( s, d ).convertTo( mat, mat.depth() ); + // Invert the perspective matrix + invert(mat,mat); + + cv::fastcv::warpPerspective2Plane(src, src, dst1, dst2, mat, dstSize); + cv::warpPerspective(src,ref1,mat,dstSize,(cv::INTER_LINEAR | cv::WARP_INVERSE_MAP)); + cv::warpPerspective(src,ref2,mat,dstSize,(cv::INTER_LINEAR | cv::WARP_INVERSE_MAP)); + + cv::Mat difference1, difference2, mask1,mask2; + cv::absdiff(dst1, ref1, difference1); + cv::absdiff(dst2, ref2, difference2); + 
cv::threshold(difference1, mask1, 5, 255, cv::THRESH_BINARY); + cv::threshold(difference2, mask2, 5, 255, cv::THRESH_BINARY); + int num_diff_pixels_1 = cv::countNonZero(mask1); + int num_diff_pixels_2 = cv::countNonZero(mask2); + + EXPECT_LT(num_diff_pixels_1, src.size().area()*0.02); + EXPECT_LT(num_diff_pixels_2, src.size().area()*0.02); +} + +INSTANTIATE_TEST_CASE_P(FastCV_Extension, WarpPerspective2Plane, Values(perf::szVGA, perf::sz720p, perf::sz1080p)); + +} +} \ No newline at end of file From a00b3f329660f28a4775cd870e30d73c98e01c79 Mon Sep 17 00:00:00 2001 From: adsha-quic Date: Mon, 23 Dec 2024 12:41:45 +0530 Subject: [PATCH 09/14] Merge pull request #3845 from CodeLinaro:adsha_2ndPost FastCV Extension code for OpenCV 2ndpost-2 #3845 ### Description: - Add support for cv::fastcv::calcHist Depends on: [opencv/opencv_contrib#3844](https://github.com/opencv/opencv_contrib/pull/3844) Depends on: [opencv/opencv#26619](https://github.com/opencv/opencv/pull/26619) Requires binary from: [opencv/opencv_3rdparty#90](https://github.com/opencv/opencv_3rdparty/pull/90) ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [ ] The feature is well documented and sample code can be built with the project CMake --- modules/fastcv/include/opencv2/fastcv.hpp | 1 + .../include/opencv2/fastcv/histogram.hpp | 29 +++++++ .../fastcv/include/opencv2/fastcv/moments.hpp | 3 +- modules/fastcv/perf/perf_bilateral.cpp | 2 +- modules/fastcv/perf/perf_histogram.cpp | 36 +++++++++ modules/fastcv/src/bilateralFilter.cpp | 30 +++---- modules/fastcv/src/histogram.cpp | 74 +++++++++++++++++ modules/fastcv/src/moments.cpp | 80 +++++++++---------- modules/fastcv/test/test_bilateral.cpp | 4 +- modules/fastcv/test/test_moments.cpp | 39 +++++---- 10 files changed, 217 insertions(+), 81 deletions(-) create mode 100644 modules/fastcv/include/opencv2/fastcv/histogram.hpp create mode 100644 modules/fastcv/perf/perf_histogram.cpp create mode 100644 modules/fastcv/src/histogram.cpp diff --git a/modules/fastcv/include/opencv2/fastcv.hpp b/modules/fastcv/include/opencv2/fastcv.hpp index 6ed8eba4a33..af188dfcb09 100644 --- a/modules/fastcv/include/opencv2/fastcv.hpp +++ b/modules/fastcv/include/opencv2/fastcv.hpp @@ -16,6 +16,7 @@ #include "opencv2/fastcv/edges.hpp" #include "opencv2/fastcv/fast10.hpp" #include "opencv2/fastcv/fft.hpp" +#include "opencv2/fastcv/histogram.hpp" #include "opencv2/fastcv/hough.hpp" #include "opencv2/fastcv/ipptransform.hpp" #include "opencv2/fastcv/moments.hpp" diff --git a/modules/fastcv/include/opencv2/fastcv/histogram.hpp b/modules/fastcv/include/opencv2/fastcv/histogram.hpp new file mode 100644 index 00000000000..f0bbd3c6f61 --- /dev/null +++ b/modules/fastcv/include/opencv2/fastcv/histogram.hpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#ifndef OPENCV_FASTCV_HISTOGRAM_HPP +#define OPENCV_FASTCV_HISTOGRAM_HPP + +#include + +namespace cv { +namespace fastcv { + +//! @addtogroup fastcv +//! @{ + +/** + * @brief Calculates histogram of input image. 
This function implements a specific use case of + * 256-bin histogram calculation for 8u single channel images in an optimized way. + * @param _src Input image with type CV_8UC1 + * @param _hist Output histogram of type int of 256 bins + */ +CV_EXPORTS_W void calcHist( InputArray _src, OutputArray _hist ); +//! @} + +} // fastcv:: +} // cv:: + +#endif // OPENCV_FASTCV_HISTOGRAM_HPP diff --git a/modules/fastcv/include/opencv2/fastcv/moments.hpp b/modules/fastcv/include/opencv2/fastcv/moments.hpp index 90034548571..13c9019841f 100644 --- a/modules/fastcv/include/opencv2/fastcv/moments.hpp +++ b/modules/fastcv/include/opencv2/fastcv/moments.hpp @@ -16,7 +16,8 @@ namespace fastcv { /** * @brief Calculates all of the moments up to the third order of the image pixels' intensities - The results are returned in the structure cv::Moments. + * The results are returned in the structure cv::Moments. This function cv::fastcv::moments() + * calculates the moments using floating point calculations whereas cv::moments() calculates moments using double. * @param _src Input image with type CV_8UC1, CV_32SC1, CV_32FC1 * @param binary If true, assumes the image to be binary (0x00 for black, 0xff for white), otherwise assumes the image to be * grayscale.
diff --git a/modules/fastcv/perf/perf_bilateral.cpp b/modules/fastcv/perf/perf_bilateral.cpp index 63323d459cc..bfeb50f288c 100644 --- a/modules/fastcv/perf/perf_bilateral.cpp +++ b/modules/fastcv/perf/perf_bilateral.cpp @@ -52,7 +52,7 @@ PERF_TEST_P(BilateralPerfTest, run, RNG& rng = cv::theRNG(); Mat src(size, CV_8UC1); - cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); Mat dst; while (next()) diff --git a/modules/fastcv/perf/perf_histogram.cpp b/modules/fastcv/perf/perf_histogram.cpp new file mode 100644 index 00000000000..60bfa51a19a --- /dev/null +++ b/modules/fastcv/perf/perf_histogram.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { + +typedef std::tuple HistogramPerfParams; +typedef perf::TestBaseWithParam HistogramPerfTest; + + +PERF_TEST_P(HistogramPerfTest, run, + testing::Values(perf::szQVGA, perf::szVGA, perf::sz720p, perf::sz1080p) + ) +{ + auto p = GetParam(); + cv::Size size = std::get<0>(p); + + RNG& rng = cv::theRNG(); + Mat src(size, CV_8UC1); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); + Mat hist(1, 256, CV_32SC1); + + while (next()) + { + startTimer(); + cv::fastcv::calcHist(src, hist); + stopTimer(); + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace diff --git a/modules/fastcv/src/bilateralFilter.cpp b/modules/fastcv/src/bilateralFilter.cpp index a0995347b24..79e6b1067bc 100644 --- a/modules/fastcv/src/bilateralFilter.cpp +++ b/modules/fastcv/src/bilateralFilter.cpp @@ -22,13 +22,13 @@ class FcvFilterLoop_Invoker : public cv::ParallelLoopBody { int height_ = range.end - range.start; int width_ = width; - cv::Mat src_; - int n = knl/2; + cv::Mat src_; + int n = knl/2; src_ = cv::Mat(height_ + 2 * n, width_ + 2 * n, CV_8U); if (range.start == 0 && range.end == height) { - cv::copyMakeBorder(src, 
src_, n, n, n, n, bdr); + cv::copyMakeBorder(src(cv::Rect(0, 0, width, height)), src_, n, n, n, n, bdr); } else if (range.start == 0) { @@ -43,7 +43,7 @@ class FcvFilterLoop_Invoker : public cv::ParallelLoopBody cv::copyMakeBorder(src(cv::Rect(0, range.start - n, width_, height_ + 2 * n)), src_, 0, 0, n, n, bdr); } - cv::Mat dst_padded = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); + cv::Mat dst_padded = cv::Mat(height_ + 2*n, width_ + 2*n, CV_8U); auto func = (knl == 5) ? fcvBilateralFilter5x5u8_v3 : (knl == 7) ? fcvBilateralFilter7x7u8_v3 : @@ -52,10 +52,10 @@ class FcvFilterLoop_Invoker : public cv::ParallelLoopBody func(src_.data, width_ + 2 * n, height_ + 2 * n, width_ + 2 * n, dst_padded.data, width_ + 2 * n, sigma_color, sigma_space, 0); - cv::Mat dst_temp1 = dst_padded(cv::Rect(n, n, width_, height_)); - cv::Mat dst_temp2 = dst(cv::Rect(0, range.start, width_, height_)); - dst_temp1.copyTo(dst_temp2); - } + cv::Mat dst_temp1 = dst_padded(cv::Rect(n, n, width_, height_)); + cv::Mat dst_temp2 = dst(cv::Rect(0, range.start, width_, height_)); + dst_temp1.copyTo(dst_temp2); + } private: const size_t src_step; @@ -67,8 +67,8 @@ class FcvFilterLoop_Invoker : public cv::ParallelLoopBody float32_t sigma_color; float32_t sigma_space; int ret; - cv::Mat src; - cv::Mat dst; + cv::Mat src; + cv::Mat dst; FcvFilterLoop_Invoker(const FcvFilterLoop_Invoker &); // = delete; const FcvFilterLoop_Invoker& operator= (const FcvFilterLoop_Invoker &); // = delete; @@ -82,24 +82,20 @@ void bilateralFilter( InputArray _src, OutputArray _dst, int d, CV_Assert(!_src.empty()); int type = _src.type(); - CV_Assert(type == CV_8UC1); - CV_Assert(d == 5 || d == 7 || d == 9); + CV_Assert(type == CV_8UC1); + CV_Assert(d == 5 || d == 7 || d == 9); Size size = _src.size(); - _dst.create( size, type ); + _dst.create( size, type ); Mat src = _src.getMat(); Mat dst = _dst.getMat(); CV_Assert(src.data != dst.data); if( sigmaColor <= 0 ) - { sigmaColor = 1; - } if( sigmaSpace <= 0 ) - { sigmaSpace 
= 1; - } int nStripes = (src.rows / 20 == 0) ? 1 : (src.rows / 20); cv::parallel_for_(cv::Range(0, src.rows), diff --git a/modules/fastcv/src/histogram.cpp b/modules/fastcv/src/histogram.cpp new file mode 100644 index 00000000000..1a88ea8eb1c --- /dev/null +++ b/modules/fastcv/src/histogram.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 +*/ + +#include "precomp.hpp" + +namespace cv { +namespace fastcv { + +class FcvHistogramLoop_Invoker : public cv::ParallelLoopBody +{ +public: + + FcvHistogramLoop_Invoker(const uchar * src_data_, size_t src_step_, int width_, int height_, int32_t* gl_hist_, int stripeHeight_, cv::Mutex* histogramLock, int nStripes_): + cv::ParallelLoopBody(), src_data(src_data_), src_step(src_step_), width(width_), height(height_), gl_hist(gl_hist_), stripeHeight(stripeHeight_), histogramLock_(histogramLock), nStripes(nStripes_) + { + } + + virtual void operator()(const cv::Range& range) const CV_OVERRIDE + { + int height_ = stripeHeight; + if(range.end == nStripes) + height_ += (height % nStripes); + const uchar* yS = src_data; + int32_t l_hist[256] = {0}; + fcvImageIntensityHistogram(yS, src_step, 0, range.start, width, height_, l_hist); + cv::AutoLock lock(*histogramLock_); + + for( int i = 0; i < 256; i++ ) + gl_hist[i] += l_hist[i]; + } + +private: + const uchar * src_data; + const size_t src_step; + const int width; + const int height; + int32_t *gl_hist; + int ret; + int stripeHeight; + cv::Mutex* histogramLock_; + int nStripes; + + FcvHistogramLoop_Invoker(const FcvHistogramLoop_Invoker &); // = delete; + const FcvHistogramLoop_Invoker& operator= (const FcvHistogramLoop_Invoker &); // = delete; +}; + +void calcHist( InputArray _src, OutputArray _hist ) +{ + INITIALIZATION_CHECK; + + CV_Assert(!_src.empty()); + int type = _src.type(); + CV_Assert(type == CV_8UC1); + + _hist.create( cv::Size(256, 1), CV_32SC1 ); + Mat src = _src.getMat(); + Mat 
hist = _hist.getMat(); + + for( int i = 0; i < 256; i++ ) + hist.ptr()[i] = 0; + + cv::Mutex histogramLockInstance; + + int nStripes = cv::getNumThreads(); + int stripeHeight = src.rows / nStripes; + + cv::parallel_for_(cv::Range(0, nStripes), + FcvHistogramLoop_Invoker(src.data, src.step[0], src.cols, src.rows, hist.ptr(), stripeHeight, &histogramLockInstance, nStripes), nStripes); +} + +} // fastcv:: +} // cv:: diff --git a/modules/fastcv/src/moments.cpp b/modules/fastcv/src/moments.cpp index 3a0c4249eef..38bae771df3 100644 --- a/modules/fastcv/src/moments.cpp +++ b/modules/fastcv/src/moments.cpp @@ -10,71 +10,63 @@ namespace fastcv { cv::Moments moments(InputArray _src, bool binary) { - INITIALIZATION_CHECK; + INITIALIZATION_CHECK; CV_Assert(!_src.empty()); int type = _src.type(); - CV_Assert(type == CV_8UC1 || type == CV_32SC1 || type == CV_32FC1); + CV_Assert(type == CV_8UC1 || type == CV_32SC1 || type == CV_32FC1); Size size = _src.size(); Mat src = _src.getMat(); cv::Moments m; - if( size.width == 0 || size.height == 0 ) - return m; - - fcvMoments* mFCV = new fcvMoments(); + fcvMoments mFCV; fcvStatus status = FASTCV_SUCCESS; - if(binary) + if(binary) + { + cv::Mat src_binary(size, CV_8UC1); + cv::compare( src, 0, src_binary, cv::CMP_NE ); + fcvImageMomentsu8(src_binary.data, src_binary.cols, + src_binary.rows, src_binary.step[0], &mFCV, binary); + } + else { - cv::Mat src_binary(size, CV_8UC1); - cv::compare( src, 0, src_binary, cv::CMP_NE ); - fcvImageMomentsu8(src_binary.data, src_binary.cols, - src_binary.rows, src_binary.step, mFCV, binary); + switch(type) + { + case CV_8UC1: + fcvImageMomentsu8(src.data, src.cols, src.rows, src.step[0], &mFCV, binary); + break; + case CV_32SC1: + fcvImageMomentss32(src.ptr(), src.cols, src.rows, src.step[0], &mFCV, binary); + break; + case CV_32FC1: + fcvImageMomentsf32(src.ptr(), src.cols, src.rows, src.step[0], &mFCV, binary); + break; + } } - else - { - switch(type) - { - case CV_8UC1: - fcvImageMomentsu8(src.data, 
src.cols, src.rows, - src.step, mFCV, binary); - break; - case CV_32SC1: - fcvImageMomentss32((const int*)src.data, src.cols, src.rows, - src.step, mFCV, binary); - break; - case CV_32FC1: - fcvImageMomentsf32((const float*)src.data, src.cols, src.rows, - src.step, mFCV, binary); - break; - } - } - if (status != FASTCV_SUCCESS) + if (status != FASTCV_SUCCESS) { CV_Error( cv::Error::StsError, cv::format("Error occurred!") ); - delete mFCV; return m; } - m.m00 = mFCV->m00; m.m10 = mFCV->m10; m.m01 = mFCV->m01; - m.m20 = mFCV->m20; m.m11 = mFCV->m11; m.m02 = mFCV->m02; - m.m30 = mFCV->m30; m.m21 = mFCV->m21; m.m12 = mFCV->m12; - m.m03 = mFCV->m03; m.mu02 = mFCV->mu02; m.m03 = mFCV->mu03; - m.mu11 = mFCV->mu11; m.mu12 = mFCV->mu12; m.mu20 = mFCV->mu20; - m.mu21 = mFCV->mu21; m.mu30 = mFCV->mu30; + m.m00 = mFCV.m00; m.m10 = mFCV.m10; m.m01 = mFCV.m01; + m.m20 = mFCV.m20; m.m11 = mFCV.m11; m.m02 = mFCV.m02; + m.m30 = mFCV.m30; m.m21 = mFCV.m21; m.m12 = mFCV.m12; + m.m03 = mFCV.m03; m.mu02 = mFCV.mu02; m.m03 = mFCV.mu03; + m.mu11 = mFCV.mu11; m.mu12 = mFCV.mu12; m.mu20 = mFCV.mu20; + m.mu21 = mFCV.mu21; m.mu30 = mFCV.mu30; - float32_t inv_m00 = 1.0/mFCV->m00; - float32_t inv_sqrt_m00 = mFCV->inv_sqrt_m00; + float32_t inv_m00 = 1.0/mFCV.m00; + float32_t inv_sqrt_m00 = mFCV.inv_sqrt_m00; float32_t s2 = inv_m00 * inv_m00, s3 = s2 * inv_sqrt_m00; - m.nu20 = mFCV->mu20 * s2; m.nu11 = mFCV->mu11 * s2; - m.nu02 = mFCV->mu02 * s2; m.nu30 = mFCV->mu30 * s3; - m.nu21 = mFCV->mu21 * s3; m.nu12 = mFCV->mu12 * s3; - m.nu03 = mFCV->mu03 * s3; + m.nu20 = mFCV.mu20 * s2; m.nu11 = mFCV.mu11 * s2; + m.nu02 = mFCV.mu02 * s2; m.nu30 = mFCV.mu30 * s3; + m.nu21 = mFCV.mu21 * s3; m.nu12 = mFCV.mu12 * s3; + m.nu03 = mFCV.mu03 * s3; - delete mFCV; return m; } diff --git a/modules/fastcv/test/test_bilateral.cpp b/modules/fastcv/test/test_bilateral.cpp index 5c883801a92..6ee8e6c409c 100644 --- a/modules/fastcv/test/test_bilateral.cpp +++ b/modules/fastcv/test/test_bilateral.cpp @@ -12,13 +12,13 @@ 
typedef testing::TestWithParam> fcv_bilateralFilterTest; TEST_P(fcv_bilateralFilterTest, accuracy) { cv::Size size = get<0>(GetParam()); - int d = get<1>(GetParam()); + int d = get<1>(GetParam()); double sigmaColor = get<2>(GetParam()); double sigmaSpace = sigmaColor; RNG& rng = cv::theRNG(); Mat src(size, CV_8UC1); - cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(256)); + cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255)); cv::Mat dst; diff --git a/modules/fastcv/test/test_moments.cpp b/modules/fastcv/test/test_moments.cpp index d4ef89f98db..13e245f749d 100644 --- a/modules/fastcv/test/test_moments.cpp +++ b/modules/fastcv/test/test_moments.cpp @@ -15,22 +15,29 @@ TEST_P(fcv_momentsTest, accuracy) const Size srcSize = get<1>(GetParam()); const MatDepth srcType = get<2>(GetParam()); Mat src(srcSize, srcType); - - for(int j = 0; j < srcSize.width; ++j) - for(int i = 0; i < srcSize.height; ++i) - { - if(srcType == CV_8UC1) - src.at(i, j) = cv::randu(); - else if(srcType == CV_32SC1) - src.at(i, j) = cv::randu(); - else if(srcType == CV_32FC1) - src.at(i, j) = cv::randu(); - } - - cv::Moments m = cv::fastcv::moments(src, binaryImage); - - int len_m = sizeof(m)/sizeof(m.m00); - EXPECT_FALSE(len_m != 24); + cv::RNG& rng = cv::theRNG(); + if(srcType == CV_8UC1) + rng.fill(src, cv::RNG::UNIFORM, 0, 5); + else if(srcType == CV_32SC1) + rng.fill(src, cv::RNG::UNIFORM, 0, 5); + else if(srcType == CV_32FC1) + rng.fill(src, cv::RNG::UNIFORM, 0.f, 5.f); + + cv::Moments m = cv::fastcv::moments(src, binaryImage); + + cv::Scalar mean_val, stdDev; + float mean_val_fcv = m.m00/(srcSize.width * srcSize.height); + if(binaryImage) + { + cv::Mat src_binary(srcSize, CV_8UC1); + cv::compare( src, 0, src_binary, cv::CMP_NE ); + mean_val = cv::mean(src_binary); + mean_val_fcv *= 255; + } + else + mean_val = cv::mean(src); + + EXPECT_NEAR(mean_val[0], mean_val_fcv, 2); } INSTANTIATE_TEST_CASE_P(/*nothing*/, fcv_momentsTest, Combine( From 
80d468872abd1961875acc8c026bfb7b5f6d8632 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Fri, 20 Dec 2024 14:51:04 +0300 Subject: [PATCH 10/14] Fixed Java and Python bindings generation. --- .../fastcv/include/opencv2/fastcv/hough.hpp | 21 ------ .../fastcv/include/opencv2/fastcv/mser.hpp | 30 ++++---- .../fastcv/include/opencv2/fastcv/thresh.hpp | 2 +- modules/fastcv/perf/perf_mser.cpp | 2 +- modules/fastcv/src/mser.cpp | 69 ++++++++++--------- modules/fastcv/src/thresh.cpp | 7 +- modules/fastcv/test/test_mser.cpp | 2 +- 7 files changed, 59 insertions(+), 74 deletions(-) diff --git a/modules/fastcv/include/opencv2/fastcv/hough.hpp b/modules/fastcv/include/opencv2/fastcv/hough.hpp index e43323903cb..5e01576a0f0 100644 --- a/modules/fastcv/include/opencv2/fastcv/hough.hpp +++ b/modules/fastcv/include/opencv2/fastcv/hough.hpp @@ -25,27 +25,6 @@ namespace fastcv { */ CV_EXPORTS_W void houghLines(InputArray src, OutputArray lines, double threshold = 0.25); - -/** - * @brief Finds circles in a grayscale image using Hough transform. - * The radius of circle varies from 0 to max(srcWidth, srcHeight). - * - * @param src Input 8-bit image containing binary contour. Step should be divisible by 8, data start should be 128-bit aligned - * @param circles Output array containing detected circles in a form (x, y, r) where all numbers are 32-bit integers - * @param minDist Minimum distance between the centers of the detected circles - * @param cannyThreshold The higher threshold of the two passed to the Canny() edge detector - * (the lower one is twice smaller). Default is 100. - * @param accThreshold The accumulator threshold for the circle centers at the detection - * stage. The smaller it is, the more false circles may be detected. - * Circles, corresponding to the larger accumulator values, will be - * returned first. Default is 100. 
- * @param minRadius Minimum circle radius, default is 0 - * @param maxRadius Maximum circle radius, default is 0 - */ -CV_EXPORTS_W void houghCircles(InputArray src, OutputArray circles, uint32_t minDist, - uint32_t cannyThreshold = 100, uint32_t accThreshold = 100, - uint32_t minRadius = 0, uint32_t maxRadius = 0); - //! @} } // fastcv:: diff --git a/modules/fastcv/include/opencv2/fastcv/mser.hpp b/modules/fastcv/include/opencv2/fastcv/mser.hpp index bfa898544f5..249c0e14e2b 100644 --- a/modules/fastcv/include/opencv2/fastcv/mser.hpp +++ b/modules/fastcv/include/opencv2/fastcv/mser.hpp @@ -52,13 +52,13 @@ class CV_EXPORTS_W FCVMSER Typical value range [0.1 1.0], typical value 0.2 * @return Feature detector object ready for detection */ - CV_WRAP static Ptr create( cv::Size imgSize, - uint32_t numNeighbors = 4, - uint32_t delta = 2, - uint32_t minArea = 30, - uint32_t maxArea = 14400, - float maxVariation = 0.15f, - float minDiversity = 0.2f); + CV_WRAP static Ptr create( const cv::Size& imgSize, + int numNeighbors = 4, + int delta = 2, + int minArea = 30, + int maxArea = 14400, + float maxVariation = 0.15f, + float minDiversity = 0.2f); /** * @brief This is an overload for detect() function @@ -95,15 +95,15 @@ class CV_EXPORTS_W FCVMSER * @param contourData Array containing additional information about found contours */ virtual void detect(InputArray src, std::vector>& contours, std::vector& boundingBoxes, - std::vector& contourData) = 0; + std::vector& contourData) = 0; - CV_WRAP virtual cv::Size getImgSize() = 0; - CV_WRAP virtual uint32_t getNumNeighbors() = 0; - CV_WRAP virtual uint32_t getDelta() = 0; - CV_WRAP virtual uint32_t getMinArea() = 0; - CV_WRAP virtual uint32_t getMaxArea() = 0; - CV_WRAP virtual float getMaxVariation() = 0; - CV_WRAP virtual float getMinDiversity() = 0; + CV_WRAP virtual cv::Size getImgSize() = 0; + CV_WRAP virtual int getNumNeighbors() = 0; + CV_WRAP virtual int getDelta() = 0; + CV_WRAP virtual int getMinArea() = 0; + CV_WRAP 
virtual int getMaxArea() = 0; + CV_WRAP virtual float getMaxVariation() = 0; + CV_WRAP virtual float getMinDiversity() = 0; virtual ~FCVMSER() {} }; diff --git a/modules/fastcv/include/opencv2/fastcv/thresh.hpp b/modules/fastcv/include/opencv2/fastcv/thresh.hpp index 418f98a012d..ab6a847dcb9 100644 --- a/modules/fastcv/include/opencv2/fastcv/thresh.hpp +++ b/modules/fastcv/include/opencv2/fastcv/thresh.hpp @@ -27,7 +27,7 @@ namespace fastcv { * @param falseValue The value assigned to the destination pixel if the source is out of the range inclusively defined by the * pair of threshold values */ -CV_EXPORTS_W void thresholdRange(InputArray src, OutputArray dst, uint8_t lowThresh, uint8_t highThresh, uint8_t trueValue, uint8_t falseValue); +CV_EXPORTS_W void thresholdRange(InputArray src, OutputArray dst, int lowThresh, int highThresh, int trueValue, int falseValue); //! @} diff --git a/modules/fastcv/perf/perf_mser.cpp b/modules/fastcv/perf/perf_mser.cpp index 7232cd47cb4..36f876cd045 100644 --- a/modules/fastcv/perf/perf_mser.cpp +++ b/modules/fastcv/perf/perf_mser.cpp @@ -38,7 +38,7 @@ PERF_TEST_P(MSERPerfTest, run, cv::Ptr mser; mser = cv::fastcv::FCVMSER::create(src.size(), numNeighbors, delta, minArea, maxArea, - maxVariation, minDiversity); + maxVariation, minDiversity); while(next()) { diff --git a/modules/fastcv/src/mser.cpp b/modules/fastcv/src/mser.cpp index 6919099a482..a44cecae073 100644 --- a/modules/fastcv/src/mser.cpp +++ b/modules/fastcv/src/mser.cpp @@ -12,22 +12,22 @@ class MSER_Impl CV_FINAL : public cv::fastcv::FCVMSER { public: explicit MSER_Impl(cv::Size imgSize, - uint32_t numNeighbors, - uint32_t delta, - uint32_t minArea, - uint32_t maxArea, - float maxVariation, - float minDiversity); + int numNeighbors, + int delta, + int minArea, + int maxArea, + float maxVariation, + float minDiversity); ~MSER_Impl() CV_OVERRIDE; - cv::Size getImgSize() CV_OVERRIDE { return imgSize; }; - uint32_t getNumNeighbors() CV_OVERRIDE { return numNeighbors; }; - 
uint32_t getDelta() CV_OVERRIDE { return delta; }; - uint32_t getMinArea() CV_OVERRIDE { return minArea; }; - uint32_t getMaxArea() CV_OVERRIDE { return maxArea; }; - float getMaxVariation() CV_OVERRIDE { return maxVariation; }; - float getMinDiversity() CV_OVERRIDE { return minDiversity; }; + cv::Size getImgSize() CV_OVERRIDE { return imgSize; }; + int getNumNeighbors() CV_OVERRIDE { return numNeighbors; }; + int getDelta() CV_OVERRIDE { return delta; }; + int getMinArea() CV_OVERRIDE { return minArea; }; + int getMaxArea() CV_OVERRIDE { return maxArea; }; + float getMaxVariation() CV_OVERRIDE { return maxVariation; }; + float getMinDiversity() CV_OVERRIDE { return minDiversity; }; void detect(InputArray src, std::vector>& contours) CV_OVERRIDE; void detect(InputArray src, std::vector>& contours, std::vector& boundingBoxes) CV_OVERRIDE; @@ -42,24 +42,24 @@ class MSER_Impl CV_FINAL : public cv::fastcv::FCVMSER bool useContourData = true); cv::Size imgSize; - uint32_t numNeighbors; - uint32_t delta; - uint32_t minArea; - uint32_t maxArea; - float maxVariation; - float minDiversity; + int numNeighbors; + int delta; + int minArea; + int maxArea; + float maxVariation; + float minDiversity; void *mserHandle; }; -MSER_Impl::MSER_Impl(cv::Size _imgSize, - uint32_t _numNeighbors, - uint32_t _delta, - uint32_t _minArea, - uint32_t _maxArea, - float _maxVariation, - float _minDiversity) +MSER_Impl::MSER_Impl(cv::Size _imgSize, + int _numNeighbors, + int _delta, + int _minArea, + int _maxArea, + float _maxVariation, + float _minDiversity) { CV_Assert(_imgSize.width > 50); CV_Assert(_imgSize.height > 5); @@ -244,16 +244,17 @@ void MSER_Impl::detect(InputArray src, std::vector>& contours this->detectRegions(src, contours, boundingBoxes, contourData, /*useBoundingBoxes*/ true, /*useContourData*/ true); } -Ptr FCVMSER::create(cv::Size imgSize, - uint32_t numNeighbors, - uint32_t delta, - uint32_t minArea, - uint32_t maxArea, - float maxVariation, - float minDiversity) +Ptr 
FCVMSER::create(const cv::Size& imgSize, + int numNeighbors, + int delta, + int minArea, + int maxArea, + float maxVariation, + float minDiversity) { + CV_Assert(numNeighbors > 0 && delta >= 0 && minArea >= 0 && maxArea >= 0); return makePtr(imgSize, numNeighbors, delta, minArea, maxArea, maxVariation, minDiversity); } } // fastcv:: -} // cv:: \ No newline at end of file +} // cv:: diff --git a/modules/fastcv/src/thresh.cpp b/modules/fastcv/src/thresh.cpp index c97a3656039..5f5c95537e0 100644 --- a/modules/fastcv/src/thresh.cpp +++ b/modules/fastcv/src/thresh.cpp @@ -8,10 +8,15 @@ namespace cv { namespace fastcv { -void thresholdRange(InputArray _src, OutputArray _dst, uint8_t lowThresh, uint8_t highThresh, uint8_t trueValue, uint8_t falseValue) +void thresholdRange(InputArray _src, OutputArray _dst, int lowThresh, int highThresh, int trueValue, int falseValue) { INITIALIZATION_CHECK; + CV_Assert(lowThresh >= 0 && lowThresh < 256); + CV_Assert(highThresh >= 0 && highThresh < 256); + CV_Assert(falseValue >= 0 && falseValue < 256); + CV_Assert(trueValue >= 0 && trueValue < 256); + CV_Assert(lowThresh <= highThresh); CV_Assert(!_src.empty() && _src.type() == CV_8UC1); diff --git a/modules/fastcv/test/test_mser.cpp b/modules/fastcv/test/test_mser.cpp index 29cae5808a7..d3cb35bf47e 100644 --- a/modules/fastcv/test/test_mser.cpp +++ b/modules/fastcv/test/test_mser.cpp @@ -175,4 +175,4 @@ INSTANTIATE_TEST_CASE_P(FastCV_Extension, MSERTest, ::testing::Values("cv/shared/baboon.png", "cv/mser/puzzle.png") ) ); -}} // namespaces opencv_test, :: \ No newline at end of file +}} // namespaces opencv_test, :: From 57e0e44a019ce16dea59f6587a14a90eb37cc2f9 Mon Sep 17 00:00:00 2001 From: Peter Rekdal Khan-Sunde Date: Fri, 3 Jan 2025 19:33:12 +0100 Subject: [PATCH 11/14] Split 4D function pointer array into per-surface-format arrays for GCC 8 compatibility. 
--- ...idia_surface_format_to_color_converter.cpp | 131 +++++++++++++++--- 1 file changed, 108 insertions(+), 23 deletions(-) diff --git a/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp b/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp index e22549e2296..ff9aa5708c4 100644 --- a/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp +++ b/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp @@ -142,36 +142,101 @@ class NVSurfaceToColorConverterImpl : public NVSurfaceToColorConverter { const bool yuv420 = surfaceFormat == SurfaceFormat::SF_NV12 || surfaceFormat == SurfaceFormat::SF_P016; CV_Assert(yuv.cols() % 2 == 0); - typedef void (*func_t)(uint8_t* yuv, int yuvPitch, uint8_t* color, int colorPitch, int width, int height, bool videoFullRangeFlag, cudaStream_t stream); - static const func_t funcs[4][5][2][2] = + using func_t = void (*)(uint8_t* yuv, int yuvPitch, uint8_t* color, int colorPitch, int width, int height, bool videoFullRangeFlag, cudaStream_t stream); + + static const func_t funcsNV12[5][2][2] = + { + { + { Nv12ToColor24, Nv12ToColorPlanar24 }, + { Nv12ToColor48, Nv12ToColorPlanar48 } + }, + { + { Nv12ToColor24, Nv12ToColorPlanar24 }, + { Nv12ToColor48, Nv12ToColorPlanar48 } + }, + { + { Nv12ToColor32, Nv12ToColorPlanar32 }, + { Nv12ToColor64, Nv12ToColorPlanar64 } + }, + { + { Nv12ToColor32, Nv12ToColorPlanar32 }, + { Nv12ToColor64, Nv12ToColorPlanar64 } + }, + { + { Y8ToGray8, Y8ToGray8 }, + { Y8ToGray16, Y8ToGray16 } + } + }; + + static const func_t funcsP016[5][2][2] = + { + { + { P016ToColor24, P016ToColorPlanar24 }, + { P016ToColor48, P016ToColorPlanar48 } + }, + { + { P016ToColor24, P016ToColorPlanar24 }, + { P016ToColor48, P016ToColorPlanar48 } + }, + { + { P016ToColor32, P016ToColorPlanar32 }, + { P016ToColor64, P016ToColorPlanar64 } + }, + { + { P016ToColor32, P016ToColorPlanar32 }, + { P016ToColor64, P016ToColorPlanar64 } + }, + { + { Y16ToGray8, Y16ToGray8 }, + { Y16ToGray16, 
Y16ToGray16 } + } + }; + + static const func_t funcsYUV444[5][2][2] = { { - {{{Nv12ToColor24},{Nv12ToColorPlanar24}},{{Nv12ToColor48},{Nv12ToColorPlanar48}}}, - {{{Nv12ToColor24},{Nv12ToColorPlanar24}},{{Nv12ToColor48},{Nv12ToColorPlanar48}}}, - {{{Nv12ToColor32},{Nv12ToColorPlanar32}},{{Nv12ToColor64},{Nv12ToColorPlanar64}}}, - {{{Nv12ToColor32},{Nv12ToColorPlanar32}},{{Nv12ToColor64},{Nv12ToColorPlanar64}}}, - {{{Y8ToGray8},{Y8ToGray8}},{{Y8ToGray16},{Y8ToGray16}}} + { YUV444ToColor24, YUV444ToColorPlanar24 }, + { YUV444ToColor48, YUV444ToColorPlanar48 } + }, + { + { YUV444ToColor24, YUV444ToColorPlanar24 }, + { YUV444ToColor48, YUV444ToColorPlanar48 } }, { - {{{P016ToColor24},{P016ToColorPlanar24}},{{P016ToColor48},{P016ToColorPlanar48}}}, - {{{P016ToColor24},{P016ToColorPlanar24}},{{P016ToColor48},{P016ToColorPlanar48}}}, - {{{P016ToColor32},{P016ToColorPlanar32}},{{P016ToColor64},{P016ToColorPlanar64}}}, - {{{P016ToColor32},{P016ToColorPlanar32}},{{P016ToColor64},{P016ToColorPlanar64}}}, - {{{Y16ToGray8},{Y16ToGray8}},{{Y16ToGray16},{Y16ToGray16}}} + { YUV444ToColor32, YUV444ToColorPlanar32 }, + { YUV444ToColor64, YUV444ToColorPlanar64 } }, { - {{{YUV444ToColor24},{YUV444ToColorPlanar24}},{{YUV444ToColor48},{YUV444ToColorPlanar48}}}, - {{{YUV444ToColor24},{YUV444ToColorPlanar24}},{{YUV444ToColor48},{YUV444ToColorPlanar48}}}, - {{{YUV444ToColor32},{YUV444ToColorPlanar32}},{{YUV444ToColor64},{YUV444ToColorPlanar64}}}, - {{{YUV444ToColor32},{YUV444ToColorPlanar32}},{{YUV444ToColor64},{YUV444ToColorPlanar64}}}, - {{{Y8ToGray8},{Y8ToGray8}},{{Y8ToGray16},{Y8ToGray16}}} + { YUV444ToColor32, YUV444ToColorPlanar32 }, + { YUV444ToColor64, YUV444ToColorPlanar64 } }, { - {{{YUV444P16ToColor24},{YUV444P16ToColorPlanar24}},{{YUV444P16ToColor48},{YUV444P16ToColorPlanar48}}}, - {{{YUV444P16ToColor24},{YUV444P16ToColorPlanar24}},{{YUV444P16ToColor48},{YUV444P16ToColorPlanar48}}}, - 
{{{YUV444P16ToColor32},{YUV444P16ToColorPlanar32}},{{YUV444P16ToColor64},{YUV444P16ToColorPlanar64}}}, - {{{YUV444P16ToColor32},{YUV444P16ToColorPlanar32}},{{YUV444P16ToColor64},{YUV444P16ToColorPlanar64}}}, - {{{Y16ToGray8},{Y16ToGray8}},{{Y16ToGray16},{Y16ToGray16}}} + { Y8ToGray8, Y8ToGray8 }, + { Y8ToGray16, Y8ToGray16 } + } + }; + + static const func_t funcsYUV444P16[5][2][2] = + { + { + { YUV444P16ToColor24, YUV444P16ToColorPlanar24 }, + { YUV444P16ToColor48, YUV444P16ToColorPlanar48 } + }, + { + { YUV444P16ToColor24, YUV444P16ToColorPlanar24 }, + { YUV444P16ToColor48, YUV444P16ToColorPlanar48 } + }, + { + { YUV444P16ToColor32, YUV444P16ToColorPlanar32 }, + { YUV444P16ToColor64, YUV444P16ToColorPlanar64 } + }, + { + { YUV444P16ToColor32, YUV444P16ToColorPlanar32 }, + { YUV444P16ToColor64, YUV444P16ToColorPlanar64 } + }, + { + { Y16ToGray8, Y16ToGray8 }, + { Y16ToGray16, Y16ToGray16 } } }; @@ -183,11 +248,31 @@ class NVSurfaceToColorConverterImpl : public NVSurfaceToColorConverter { const int nChannels = NumChannels(outputFormat); const int nRowsOut = nRows * (planar ? nChannels : 1); const BitDepth bitDepth_ = GetBitDepthOut(bitDepth, yuv.depth()); + const int iBitDepth = bitDepth_ == BitDepth::EIGHT ? 0 : 1; const int typeOut = CV_MAKE_TYPE(bitDepth_ == BitDepth::EIGHT ? CV_8U : CV_16U, planar ? 1 : nChannels); GpuMat out_ = getOutputMat(out, nRowsOut, yuv.cols(), typeOut, stream); + const int iSurfaceFormat = static_cast(surfaceFormat); + const int iPlanar = planar ? 
1 : 0; const int iOutputFormat = OutputColorFormatIdx(outputFormat); - const func_t func = funcs[static_cast(surfaceFormat)][iOutputFormat][static_cast(bitDepth_)][planar]; + func_t func = nullptr; + + switch (iSurfaceFormat) + { + case 0: + func = funcsNV12[iOutputFormat][iBitDepth][iPlanar]; + break; + case 1: + func = funcsP016[iOutputFormat][iBitDepth][iPlanar]; + break; + case 2: + func = funcsYUV444[iOutputFormat][iBitDepth][iPlanar]; + break; + case 3: + func = funcsYUV444P16[iOutputFormat][iBitDepth][iPlanar]; + break; + } + if (!func) CV_Error(Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); From 6131706f251ae09bf4c4e09c236c2d5fb3d127b6 Mon Sep 17 00:00:00 2001 From: cudawarped <12133430+cudawarped@users.noreply.github.com> Date: Mon, 30 Dec 2024 13:59:55 +0200 Subject: [PATCH 12/14] cudacodec - Add 10 bit YUV420 and YUV444 encoding functionality --- .../cudacodec/include/opencv2/cudacodec.hpp | 2 + modules/cudacodec/src/ffmpeg_video_source.cpp | 19 ------ modules/cudacodec/src/video_decoder.cpp | 42 ++++++++++++- modules/cudacodec/src/video_writer.cpp | 12 ++-- modules/cudacodec/test/test_video.cpp | 61 +++++++++++++++++++ 5 files changed, 110 insertions(+), 26 deletions(-) diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp index a0c039189e9..307fa79edae 100644 --- a/modules/cudacodec/include/opencv2/cudacodec.hpp +++ b/modules/cudacodec/include/opencv2/cudacodec.hpp @@ -106,6 +106,8 @@ enum ColorFormat { NV_IYUV = 9, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by U and V planes]. VideoWriter only. NV_YUV444 = 10, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by U and V planes]. VideoWriter only. NV_AYUV = 11, //!< Nvidia Buffer Format - 8 bit Packed A8Y8U8V8. 
This is a word-ordered format where a pixel is represented by a 32-bit word with V in the lowest 8 bits, U in the next 8 bits, Y in the 8 bits after that and A in the highest 8 bits. VideoWriter only. + NV_YUV420_10BIT = 12, //!< Nvidia Buffer Format - 10 bit Semi-Planar YUV [Y plane followed by interleaved UV plane]. Each pixel of size 2 bytes. Most Significant 10 bits contain pixel data. VideoWriter only. + NV_YUV444_10BIT = 13, //!< Nvidia Buffer Format - 10 bit Planar YUV444 [Y plane followed by U and V planes]. Each pixel of size 2 bytes. Most Significant 10 bits contain pixel data. VideoWriter only. #ifndef CV_DOXYGEN PROP_NOT_SUPPORTED #endif diff --git a/modules/cudacodec/src/ffmpeg_video_source.cpp b/modules/cudacodec/src/ffmpeg_video_source.cpp index 6296383db63..aa92889d55e 100644 --- a/modules/cudacodec/src/ffmpeg_video_source.cpp +++ b/modules/cudacodec/src/ffmpeg_video_source.cpp @@ -89,23 +89,6 @@ Codec FourccToCodec(int codec) CV_Error(Error::StsUnsupportedFormat, msg); } -static -void FourccToChromaFormat(const int pixelFormat, ChromaFormat &chromaFormat, int & nBitDepthMinus8) -{ - switch (pixelFormat) - { - case CV_FOURCC_MACRO('I', '4', '2', '0'): - chromaFormat = YUV420; - nBitDepthMinus8 = 0; - break; - default: - CV_LOG_WARNING(NULL, cv::format("ChromaFormat not recognized: 0x%08X (%s). 
Assuming I420", pixelFormat, fourccToString(pixelFormat).c_str())); - chromaFormat = YUV420; - nBitDepthMinus8 = 0; - break; - } -} - static int StartCodeLen(unsigned char* data, const int sz) { if (sz >= 3 && data[0] == 0 && data[1] == 0 && data[2] == 1) @@ -145,14 +128,12 @@ cv::cudacodec::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname, extraData = tmpExtraData.clone(); int codec = (int)cap.get(CAP_PROP_FOURCC); - int pixelFormat = (int)cap.get(CAP_PROP_CODEC_PIXEL_FORMAT); format_.codec = FourccToCodec(codec); format_.height = cap.get(CAP_PROP_FRAME_HEIGHT); format_.width = cap.get(CAP_PROP_FRAME_WIDTH); format_.displayArea = Rect(0, 0, format_.width, format_.height); format_.valid = false; format_.fps = cap.get(CAP_PROP_FPS); - FourccToChromaFormat(pixelFormat, format_.chromaFormat, format_.nBitDepthMinus8); } cv::cudacodec::detail::FFmpegVideoSource::~FFmpegVideoSource() diff --git a/modules/cudacodec/src/video_decoder.cpp b/modules/cudacodec/src/video_decoder.cpp index e156e25a705..0377767f025 100644 --- a/modules/cudacodec/src/video_decoder.cpp +++ b/modules/cudacodec/src/video_decoder.cpp @@ -45,7 +45,44 @@ #ifdef HAVE_NVCUVID -#if (CUDART_VERSION < 9000) +#if (CUDART_VERSION >= 9000) +static const char* GetVideoCodecString(cudaVideoCodec eCodec) { + static struct { + cudaVideoCodec eCodec; + const char* name; + } aCodecName[] = { + { cudaVideoCodec_MPEG1, "MPEG-1" }, + { cudaVideoCodec_MPEG2, "MPEG-2" }, + { cudaVideoCodec_MPEG4, "MPEG-4 (ASP)" }, + { cudaVideoCodec_VC1, "VC-1/WMV" }, + { cudaVideoCodec_H264, "AVC/H.264" }, + { cudaVideoCodec_JPEG, "M-JPEG" }, + { cudaVideoCodec_H264_SVC, "H.264/SVC" }, + { cudaVideoCodec_H264_MVC, "H.264/MVC" }, + { cudaVideoCodec_HEVC, "H.265/HEVC" }, + { cudaVideoCodec_VP8, "VP8" }, + { cudaVideoCodec_VP9, "VP9" }, + { cudaVideoCodec_AV1, "AV1" }, + { cudaVideoCodec_NumCodecs, "Invalid" }, + { cudaVideoCodec_YUV420, "YUV 4:2:0" }, + { cudaVideoCodec_YV12, "YV12 4:2:0" }, + { cudaVideoCodec_NV12, "NV12 
4:2:0" }, + { cudaVideoCodec_YUYV, "YUYV 4:2:2" }, + { cudaVideoCodec_UYVY, "UYVY 4:2:2" }, + }; + + if (eCodec >= 0 && eCodec <= cudaVideoCodec_NumCodecs) { + return aCodecName[eCodec].name; + } + for (int i = cudaVideoCodec_NumCodecs + 1; i < sizeof(aCodecName) / sizeof(aCodecName[0]); i++) { + if (eCodec == aCodecName[i].eCodec) { + return aCodecName[eCodec].name; + } + } + return "Unknown"; +} +#endif + static const char* GetVideoChromaFormatString(cudaVideoChromaFormat eChromaFormat) { static struct { cudaVideoChromaFormat eChromaFormat; @@ -62,7 +99,6 @@ static const char* GetVideoChromaFormatString(cudaVideoChromaFormat eChromaForma } return "Unknown"; } -#endif void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat) { @@ -141,7 +177,7 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat) cuSafeCall(cuCtxPopCurrent(NULL)); if (!decodeCaps.bIsSupported) { - CV_Error(Error::StsUnsupportedFormat, "Video codec is not supported by this GPU hardware video decoder refer to Nvidia's GPU Support Matrix to confirm your GPU supports hardware decoding of the video source's codec."); + CV_Error(Error::StsUnsupportedFormat, std::to_string(decodeCaps.nBitDepthMinus8 + 8) + " bit " + GetVideoCodecString(_codec) + " with " + GetVideoChromaFormatString(_chromaFormat) + " chroma format is not supported by this GPU hardware video decoder. 
Please refer to Nvidia's GPU Support Matrix to confirm your GPU supports hardware decoding of this video source."); } if (!(decodeCaps.nOutputFormatMask & (1 << surfaceFormat))) diff --git a/modules/cudacodec/src/video_writer.cpp b/modules/cudacodec/src/video_writer.cpp index d702acb0bfc..5bb1a533faf 100644 --- a/modules/cudacodec/src/video_writer.cpp +++ b/modules/cudacodec/src/video_writer.cpp @@ -188,6 +188,8 @@ NV_ENC_BUFFER_FORMAT EncBufferFormat(const ColorFormat colorFormat) { case ColorFormat::NV_IYUV: return NV_ENC_BUFFER_FORMAT_IYUV; case ColorFormat::NV_YUV444: return NV_ENC_BUFFER_FORMAT_YUV444; case ColorFormat::NV_AYUV: return NV_ENC_BUFFER_FORMAT_AYUV; + case ColorFormat::NV_YUV420_10BIT: return NV_ENC_BUFFER_FORMAT_YUV420_10BIT; + case ColorFormat::NV_YUV444_10BIT: return NV_ENC_BUFFER_FORMAT_YUV444_10BIT; default: return NV_ENC_BUFFER_FORMAT_UNDEFINED; } } @@ -195,15 +197,17 @@ NV_ENC_BUFFER_FORMAT EncBufferFormat(const ColorFormat colorFormat) { int NChannels(const ColorFormat colorFormat) { switch (colorFormat) { case ColorFormat::BGR: - case ColorFormat::RGB: - case ColorFormat::NV_IYUV: - case ColorFormat::NV_YUV444: return 3; + case ColorFormat::RGB: return 3; case ColorFormat::RGBA: case ColorFormat::BGRA: case ColorFormat::NV_AYUV: return 4; case ColorFormat::GRAY: case ColorFormat::NV_NV12: - case ColorFormat::NV_YV12: return 1; + case ColorFormat::NV_IYUV: + case ColorFormat::NV_YV12: + case ColorFormat::NV_YUV420_10BIT: + case ColorFormat::NV_YUV444: + case ColorFormat::NV_YUV444_10BIT: return 1; default: return 0; } } diff --git a/modules/cudacodec/test/test_video.cpp b/modules/cudacodec/test/test_video.cpp index 1158a3f6201..29f25b2ca1e 100644 --- a/modules/cudacodec/test/test_video.cpp +++ b/modules/cudacodec/test/test_video.cpp @@ -1065,6 +1065,67 @@ CUDA_TEST_P(H264ToH265, Transcode) INSTANTIATE_TEST_CASE_P(CUDA_Codec, H264ToH265, ALL_DEVICES); +CV_ENUM(YuvColorFormats, cudacodec::ColorFormat::NV_YUV444, 
cudacodec::ColorFormat::NV_YUV420_10BIT, cudacodec::ColorFormat::NV_YUV444_10BIT) +PARAM_TEST_CASE(YUVFormats, cv::cuda::DeviceInfo, YuvColorFormats) +{ +}; + +CUDA_TEST_P(YUVFormats, Transcode) +{ + cv::cuda::setDevice(GET_PARAM(0).deviceID()); + const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.h265"; + const cv::cudacodec::ColorFormat writerColorFormat = static_cast(static_cast(GET_PARAM(1))); + constexpr double fps = 25; + const cudacodec::Codec codec = cudacodec::Codec::HEVC; + const std::string ext = ".mp4"; + const std::string outputFile = cv::tempfile(ext.c_str()); + constexpr int nFrames = 5; + vector bgrGs; + { + VideoCapture cap(inputFile); + cv::Ptr writer; + Mat frame, yuv, bgr; + cv::cudacodec::EncoderParams params; + params.tuningInfo = cv::cudacodec::EncodeTuningInfo::ENC_TUNING_INFO_LOSSLESS; + params.rateControlMode = cv::cudacodec::EncodeParamsRcMode::ENC_PARAMS_RC_CONSTQP; + for (int i = 0; i < nFrames; ++i) { + ASSERT_TRUE(cap.read(frame)); + ASSERT_FALSE(frame.empty()); + cudacodec::SurfaceFormat yuvFormat = cudacodec::SurfaceFormat::SF_YUV444; + cudacodec::BitDepth bitDepth = cudacodec::BitDepth::EIGHT; + if (writerColorFormat == cudacodec::ColorFormat::NV_YUV444_10BIT) { + yuvFormat = cudacodec::SurfaceFormat::SF_YUV444_16Bit; + bitDepth = cudacodec::BitDepth::SIXTEEN; + } + else if (writerColorFormat == cudacodec::ColorFormat::NV_YUV420_10BIT){ + yuvFormat = cudacodec::SurfaceFormat::SF_P016; + bitDepth = cudacodec::BitDepth::SIXTEEN; + } + generateTestImages(frame, yuv, bgr, yuvFormat, cudacodec::ColorFormat::BGR, bitDepth, false); + bgrGs.push_back(bgr.clone()); + if (writer.empty()) + writer = cv::cudacodec::createVideoWriter(outputFile, frame.size(), codec, fps, writerColorFormat, params); + writer->write(yuv); + } + } + + { + cv::Ptr reader = cv::cudacodec::createVideoReader(outputFile); + reader->set(cudacodec::ColorFormat::BGR); + cv::cuda::GpuMat frame, frameGs; + Mat 
frameHost, frameGsHost; + for (int i = 0; i < nFrames; ++i) { + ASSERT_TRUE(reader->nextFrame(frame)); + frame.download(frameHost); + frameGsHost = bgrGs[i]; + const int diff = writerColorFormat == cudacodec::ColorFormat::NV_YUV420_10BIT || writerColorFormat == cudacodec::ColorFormat::NV_YUV444_10BIT ? 512 : 1; + EXPECT_MAT_NEAR(frameHost, frameGsHost, diff); + } + } + ASSERT_EQ(0, remove(outputFile.c_str())); +} + +INSTANTIATE_TEST_CASE_P(CUDA_Codec, YUVFormats, testing::Combine(ALL_DEVICES, YuvColorFormats::all())); #endif #if defined(HAVE_NVCUVENC) From 5a6d80a279b2e4673ee2beb800d9be36d4343087 Mon Sep 17 00:00:00 2001 From: Skreg <85214856+shyama7004@users.noreply.github.com> Date: Wed, 8 Jan 2025 19:29:59 +0530 Subject: [PATCH 13/14] Merge pull request #3864 from shyama7004:fix-unused-variable Fix Unused Variable Warning in test_rotation_and_scale_invariance.cpp #3864 ### Description: This PR resolves a warning caused by an unused variable `IMAGE_BIKES` in the file `test_rotation_and_scale_invariance.cpp` within the `xfeatures2d` module of the OpenCV contrib repository. - **Issue:** The variable `IMAGE_BIKES` was declared but not used in the test file, leading to a compiler warning (`-Wunused-const-variable`). ``` warning: unused variable 'IMAGE_BIKES' [-Wunused-const-variable] static const char* const IMAGE_BIKES = "detectors_descriptors_evaluation/ ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [ ] The feature is well documented and sample code can be built with the project CMake --- .../xfeatures2d/test/test_rotation_and_scale_invariance.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/xfeatures2d/test/test_rotation_and_scale_invariance.cpp b/modules/xfeatures2d/test/test_rotation_and_scale_invariance.cpp index ce7281d6c06..4ec9a573500 100644 --- a/modules/xfeatures2d/test/test_rotation_and_scale_invariance.cpp +++ b/modules/xfeatures2d/test/test_rotation_and_scale_invariance.cpp @@ -10,8 +10,9 @@ namespace opencv_test { namespace { static const char* const IMAGE_TSUKUBA = "features2d/tsukuba.png"; +#ifdef OPENCV_ENABLE_NONFREE static const char* const IMAGE_BIKES = "detectors_descriptors_evaluation/images_datasets/bikes/img1.png"; - +#endif // OPENCV_ENABLE_NONFREE // ========================== ROTATION INVARIANCE ============================= #ifdef OPENCV_ENABLE_NONFREE From 9060b318e5201e94187fd68a152d56eacf068931 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Wed, 8 Jan 2025 18:48:23 +0300 Subject: [PATCH 14/14] xFeatures2d build fix. 
--- modules/xfeatures2d/test/test_rotation_and_scale_invariance.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/xfeatures2d/test/test_rotation_and_scale_invariance.cpp b/modules/xfeatures2d/test/test_rotation_and_scale_invariance.cpp index 4ec9a573500..5eef3828c14 100644 --- a/modules/xfeatures2d/test/test_rotation_and_scale_invariance.cpp +++ b/modules/xfeatures2d/test/test_rotation_and_scale_invariance.cpp @@ -10,7 +10,7 @@ namespace opencv_test { namespace { static const char* const IMAGE_TSUKUBA = "features2d/tsukuba.png"; -#ifdef OPENCV_ENABLE_NONFREE +#if defined(OPENCV_ENABLE_NONFREE) || defined (OPENCV_XFEATURES2D_HAS_VGG_DATA) static const char* const IMAGE_BIKES = "detectors_descriptors_evaluation/images_datasets/bikes/img1.png"; #endif // OPENCV_ENABLE_NONFREE // ========================== ROTATION INVARIANCE =============================