diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp index 2284dbee5..db1492911 100644 --- a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp +++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp @@ -30,7 +30,7 @@ std::map codecMap = { {"vp8", cudaVideoCodec_VP8}, {"vp9", cudaVideoCodec_VP9}, {"av1", cudaVideoCodec_AV1}}; /** - * @brief Function to decode media file using OptimizedNvDecoder interface + * @brief Function to decode video file using OptimizedNvDecoder interface * @param pDec - Handle to OptimizedNvDecoder * @param demuxer - Pointer to an FFmpegDemuxer instance * @param pnFrame - Variable to record the number of frames decoded @@ -41,13 +41,13 @@ void DecProc(OptimizedNvDecoder *pDec, const char *szInFilePath, int *pnFrame, s std::unique_ptr demuxer(new FFmpegDemuxer(szInFilePath)); int nVideoBytes = 0, nFrameReturned = 0, nFrame = 0; uint8_t *pVideo = NULL, *pFrame = NULL; - do { + // Demux video from file using FFmpegDemuxer demuxer->Demux(&pVideo, &nVideoBytes); + // Decode the video frame from demuxed packet nFrameReturned = pDec->Decode(pVideo, nVideoBytes); if (!nFrame && nFrameReturned) LOG(INFO) << pDec->GetVideoInfo(); - nFrame += nFrameReturned; } while (nVideoBytes); *pnFrame = nFrame; @@ -56,6 +56,9 @@ void DecProc(OptimizedNvDecoder *pDec, const char *szInFilePath, int *pnFrame, s } } +/** + * @brief Function to show help message and exit + */ void ShowHelpAndExit(const char *szBadOption = NULL) { std::ostringstream oss; bool bThrowError = false; @@ -64,19 +67,15 @@ void ShowHelpAndExit(const char *szBadOption = NULL) { oss << "Error parsing \"" << szBadOption << "\"" << std::endl; } oss << "Options:" << std::endl - << "-i Input file path" << std::endl + << "-i Input single video file path" << std::endl << "-o Output file path" << std::endl << "-gpu Ordinal of GPU to use" << std::endl << "-thread Number of decoding thread" << std::endl << "-total Number of total video to test" << std::endl - << "-single (No value) Use single context (this may result in suboptimal performance; default is multiple " - "contexts)" - << std::endl - << "-host (No value) Copy frame to host memory (this may result in suboptimal performance; default is " - "device memory)" - << std::endl - << "-multi_input Multiple Input file list path" << std::endl - << "-codec The codecc of video to test" << std::endl; + << "-single (No value) Use single context (default is multi-context, one context per thread)" << std::endl + << "-host (No value) Copy frame to host memory (default is device memory)" << std::endl + << "-multi_input The file path which lists the path of multiple video in each line" << std::endl + << "-codec The codec of video to test" << std::endl; if (bThrowError) { throw std::invalid_argument(oss.str()); } else { @@ -85,6 +84,9 @@ void ShowHelpAndExit(const char *szBadOption = NULL) { } } +/** + * @brief Function to parse commandline arguments + */ void ParseCommandLine(int argc, char *argv[], char *szInputFileName, int &iGpu, int &nThread, int &nTotalVideo, bool &bSingle, bool &bHost, std::string &inputFilesListPath, std::string &outputFile, cudaVideoCodec &codec) { @@ -161,6 +163,9 @@ void ParseCommandLine(int argc, char *argv[], char *szInputFileName, int &iGpu, } } +/** + * @brief Function to create cuda context and initialize decoder + */ OptimizedNvDecoder *InitOptimizedNvDecoder(int i, const CUdevice &cuDevice, CUcontext &cuContext, bool bSingle, bool bHost, cudaVideoCodec codec, CUVIDDECODECAPS decodecaps) { if (!bSingle) { @@ -171,10 +176,12 @@ OptimizedNvDecoder *InitOptimizedNvDecoder(int i, const CUdevice &cuDevice, CUco return sessionObject; } +/** + * @brief Function to convert time_point to human-readable format + */ std::string GetTime(const std::chrono::_V2::system_clock::time_point &now) { // Convert the time_point to a time_t auto now_time_t = std::chrono::system_clock::to_time_t(now); - // Convert the time_t to a human-readable format std::tm *now_tm = std::localtime(&now_time_t); char time_cstr[100]; @@ -183,9 +190,12 @@ std::string GetTime(const std::chrono::_V2::system_clock::time_point &now) { return time_str; } -double DecodeVideo(size_t i, const std::vector &vDec, const char *szInFilePath, int *pnFrame, +/** + * @brief Function to decode a video in a thread and measure the latency + */ +double DecodeVideo(size_t thread, const std::vector &vDec, const char *szInFilePath, int *pnFrame, std::exception_ptr &ex) { - OptimizedNvDecoder *pDec = vDec[i]; + OptimizedNvDecoder *pDec = vDec[thread]; auto start = std::chrono::high_resolution_clock::now(); DecProc(pDec, szInFilePath, pnFrame, ex); auto end = std::chrono::high_resolution_clock::now(); @@ -195,6 +205,9 @@ double DecodeVideo(size_t i, const std::vector &vDec, cons return elapsedTime / 1000.0f; } +/** + * @brief Function to read the video paths from a file + */ std::vector ReadMultipleVideoFiles(std::string filepath) { std::ifstream file(filepath); if (!file) { @@ -210,6 +223,9 @@ std::vector ReadMultipleVideoFiles(std::string filepath) { return tokens; } +/** + * @brief Function to get the decoder capability + */ void GetDefaultDecoderCaps(CUVIDDECODECAPS &decodecaps, cudaVideoCodec codec) { memset(&decodecaps, 0, sizeof(decodecaps)); decodecaps.eCodecType = codec; @@ -218,6 +234,10 @@ void GetDefaultDecoderCaps(CUVIDDECODECAPS &decodecaps, cudaVideoCodec codec) { NVDEC_API_CALL(cuvidGetDecoderCaps(&decodecaps)); } +/** + * @brief Function to initialize the cuda device, cuda context, query the decoder capability and create decoder for + * each thread + */ void InitializeContext(std::vector &vDec, int iGpu, int nThread, bool bSingle, bool bHost, cudaVideoCodec codec) { ck(cuInit(0)); @@ -250,6 +270,9 @@ void InitializeContext(std::vector &vDec, int iGpu, int nT } } +/** + * @brief Function to write the latency and FPS data of each video to a file + */ void WriteRawData(const std::vector &data, std::vector &frames, std::string filename) { // Open the output file stream std::ofstream outputFile(filename); @@ -265,6 +288,9 @@ void WriteRawData(const std::vector &data, std::vector &frames, std outputFile.close(); } +/** + * @brief Function to calculate the statistical latency metrics + */ std::tuple CalLatencyMetrics(const std::vector &originData) { std::vector data = originData; @@ -280,6 +306,11 @@ CalLatencyMetrics(const std::vector &originData) { return std::make_tuple(sum, mean, min, max, p50, p90, p95, p99); } +/** + * @brief Function to generate the total file list for the given total number of videos. + * If the number of videos is less than the total number of videos, the list will be repeated. + * If the number of videos is greater than the total number of videos, the list will be truncated. + */ std::vector GenerateTotalFileList(std::string inputFilesListPath, int nTotalVideo, const char *szInFilePath) { std::vector files; @@ -309,6 +340,10 @@ std::vector GenerateTotalFileList(std::string inputFilesListPath, i return files; } +/** + * @brief Function to run the decoding tasks in parallel with thread pool to decode all the videos and record the total + * latency and the total number of frames + */ float run(std::vector &vDec, int nThread, std::vector &files, std::vector &vnFrame, std::vector &vExceptionPtrs, int *nTotalFrames, std::vector &vnLatency) { diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h index b92ee4218..e57ee0486 100644 --- a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h +++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h @@ -3,6 +3,7 @@ #include "NvDecoder/NvDecoder.h" +// This class is derived from NvDecoder class and is used to optimize the cuvidGetDecoderCaps overhead class OptimizedNvDecoder : public NvDecoder { public: @@ -11,6 +12,8 @@ class OptimizedNvDecoder : public NvDecoder { * @brief This function is used to initialize the decoder session. * Application must call this function to initialize the decoder, before * starting to decode any frames. + * The only difference from the original function is to add a new member m_decodecaps. + * Other part is the same as the original function, refer to NvDecoder.cpp in NVIDIA Video Codec SDK. */ OptimizedNvDecoder(CUcontext &cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, CUVIDDECODECAPS decodecaps, bool bLowLatency = false, bool bDeviceFramePitched = false, const Rect *pCropRect = NULL, @@ -25,7 +28,9 @@ class OptimizedNvDecoder : public NvDecoder { return ((OptimizedNvDecoder *)pUserData)->HandleVideoSequence(pVideoFormat); } /** - * @brief Define the new handler when decoding of sequence starts + * @brief Define the new handler when decoding of sequence starts. + * The only change is to re-query decoder caps when the video codec or format change + * Other part is the same as the original function, refer to NvDecoder.cpp in NVIDIA Video Codec SDK. */ int HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat); diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h index da9f65f2c..5592b76e7 100644 --- a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h +++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h @@ -9,14 +9,21 @@ #include #include +// ThreadPool is a simple thread pool implementation that supports enqueueing the task with the index of thread to use +// and custom arguments like task(thread_index, *args). class ThreadPool { public: + /** + * @brief Construct a new ThreadPool object with the given number of threads. + */ ThreadPool(size_t numThreads) { for (size_t i = 0; i < numThreads; ++i) { threads.emplace_back(&ThreadPool::worker, this, i); } } - + /** + * @brief Destroy the ThreadPool object and join all threads. + */ ~ThreadPool() { { std::unique_lock lock(mutex); @@ -28,7 +35,10 @@ class ThreadPool { thread.join(); } } - + /** + * @brief TaskWrapper is a wrapper of the task with the index of thread to use and custom arguments like + * task(thread_index, *args). + */ template struct TaskWrapper { std::shared_ptr> task; @@ -39,7 +49,9 @@ class ThreadPool { void operator()(size_t threadIdx) { (*task)(threadIdx); } }; - + /** + * @brief Enqueue enqueues the task with custom arguments and return the results of task when finished. + */ template auto enqueue(F &&f, Args &&...args) -> std::future::type> { using ReturnType = typename std::result_of::type; @@ -57,6 +69,9 @@ class ThreadPool { } private: + /** + * @brief The worker function that dequeues the task and executes it for each thread index. + */ void worker(size_t threadIdx) { while (true) { std::function task;