Hello,
I’m trying to develop an hardware decoder to get a CUDA context directly after the video stream decoding and avoiding CPU <=> GPU transfert time.
I am using Codec_SDK 9.0.20 and I built my C++ class based on samples/NvDecoder/NvDecoder.cpp
.
main.cpp
#include <stdlib.h>
#include "FFMPEG/Demuxer.hpp"
#include "NVIDIA_CODEC_SDK/Decoder.hpp"
#include "CUDA/Grayscale.hpp"
int main(int argc, char **argv)
{
try {
uint8_t *compressedVideoFrame;
int compressedVideoFrameSize,
n = 0;
auto demuxer = std::make_shared<FFMPEG::Demuxer>(argv[1]);
NvDecoder::Decoder decoder(demuxer);
decoder.setCudaFunction(Grayscale::run);
do {
demuxer->demux(compressedVideoFrame, compressedVideoFrameSize);
decoder.decode(compressedVideoFrame, compressedVideoFrameSize, n++);
} while (compressedVideoFrameSize > 0);
} catch (const std::exception &e) {
std::cerr << e.what() << std::endl;
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
Decoder.cpp
#include "Decoder.hpp"
namespace NvDecoder {
Decoder::Decoder(const DemuxerSharedPtr &demuxer): demuxer(demuxer)
{
this->decoderCapacities = { };
this->codec = ffmpegToNvCodecId(this->demuxer->getCodecId());
NVDEC_API_CALL(cuInit(0));
NVDEC_API_CALL(cuCtxCreate(&this->cudaContext, 0, 0));
this->retrievesDecoderCapacities();
this->fillDecoderCreateInfo();
this->createVideoParser();
NVDEC_API_CALL(cuvidCreateDecoder(&this->decoder, &this->decoderCreateInfo));
NVDEC_API_CALL(cuStreamCreate(&this->cudaStream, CU_STREAM_DEFAULT));
}
/**
* @brief Destructor that free the resources
*/
Decoder::~Decoder()
{
cuCtxDestroy(this->cudaContext);
if (this->parser) {
cuvidDestroyVideoParser(this->parser);
}
if (this->decoder) {
cuvidDestroyDecoder(this->decoder);
}
}
void Decoder::setCudaFunction(const cudaProcess &cudaFunction)
{
Decoder::cudaFunction = cudaFunction;
}
/**
* @brief Get the compressed video frame data from the demuxer and decode it with NVIDIA hardware GPU
*
* @param compressedVideoFrame - The demuxer compressed video frame data
* @param compressedVideoFrameSize - The video frame size in bytes
* @param timestamp - Timestamp to mark the frame (@todo verify this parameter later)
*/
void Decoder::decode(const uint8_t *compressedVideoFrame, int compressedVideoFrameSize, int64_t timestamp)
{
CUVIDSOURCEDATAPACKET packet = {0};
packet.payload = compressedVideoFrame;
packet.payload_size = static_cast<unsigned long>(compressedVideoFrameSize);
packet.flags = CUVID_PKT_TIMESTAMP;
packet.timestamp = timestamp;
if (!compressedVideoFrame || compressedVideoFrameSize == 0) {
packet.flags |= CUVID_PKT_ENDOFSTREAM;
}
// @todo mutex ?
//Extracts parameter sets like SPS, PPS, bitstream etc. from pPacket
NVDEC_API_CALL(cuvidParseVideoData(this->parser, &packet));
// => Calls back HandlePictureDecode with CUVIDPICPARAMS data for kicking of HW decoding
// => Calls back HandleVideoSequence with CUVIDEOFORMAT data for initial sequence header or when the decoder encounters a video format change
// => Calls back HandlePictureDisplay with CUVIDPARSERDISPINFO data to display a video frame
}
/**
* @brief Retrieves the decoder capacities from FFMPEG codec info and throws exception if the data stream is not
* supported.
*/
void Decoder::retrievesDecoderCapacities()
{
std::ostringstream errorString;
this->decoderCapacities.eCodecType = this->codec;
this->decoderCapacities.eChromaFormat = this->ffmpegToNvSubsampling(this->demuxer->getSubsampling());
this->decoderCapacities.nBitDepthMinus8 = static_cast<uint>(this->demuxer->getBitDepth() - 8);
NVDEC_API_CALL(cuvidGetDecoderCaps(&this->decoderCapacities));
if (!this->decoderCapacities.bIsSupported) {
errorString << std::endl << "Codec [" << getCodecName(this->codec) << "] not supported on this GPU";
throw Exception(errorString.str(), __FUNCTION__, __FILE__, __LINE__);
}
if (this->demuxer->getVideoWidth() > this->decoderCapacities.nMaxWidth ||
this->demuxer->getVideoHeight() > this->decoderCapacities.nMaxHeight
) {
errorString << std::endl
<< "Resolution : " << this->demuxer->getVideoWidth() << "x" << this->demuxer->getVideoHeight() << std::endl
<< "Max supported (wxh) : " << this->decoderCapacities.nMaxWidth << "x" << this->decoderCapacities.nMaxHeight << std::endl
<< "Resolution not supported on this GPU";
throw Exception(errorString.str(), __FUNCTION__, __FILE__, __LINE__);
}
if ((this->demuxer->getVideoWidth() >> 4) * (this->demuxer->getVideoHeight() >> 4) > this->decoderCapacities.nMaxMBCount) {
errorString << std::endl
<< "Macroblock count : " << (this->demuxer->getVideoWidth() >> 4) * (this->demuxer->getVideoHeight() >> 4) << std::endl
<< "Max Supported macroblock count : " << this->decoderCapacities.nMaxMBCount << std::endl
<< "Macroblock count not supported on this GPU";
throw Exception(errorString.str(), __FUNCTION__, __FILE__, __LINE__);
}
}
/**
* @brief Create the decoder info object with some tuned params
*/
void Decoder::fillDecoderCreateInfo()
{
this->decoderCreateInfo = { };
this->decoderCreateInfo.CodecType = this->codec;
this->decoderCreateInfo.ChromaFormat = this->decoderCapacities.eChromaFormat;
this->decoderCreateInfo.bitDepthMinus8 = this->decoderCapacities.nBitDepthMinus8;
this->decoderCreateInfo.ulWidth = static_cast<unsigned long>(this->demuxer->getVideoWidth());
this->decoderCreateInfo.ulHeight = static_cast<unsigned long>(this->demuxer->getVideoHeight());
this->decoderCreateInfo.ulTargetWidth = static_cast<unsigned long>(this->demuxer->getVideoWidth());
this->decoderCreateInfo.ulTargetHeight = static_cast<unsigned long>(this->demuxer->getVideoHeight());
this->decoderCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
this->decoderCreateInfo.ulIntraDecodeOnly = 1; // Control for memory optimization for I/IDR frame
// With PreferCUVID, JPEG is still decoded by CUDA while video is decoded by NVDEC hardware
this->decoderCreateInfo.ulCreationFlags = cudaVideoCreate_Default;
// This is the number of surfaces that the client will use for storing the decoded frames. Using a higher
// number ensures better pipelining but increases GPU memory consumption. The driver internally allocates the
// corresponding number of surfaces. The NVDEC engine outputs decoded data to one of these surfaces.
this->decoderCreateInfo.ulNumDecodeSurfaces = 5;
// This is the maximum number of surfaces that the client will simultaneously map for further processing.
// The driver internally allocates the corresponding number of surfaces. Please refer to section 4.4 to
// understand the definition of map.
this->decoderCreateInfo.ulNumOutputSurfaces = 4;
if (this->decoderCapacities.eChromaFormat == cudaVideoChromaFormat::cudaVideoChromaFormat_420) {
this->decoderCreateInfo.OutputFormat = this->decoderCapacities.nBitDepthMinus8 == 0
? cudaVideoSurfaceFormat_NV12 : cudaVideoSurfaceFormat_P016;
} else if (this->decoderCapacities.eChromaFormat == cudaVideoChromaFormat::cudaVideoChromaFormat_444) {
this->decoderCreateInfo.OutputFormat = this->decoderCapacities.nBitDepthMinus8 == 0
? cudaVideoSurfaceFormat_YUV444 : cudaVideoSurfaceFormat_YUV444_16Bit;
} else {
std::cout << "WARNING: cudaVideoChromaFormat not handle in fillDecoderCreateInfo" << std::endl;
}
}
/**
* @brief Create the video parser instance
*/
void Decoder::createVideoParser()
{
CUVIDPARSERPARAMS videoParserParameters = { };
videoParserParameters.CodecType = this->codec;
videoParserParameters.ulMaxNumDecodeSurfaces = 1;
videoParserParameters.ulMaxDisplayDelay = 1; // 0 or 1, 2 , 3, 4
videoParserParameters.pUserData = this;
videoParserParameters.pfnSequenceCallback = HandleVideoSequenceProc;
videoParserParameters.pfnDecodePicture = HandlePictureDecodeProc;
videoParserParameters.pfnDisplayPicture = HandlePictureDisplayProc;
NVDEC_API_CALL(cuvidCreateVideoParser(&this->parser, &videoParserParameters));
}
/**
* @brief Convert a FFMPEG codec ID into a NVIDIA codec ID
*
* @param ffmpegCodecId - The FFMPEG codec ID
*
* @return The NVIDIA codec ID
*/
cudaVideoCodec Decoder::ffmpegToNvCodecId(AVCodecID ffmpegCodecId) const
{
switch (ffmpegCodecId) {
case AV_CODEC_ID_MPEG1VIDEO : return cudaVideoCodec_MPEG1;
case AV_CODEC_ID_MPEG2VIDEO : return cudaVideoCodec_MPEG2;
case AV_CODEC_ID_MPEG4 : return cudaVideoCodec_MPEG4;
case AV_CODEC_ID_VC1 : return cudaVideoCodec_VC1;
case AV_CODEC_ID_H264 : return cudaVideoCodec_H264;
case AV_CODEC_ID_HEVC : return cudaVideoCodec_HEVC;
case AV_CODEC_ID_VP8 : return cudaVideoCodec_VP8;
case AV_CODEC_ID_VP9 : return cudaVideoCodec_VP9;
case AV_CODEC_ID_MJPEG : return cudaVideoCodec_JPEG;
default : return cudaVideoCodec_NumCodecs;
}
}
/**
* @brief Convert a FFMPEG subsampling type to NVIDIA CODEC subsampling type
*
* @param subsamplingType - The FFMPEG subsampling type
*
* @return The NVIDIA CODEC subsampling type
*/
cudaVideoChromaFormat Decoder::ffmpegToNvSubsampling(FFMPEG::Demuxer::SubsamplingType subsamplingType) const
{
cudaVideoChromaFormat cudaSubsamplingType;
switch (subsamplingType) {
case FFMPEG::Demuxer::SubsamplingType::monochrome:
cudaSubsamplingType = cudaVideoChromaFormat::cudaVideoChromaFormat_Monochrome;
break;
case FFMPEG::Demuxer::SubsamplingType::format420:
cudaSubsamplingType = cudaVideoChromaFormat::cudaVideoChromaFormat_420;
break;
case FFMPEG::Demuxer::SubsamplingType::format422:
cudaSubsamplingType = cudaVideoChromaFormat::cudaVideoChromaFormat_422;
break;
case FFMPEG::Demuxer::SubsamplingType::format444:
cudaSubsamplingType = cudaVideoChromaFormat::cudaVideoChromaFormat_444;
break;
default:
cudaSubsamplingType = cudaVideoChromaFormat::cudaVideoChromaFormat_420;
}
return cudaSubsamplingType;
}
/**
* @brief Get the codec name from the given codec num
*
* @param eCodec - NVIDIA codec num
*
* @return The codec name
*/
std::string Decoder::getCodecName(cudaVideoCodec eCodec) const
{
static struct {
cudaVideoCodec eCodec;
std::string name;
} aCodecName [] = {
{ cudaVideoCodec_MPEG1, "MPEG-1" },
{ cudaVideoCodec_MPEG2, "MPEG-2" },
{ cudaVideoCodec_MPEG4, "MPEG-4 (ASP)" },
{ cudaVideoCodec_VC1, "VC-1/WMV" },
{ cudaVideoCodec_H264, "AVC/H.264" },
{ cudaVideoCodec_JPEG, "M-JPEG" },
{ cudaVideoCodec_H264_SVC, "H.264/SVC" },
{ cudaVideoCodec_H264_MVC, "H.264/MVC" },
{ cudaVideoCodec_HEVC, "H.265/HEVC" },
{ cudaVideoCodec_VP8, "VP8" },
{ cudaVideoCodec_VP9, "VP9" },
{ cudaVideoCodec_NumCodecs, "Invalid" },
{ cudaVideoCodec_YUV420, "YUV 4:2:0" },
{ cudaVideoCodec_YV12, "YV12 4:2:0" },
{ cudaVideoCodec_NV12, "NV12 4:2:0" },
{ cudaVideoCodec_YUYV, "YUYV 4:2:2" },
{ cudaVideoCodec_UYVY, "UYVY 4:2:2" },
};
if (eCodec >= 0 && eCodec <= cudaVideoCodec_NumCodecs) {
return aCodecName[eCodec].name;
}
for (int i = cudaVideoCodec_NumCodecs + 1; i < sizeof(aCodecName) / sizeof(aCodecName[0]); i++) {
if (eCodec == aCodecName[i].eCodec) {
return aCodecName[eCodec].name;
}
}
return "Unknown";
}
// Callbacks
/**
* @brief Callback function to be registered for getting a callback when decoding of sequence starts
*
* @param pUserData - abstract pointer to link to the actual callback
* @param pVideoFormat - CUVIDEOFORMAT structure automatically fills by cuvidParseVideoData call
*
* @return 1 on success, 0 either
*/
int CUDAAPI Decoder::HandleVideoSequenceProc(void *pUserData, CUVIDEOFORMAT *pVideoFormat)
{
return ((NvDecoder::Decoder *)pUserData)->HandleVideoSequence(pVideoFormat);
}
/**
* @brief Callback function to be registered for getting a callback when a decoded frame is ready to be decoded
*
* @param pUserData - abstract pointer to link to the actual callback
* @param pictureParams - CUVIDPICPARAMS structure automatically fills by cuvidParseVideoData call
*
* @return 1 on success, 0 either
*/
int CUDAAPI Decoder::HandlePictureDecodeProc(void *pUserData, CUVIDPICPARAMS *pictureParams)
{
return ((NvDecoder::Decoder *)pUserData)->HandlePictureDecode(pictureParams);
}
/**
* @brief Callback function to be registered for getting a callback when a decoded frame is available for display
*
* @param pUserData - abstract pointer to link to the actual callback
* @param pDispInfo - CUVIDPARSERDISPINFO structure automatically fills by cuvidParseVideoData call
*
* @return 1 on success, 0 either
*/
int CUDAAPI Decoder::HandlePictureDisplayProc(void *pUserData, CUVIDPARSERDISPINFO *pDispInfo)
{
return ((NvDecoder::Decoder *)pUserData)->HandlePictureDisplay(pDispInfo);
}
/**
* @brief This function gets called when a sequence is ready to be decoded. The function also gets called when
* there is format change
*
* @param pVideoFormat - CUVIDEOFORMAT structure automatically fills by cuvidParseVideoData call
*
* @return 1 on success, 0 either
*/
int Decoder::HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat)
{
std::cout << "HandleVideoSequence called" << std::endl;
return 1;
}
/**
* @brief This function gets called when a picture is ready to be decoded. cuvidDecodePicture is called from this
* function to decode the picture
*
* @param pictureParams - CUVIDPICPARAMS structure automatically fills by cuvidParseVideoData call
*
* @return 1 on success, 0 either
*/
int Decoder::HandlePictureDecode(CUVIDPICPARAMS *pictureParams)
{
std::cout << "HandlePictureDecode called" << std::endl;
if (!this->decoder) {
throw Exception("Decoder not initialized.", __FUNCTION__, __FILE__, __LINE__);
}
std::cout << "nBitstreamDataLen =" << pictureParams->nBitstreamDataLen << std::endl;
std::cout << "CurrPicIdx =" << pictureParams->CurrPicIdx << std::endl;
std::cout << "PicWidthInMbs =" << pictureParams->PicWidthInMbs << std::endl;
std::cout << "FrameHeightInMbs =" << pictureParams->FrameHeightInMbs << std::endl;
NVDEC_API_CALL(cuvidDecodePicture(this->decoder, pictureParams));
return 1;
}
/**
* @brief This function gets called after a picture is decoded and available for display. Frames are fetched and
* stored in internal buffer
*
* @param pDispInfo - CUVIDPARSERDISPINFO structure automatically fills by cuvidParseVideoData call
*
* @return 1 on success, 0 either
*/
int Decoder::HandlePictureDisplay(CUVIDPARSERDISPINFO *pDispInfo)
{
std::cout << "HandlePictureDisplay called" << std::endl;
CUdeviceptr devicePtr = 0;
unsigned int pitch = 0;
CUVIDPROCPARAMS processingParams = { };
CUVIDGETDECODESTATUS DecodeStatus = { };
memset(&DecodeStatus, 0, sizeof(DecodeStatus));
processingParams.progressive_frame = pDispInfo->progressive_frame;
processingParams.second_field = pDispInfo->repeat_first_field + 1;
processingParams.top_field_first = pDispInfo->top_field_first;
processingParams.unpaired_field = pDispInfo->repeat_first_field < 0;
processingParams.output_stream = this->cudaStream;
NVDEC_API_CALL(cuvidMapVideoFrame(this->decoder, pDispInfo->picture_index, &devicePtr, &pitch, &processingParams));
NVDEC_API_CALL(cuvidGetDecodeStatus(this->decoder, pDispInfo->picture_index, &DecodeStatus));
if (DecodeStatus.decodeStatus == cuvidDecodeStatus_Error || DecodeStatus.decodeStatus == cuvidDecodeStatus_Error_Concealed) {
throw Exception("Decode error occurred", __FUNCTION__, __FILE__, __LINE__);
}
// @todo is devicePtr planar or not ?
this->cudaFunction(
devicePtr,
pitch,
static_cast<uint>(this->demuxer->getVideoWidth()),
static_cast<uint>(this->demuxer->getVideoHeight())
);
NVDEC_API_CALL(cuvidUnmapVideoFrame(this->decoder, devicePtr));
return 1;
}
} // End of NvDecoder namespace
And after 2 iterations, I got a CUDA_ERROR_INVALID_VALUE on cuvidDecodePicture call.
I put some std::cout for debugin purpose, here is the output I got :
HandleVideoSequence called
HandlePictureDecode called
nBitstreamDataLen =29246
CurrPicIdx =0
PicWidthInMbs =40
FrameHeightInMbs =23
HandlePictureDisplay called
pitch =1024
width =640
height =360
HandlePictureDecode called
nBitstreamDataLen =1721
CurrPicIdx =0
PicWidthInMbs =40
FrameHeightInMbs =23
HandlePictureDecode [CUDA_ERROR_INVALID_VALUE]: "invalid argument" at /home/rom1/Projects/cuda-hardware-video-reader/NVIDIA_CODEC_SDK/Decoder.cpp:346
Do you have any idea what’s going on ?
Thanks