NVDEC hardware CUDA_ERROR_INVALID_VALUE cuvidDecodePicture call

Hello,

I’m trying to develop an hardware decoder to get a CUDA context directly after the video stream decoding and avoiding CPU <=> GPU transfert time.
I am using Codec_SDK 9.0.20 and I built my C++ class based on samples/NvDecoder/NvDecoder.cpp.

main.cpp

#include <stdlib.h>
#include "FFMPEG/Demuxer.hpp"
#include "NVIDIA_CODEC_SDK/Decoder.hpp"
#include "CUDA/Grayscale.hpp"

int main(int argc, char **argv)
{
    try {
        uint8_t *compressedVideoFrame;
        int     compressedVideoFrameSize,
                n = 0;
        auto    demuxer = std::make_shared<FFMPEG::Demuxer>(argv[1]);

        NvDecoder::Decoder decoder(demuxer);
        
        decoder.setCudaFunction(Grayscale::run);

        do {
            demuxer->demux(compressedVideoFrame, compressedVideoFrameSize);
            decoder.decode(compressedVideoFrame, compressedVideoFrameSize, n++);
        } while (compressedVideoFrameSize > 0);
    } catch (const std::exception &e) {
        std::cerr << e.what() << std::endl;
        
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}

Decoder.cpp

#include "Decoder.hpp"

namespace NvDecoder {
    Decoder::Decoder(const DemuxerSharedPtr &demuxer): demuxer(demuxer)
    {
        this->decoderCapacities = { };
        this->codec             = ffmpegToNvCodecId(this->demuxer->getCodecId());

        NVDEC_API_CALL(cuInit(0));
        NVDEC_API_CALL(cuCtxCreate(&this->cudaContext, 0, 0));

        this->retrievesDecoderCapacities();
        this->fillDecoderCreateInfo();
        this->createVideoParser();

        NVDEC_API_CALL(cuvidCreateDecoder(&this->decoder, &this->decoderCreateInfo));
        NVDEC_API_CALL(cuStreamCreate(&this->cudaStream, CU_STREAM_DEFAULT));
    }

    /**
     * @brief Destructor that free the resources
     */
    Decoder::~Decoder()
    {
        cuCtxDestroy(this->cudaContext);

        if (this->parser) {
            cuvidDestroyVideoParser(this->parser);
        }

        if (this->decoder) {
            cuvidDestroyDecoder(this->decoder);
        }
    }

    void Decoder::setCudaFunction(const cudaProcess &cudaFunction)
    {
        Decoder::cudaFunction = cudaFunction;
    }

    /**
     * @brief Get the compressed video frame data from the demuxer and decode it with NVIDIA hardware GPU
     *
     * @param compressedVideoFrame - The demuxer compressed video frame data
     * @param compressedVideoFrameSize - The video frame size in bytes
     * @param timestamp - Timestamp to mark the frame (@todo verify this parameter later)
     */
    void Decoder::decode(const uint8_t *compressedVideoFrame, int compressedVideoFrameSize, int64_t timestamp)
    {
        CUVIDSOURCEDATAPACKET packet = {0};

        packet.payload      = compressedVideoFrame;
        packet.payload_size = static_cast<unsigned long>(compressedVideoFrameSize);
        packet.flags        = CUVID_PKT_TIMESTAMP;
        packet.timestamp    = timestamp;

        if (!compressedVideoFrame || compressedVideoFrameSize == 0) {
            packet.flags |= CUVID_PKT_ENDOFSTREAM;
        }

        // @todo mutex ?
        //Extracts parameter sets like SPS, PPS, bitstream etc. from pPacket
        NVDEC_API_CALL(cuvidParseVideoData(this->parser, &packet));
        // => Calls back HandlePictureDecode with CUVIDPICPARAMS data for kicking of HW decoding
        // => Calls back HandleVideoSequence with CUVIDEOFORMAT data for initial sequence header or when the decoder encounters a video format change
        // => Calls back HandlePictureDisplay with CUVIDPARSERDISPINFO data to display a video frame
    }

    /**
     * @brief Retrieves the decoder capacities from FFMPEG codec info and throws exception if the data stream is not
     *        supported.
     */
    void Decoder::retrievesDecoderCapacities()
    {
        std::ostringstream errorString;

        this->decoderCapacities.eCodecType      = this->codec;
        this->decoderCapacities.eChromaFormat   = this->ffmpegToNvSubsampling(this->demuxer->getSubsampling());
        this->decoderCapacities.nBitDepthMinus8 = static_cast<uint>(this->demuxer->getBitDepth() - 8);

        NVDEC_API_CALL(cuvidGetDecoderCaps(&this->decoderCapacities));

        if (!this->decoderCapacities.bIsSupported) {
            errorString << std::endl << "Codec [" << getCodecName(this->codec) << "] not supported on this GPU";

            throw Exception(errorString.str(), __FUNCTION__, __FILE__, __LINE__);
        }

        if (this->demuxer->getVideoWidth() > this->decoderCapacities.nMaxWidth ||
            this->demuxer->getVideoHeight() > this->decoderCapacities.nMaxHeight
        ) {
            errorString << std::endl
                        << "Resolution          : " << this->demuxer->getVideoWidth() << "x" << this->demuxer->getVideoHeight() << std::endl
                        << "Max supported (wxh) : " << this->decoderCapacities.nMaxWidth << "x" << this->decoderCapacities.nMaxHeight << std::endl
                        << "Resolution not supported on this GPU";

            throw Exception(errorString.str(), __FUNCTION__, __FILE__, __LINE__);
        }

        if ((this->demuxer->getVideoWidth() >> 4) * (this->demuxer->getVideoHeight() >> 4) > this->decoderCapacities.nMaxMBCount) {
            errorString << std::endl
                        << "Macroblock count               : " << (this->demuxer->getVideoWidth() >> 4) * (this->demuxer->getVideoHeight() >> 4) << std::endl
                        << "Max Supported macroblock count : " <<  this->decoderCapacities.nMaxMBCount << std::endl
                        << "Macroblock count not supported on this GPU";

            throw Exception(errorString.str(), __FUNCTION__, __FILE__, __LINE__);
        }
    }

    /**
     * @brief Create the decoder info object with some tuned params
     */
    void Decoder::fillDecoderCreateInfo()
    {
        this->decoderCreateInfo = { };

        this->decoderCreateInfo.CodecType         = this->codec;
        this->decoderCreateInfo.ChromaFormat      = this->decoderCapacities.eChromaFormat;
        this->decoderCreateInfo.bitDepthMinus8    = this->decoderCapacities.nBitDepthMinus8;
        this->decoderCreateInfo.ulWidth           = static_cast<unsigned long>(this->demuxer->getVideoWidth());
        this->decoderCreateInfo.ulHeight          = static_cast<unsigned long>(this->demuxer->getVideoHeight());
        this->decoderCreateInfo.ulTargetWidth     = static_cast<unsigned long>(this->demuxer->getVideoWidth());
        this->decoderCreateInfo.ulTargetHeight    = static_cast<unsigned long>(this->demuxer->getVideoHeight());
        this->decoderCreateInfo.DeinterlaceMode   = cudaVideoDeinterlaceMode_Weave;
        this->decoderCreateInfo.ulIntraDecodeOnly = 1; // Control for memory optimization for I/IDR frame
        // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded by NVDEC hardware
        this->decoderCreateInfo.ulCreationFlags = cudaVideoCreate_Default;
        // This is the number of surfaces that the client will use for storing the decoded frames. Using a higher
        // number ensures better pipelining but increases GPU memory consumption. The driver internally allocates the
        // corresponding number of surfaces. The NVDEC engine outputs decoded data to one of these surfaces.
        this->decoderCreateInfo.ulNumDecodeSurfaces = 5;
        // This is the maximum number of surfaces that the client will simultaneously map for further processing.
        // The driver internally allocates the corresponding number of surfaces. Please refer to section 4.4 to
        // understand the definition of map.
        this->decoderCreateInfo.ulNumOutputSurfaces = 4;

        if (this->decoderCapacities.eChromaFormat == cudaVideoChromaFormat::cudaVideoChromaFormat_420) {
            this->decoderCreateInfo.OutputFormat = this->decoderCapacities.nBitDepthMinus8 == 0
                ? cudaVideoSurfaceFormat_NV12 : cudaVideoSurfaceFormat_P016;
        } else if (this->decoderCapacities.eChromaFormat == cudaVideoChromaFormat::cudaVideoChromaFormat_444) {
            this->decoderCreateInfo.OutputFormat = this->decoderCapacities.nBitDepthMinus8 == 0
                ? cudaVideoSurfaceFormat_YUV444 : cudaVideoSurfaceFormat_YUV444_16Bit;
        } else {
            std::cout << "WARNING: cudaVideoChromaFormat not handle in fillDecoderCreateInfo" << std::endl;
        }
    }

    /**
     * @brief Create the video parser instance
     */
    void Decoder::createVideoParser()
    {
        CUVIDPARSERPARAMS videoParserParameters = { };

        videoParserParameters.CodecType              = this->codec;
        videoParserParameters.ulMaxNumDecodeSurfaces = 1;
        videoParserParameters.ulMaxDisplayDelay      = 1; // 0 or 1, 2 , 3, 4
        videoParserParameters.pUserData              = this;
        videoParserParameters.pfnSequenceCallback    = HandleVideoSequenceProc;
        videoParserParameters.pfnDecodePicture       = HandlePictureDecodeProc;
        videoParserParameters.pfnDisplayPicture      = HandlePictureDisplayProc;

        NVDEC_API_CALL(cuvidCreateVideoParser(&this->parser, &videoParserParameters));
    }

    /**
     * @brief Convert a FFMPEG codec ID into a NVIDIA codec ID
     *
     * @param ffmpegCodecId - The FFMPEG codec ID
     *
     * @return The NVIDIA codec ID
     */
    cudaVideoCodec Decoder::ffmpegToNvCodecId(AVCodecID ffmpegCodecId) const
    {
        switch (ffmpegCodecId) {
            case AV_CODEC_ID_MPEG1VIDEO : return cudaVideoCodec_MPEG1;
            case AV_CODEC_ID_MPEG2VIDEO : return cudaVideoCodec_MPEG2;
            case AV_CODEC_ID_MPEG4      : return cudaVideoCodec_MPEG4;
            case AV_CODEC_ID_VC1        : return cudaVideoCodec_VC1;
            case AV_CODEC_ID_H264       : return cudaVideoCodec_H264;
            case AV_CODEC_ID_HEVC       : return cudaVideoCodec_HEVC;
            case AV_CODEC_ID_VP8        : return cudaVideoCodec_VP8;
            case AV_CODEC_ID_VP9        : return cudaVideoCodec_VP9;
            case AV_CODEC_ID_MJPEG      : return cudaVideoCodec_JPEG;
            default                     : return cudaVideoCodec_NumCodecs;
        }
    }

    /**
     * @brief Convert a FFMPEG subsampling type to NVIDIA CODEC subsampling type
     *
     * @param subsamplingType - The FFMPEG subsampling type
     *
     * @return The NVIDIA CODEC subsampling type
     */
    cudaVideoChromaFormat Decoder::ffmpegToNvSubsampling(FFMPEG::Demuxer::SubsamplingType subsamplingType) const
    {
        cudaVideoChromaFormat cudaSubsamplingType;

        switch (subsamplingType) {
            case FFMPEG::Demuxer::SubsamplingType::monochrome:
                cudaSubsamplingType = cudaVideoChromaFormat::cudaVideoChromaFormat_Monochrome;
                break;

            case FFMPEG::Demuxer::SubsamplingType::format420:
                cudaSubsamplingType = cudaVideoChromaFormat::cudaVideoChromaFormat_420;
                break;

            case FFMPEG::Demuxer::SubsamplingType::format422:
                cudaSubsamplingType = cudaVideoChromaFormat::cudaVideoChromaFormat_422;
                break;

            case FFMPEG::Demuxer::SubsamplingType::format444:
                cudaSubsamplingType = cudaVideoChromaFormat::cudaVideoChromaFormat_444;
                break;

            default:
                cudaSubsamplingType = cudaVideoChromaFormat::cudaVideoChromaFormat_420;
        }

        return cudaSubsamplingType;
    }

    /**
     * @brief Get the codec name from the given codec num
     *
     * @param eCodec - NVIDIA codec num
     *
     * @return The codec name
     */
    std::string Decoder::getCodecName(cudaVideoCodec eCodec) const
    {
        static struct {
            cudaVideoCodec eCodec;
            std::string name;
        } aCodecName [] = {
                { cudaVideoCodec_MPEG1,     "MPEG-1"       },
                { cudaVideoCodec_MPEG2,     "MPEG-2"       },
                { cudaVideoCodec_MPEG4,     "MPEG-4 (ASP)" },
                { cudaVideoCodec_VC1,       "VC-1/WMV"     },
                { cudaVideoCodec_H264,      "AVC/H.264"    },
                { cudaVideoCodec_JPEG,      "M-JPEG"       },
                { cudaVideoCodec_H264_SVC,  "H.264/SVC"    },
                { cudaVideoCodec_H264_MVC,  "H.264/MVC"    },
                { cudaVideoCodec_HEVC,      "H.265/HEVC"   },
                { cudaVideoCodec_VP8,       "VP8"          },
                { cudaVideoCodec_VP9,       "VP9"          },
                { cudaVideoCodec_NumCodecs, "Invalid"      },
                { cudaVideoCodec_YUV420,    "YUV  4:2:0"   },
                { cudaVideoCodec_YV12,      "YV12 4:2:0"   },
                { cudaVideoCodec_NV12,      "NV12 4:2:0"   },
                { cudaVideoCodec_YUYV,      "YUYV 4:2:2"   },
                { cudaVideoCodec_UYVY,      "UYVY 4:2:2"   },
        };

        if (eCodec >= 0 && eCodec <= cudaVideoCodec_NumCodecs) {
            return aCodecName[eCodec].name;
        }

        for (int i = cudaVideoCodec_NumCodecs + 1; i < sizeof(aCodecName) / sizeof(aCodecName[0]); i++) {
            if (eCodec == aCodecName[i].eCodec) {
                return aCodecName[eCodec].name;
            }
        }

        return "Unknown";
    }

    // Callbacks

    /**
     * @brief Callback function to be registered for getting a callback when decoding of sequence starts
     *
     * @param pUserData - abstract pointer to link to the actual callback
     * @param pVideoFormat - CUVIDEOFORMAT structure automatically fills by cuvidParseVideoData call
     *
     * @return 1 on success, 0 either
     */
    int CUDAAPI Decoder::HandleVideoSequenceProc(void *pUserData, CUVIDEOFORMAT *pVideoFormat)
    {
        return ((NvDecoder::Decoder *)pUserData)->HandleVideoSequence(pVideoFormat);
    }

    /**
     * @brief Callback function to be registered for getting a callback when a decoded frame is ready to be decoded
     *
     * @param pUserData - abstract pointer to link to the actual callback
     * @param pictureParams - CUVIDPICPARAMS structure automatically fills by cuvidParseVideoData call
     *
     * @return 1 on success, 0 either
     */
    int CUDAAPI Decoder::HandlePictureDecodeProc(void *pUserData, CUVIDPICPARAMS *pictureParams)
    {
        return ((NvDecoder::Decoder *)pUserData)->HandlePictureDecode(pictureParams);
    }

    /**
     * @brief Callback function to be registered for getting a callback when a decoded frame is available for display
     *
     * @param pUserData - abstract pointer to link to the actual callback
     * @param pDispInfo - CUVIDPARSERDISPINFO structure automatically fills by cuvidParseVideoData call
     *
     * @return 1 on success, 0 either
     */
    int CUDAAPI Decoder::HandlePictureDisplayProc(void *pUserData, CUVIDPARSERDISPINFO *pDispInfo)
    {
        return ((NvDecoder::Decoder *)pUserData)->HandlePictureDisplay(pDispInfo);
    }

    /**
     * @brief This function gets called when a sequence is ready to be decoded. The function also gets called when
     *        there is format change
     *
     * @param pVideoFormat - CUVIDEOFORMAT structure automatically fills by cuvidParseVideoData call
     *
     * @return 1 on success, 0 either
     */
    int Decoder::HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat)
    {
        std::cout << "HandleVideoSequence called" << std::endl;

        return 1;
    }

    /**
     * @brief This function gets called when a picture is ready to be decoded. cuvidDecodePicture is called from this
     *        function to decode the picture
     *
     * @param pictureParams - CUVIDPICPARAMS structure automatically fills by cuvidParseVideoData call
     *
     * @return 1 on success, 0 either
     */
    int Decoder::HandlePictureDecode(CUVIDPICPARAMS *pictureParams)
    {
        std::cout << "HandlePictureDecode called" << std::endl;

        if (!this->decoder) {
            throw Exception("Decoder not initialized.", __FUNCTION__, __FILE__, __LINE__);
        }

        std::cout << "nBitstreamDataLen =" << pictureParams->nBitstreamDataLen << std::endl;
        std::cout << "CurrPicIdx =" << pictureParams->CurrPicIdx << std::endl;
        std::cout << "PicWidthInMbs =" << pictureParams->PicWidthInMbs << std::endl;
        std::cout << "FrameHeightInMbs =" << pictureParams->FrameHeightInMbs << std::endl;

        NVDEC_API_CALL(cuvidDecodePicture(this->decoder, pictureParams));

        return 1;
    }

    /**
     * @brief This function gets called after a picture is decoded and available for display. Frames are fetched and
     *        stored in internal buffer
     *
     * @param pDispInfo - CUVIDPARSERDISPINFO structure automatically fills by cuvidParseVideoData call
     *
     * @return 1 on success, 0 either
     */
    int Decoder::HandlePictureDisplay(CUVIDPARSERDISPINFO *pDispInfo)
    {
        std::cout << "HandlePictureDisplay called" << std::endl;

        CUdeviceptr          devicePtr        = 0;
        unsigned int         pitch            = 0;
        CUVIDPROCPARAMS      processingParams = { };
        CUVIDGETDECODESTATUS DecodeStatus     = { };

        memset(&DecodeStatus, 0, sizeof(DecodeStatus));

        processingParams.progressive_frame = pDispInfo->progressive_frame;
        processingParams.second_field      = pDispInfo->repeat_first_field + 1;
        processingParams.top_field_first   = pDispInfo->top_field_first;
        processingParams.unpaired_field    = pDispInfo->repeat_first_field < 0;
        processingParams.output_stream     = this->cudaStream;

        NVDEC_API_CALL(cuvidMapVideoFrame(this->decoder, pDispInfo->picture_index, &devicePtr, &pitch, &processingParams));
        NVDEC_API_CALL(cuvidGetDecodeStatus(this->decoder, pDispInfo->picture_index, &DecodeStatus));

        if (DecodeStatus.decodeStatus == cuvidDecodeStatus_Error || DecodeStatus.decodeStatus == cuvidDecodeStatus_Error_Concealed) {
            throw Exception("Decode error occurred", __FUNCTION__, __FILE__, __LINE__);
        }

        // @todo is devicePtr planar or not ?

        this->cudaFunction(
            devicePtr,
            pitch,
            static_cast<uint>(this->demuxer->getVideoWidth()),
            static_cast<uint>(this->demuxer->getVideoHeight())
        );

        NVDEC_API_CALL(cuvidUnmapVideoFrame(this->decoder, devicePtr));

        return 1;
    }
} // End of NvDecoder namespace

And after 2 iterations, I got a CUDA_ERROR_INVALID_VALUE on cuvidDecodePicture call.

I put some std::cout for debugin purpose, here is the output I got :

HandleVideoSequence called
HandlePictureDecode called
nBitstreamDataLen =29246
CurrPicIdx =0
PicWidthInMbs =40
FrameHeightInMbs =23
HandlePictureDisplay called
pitch =1024
width =640
height =360
HandlePictureDecode called
nBitstreamDataLen =1721
CurrPicIdx =0
PicWidthInMbs =40
FrameHeightInMbs =23
HandlePictureDecode [CUDA_ERROR_INVALID_VALUE]: "invalid argument" at /home/rom1/Projects/cuda-hardware-video-reader/NVIDIA_CODEC_SDK/Decoder.cpp:346

Do you have any idea what’s going on ?

Thanks