Problem using NVCUVID library for video decoding

Hi, I'm trying to develop a low-latency decoder that uses the nvcuvid API for transcoding purposes. I am trying to understand the use of the API by studying the VideoDecoder OpenGL sample in the CUDA SDK. I develop on Windows XP using the Microsoft compiler (cl), as a Win32 application (not a CLR application), and I have an NVIDIA 8600GT. I have installed CUDA 2.2.

In the code below I call a constructor where I create a CUvideodecoder, but the function returns a CUresult code equal to 100 (CUDA_ERROR_NO_DEVICE). However, when I call cutilDrvGetMaxGflopsDeviceId() beforehand, the device seems to be correctly recognized.

Code of the main.

// Source dimensions and format of the elementary stream to decode.
#define SRC_WIDTH 352

#define SRC_HEIGHT 288

#define CODEC_TYPE cudaVideoCodec_H264

#define CHROMA_FORMAT cudaVideoChromaFormat_420

using namespace std;

// Decode-pipeline objects, allocated in main().
VideoDecoder *videoDecoder;

VideoParser *videoParser;

FrameQueue *frameQueue;

// Format description handed to the VideoDecoder constructor.
CUVIDEOFORMAT   videoFormatInfo;

cudaVideoCreateFlags	videoCreateFlags;

// CUDA Driver-API handles.
// NOTE(review): ctxLock is initialized to NULL and no cuvidCtxLockCreate()
// call is visible anywhere in this listing — confirm the lock is actually
// created before it is used.
CUcontext			   ctx=0;

CUvideoctxlock		  ctxLock = NULL;

CUdevice		  device  = 0;



	// NOTE(review): the enclosing function header was lost in the paste; from
	// the call site below this is the body of
	// setCUVIDEOFORMATINFO(CUVIDEOFORMAT *info) — confirm against the original.
	// Fills the format struct from the compile-time constants above.
	info->codec = CODEC_TYPE;

	info->coded_width = SRC_WIDTH;

	info->coded_height = SRC_HEIGHT;

	info->chroma_format = CHROMA_FORMAT;

	return 1;


// Entry point: checks compute capability, creates a CUDA context with
// OpenGL interop, then builds the frame queue and the video decoder.
// NOTE(review): braces appear to have been stripped by the forum paste
// (main's opening brace and the if-block's closing brace are missing), so
// this listing is not compilable exactly as shown.
int main(int argc,char **argv)



   int i_frame = 0;

   const char *filename="Carlitos_Way_transcoded00.264"; 

	// Initialize CUDA

	// Check for a min spec of Compute 1.1 capability before running

	if (!cutilDrvCudaCapabilities(1,1)) {

		cutilExit(0, NULL);


	CUdevice cuda_device;

	cuda_device = cutilDrvGetMaxGflopsDeviceId();

	cutilDrvSafeCallNoSync(cuDeviceGet(&device, cuda_device ));

	// Create CUDA Device w/ GL interop

	// (use CU_CTX_BLOCKING_SYNC for better CPU synchronization)

	// NOTE(review): the CUresult of cuGLCtxCreate() is discarded. If context
	// creation fails here (e.g. no GL context is current on this thread),
	// every later cuvid call — including cuvidCreateDecoder — can fail with
	// CUDA_ERROR_NO_DEVICE. Check this return value first.
	cuGLCtxCreate(&ctx, CU_CTX_BLOCKING_SYNC, device);

	// NOTE(review): ctxLock is still NULL at this point (it is initialized to
	// NULL above and no cuvidCtxLockCreate() call is visible) — locking a
	// NULL CUvideoctxlock is suspect; create the lock before using it.
	CCtxAutoLock lck(ctxLock);


	videoCreateFlags = cudaVideoCreate_Default;

	memset( &videoFormatInfo, 0, sizeof(CUVIDEOFORMAT));

	setCUVIDEOFORMATINFO( &videoFormatInfo );

	frameQueue = new FrameQueue();

	// Constructor below is where cuvidCreateDecoder() reportedly fails.
	videoDecoder = new VideoDecoder( videoFormatInfo, ctx, videoCreateFlags, ctxLock );

				//Other code after the error


Code of the VideoDecoder constructor, where I get the error.

// Constructs a VideoDecoder: copies the context/create flags, validates the
// input video format, fills a CUVIDDECODECREATEINFO and creates the
// hardware decoder via cuvidCreateDecoder().
// NOTE(review): braces appear stripped by the forum paste (constructor
// opening brace, switch closing brace, while-loop body), so this listing is
// not compilable exactly as shown.
VideoDecoder::VideoDecoder(const CUVIDEOFORMAT & rVideoFormat, 

						   CUcontext &rContext, 

						   cudaVideoCreateFlags eCreateFlags, 

						   CUvideoctxlock &ctx) 

	: m_CtxLock(ctx)


	// get a copy of the CUDA context
	m_Context		  = rContext;

	m_VideoCreateFlags = eCreateFlags;

	// Log which decode path was requested.
	printf("> VideoDecoder::cudaVideoCreateFlags = <%d>", (int)eCreateFlags);

	switch (eCreateFlags) {

		case cudaVideoCreate_Default:	printf("Default (VP)\n"); break;

		case cudaVideoCreate_PreferCUDA: printf("Use CUDA decoder\n"); break;

		case cudaVideoCreate_PreferDXVA: printf("Use DXVA decoder\n"); break;

		default: printf("Unknown value\n"); break;


			// Validate video format. Currently only a subset is 

			// supported via the cuvid API.

	cudaVideoCodec eCodec = rVideoFormat.codec;

	assert(cudaVideoCodec_MPEG1 == eCodec || cudaVideoCodec_MPEG2 == eCodec || cudaVideoCodec_VC1 == eCodec || cudaVideoCodec_H264 == eCodec);

	assert(cudaVideoChromaFormat_420 == rVideoFormat.chroma_format);

			// Fill the decoder-create-info struct from the given video-format struct.
	memset(&oVideoDecodeCreateInfo_, 0, sizeof(CUVIDDECODECREATEINFO));

			// Create video decoder
	oVideoDecodeCreateInfo_.CodecType		   = rVideoFormat.codec;

	oVideoDecodeCreateInfo_.ulWidth			 = rVideoFormat.coded_width;

	oVideoDecodeCreateInfo_.ulHeight			= rVideoFormat.coded_height;

	oVideoDecodeCreateInfo_.ulNumDecodeSurfaces = FrameQueue::cnMaximumSize;

			// Limit decode memory to 24MB (16M pixels at 4:2:0 = 24M bytes)
	// NOTE(review): the loop body was lost in the paste — in the SDK
	// VideoDecoder sample it decrements ulNumDecodeSurfaces until the
	// surface pool fits the budget; confirm against the original sample.
	while (oVideoDecodeCreateInfo_.ulNumDecodeSurfaces * rVideoFormat.coded_width * rVideoFormat.coded_height > 16*1024*1024)




	oVideoDecodeCreateInfo_.ChromaFormat		= rVideoFormat.chroma_format;

	oVideoDecodeCreateInfo_.OutputFormat		= cudaVideoSurfaceFormat_NV12;

	oVideoDecodeCreateInfo_.DeinterlaceMode	 = cudaVideoDeinterlaceMode_Adaptive;

			// No scaling
	oVideoDecodeCreateInfo_.ulTargetWidth	   = oVideoDecodeCreateInfo_.ulWidth;

	oVideoDecodeCreateInfo_.ulTargetHeight	  = oVideoDecodeCreateInfo_.ulHeight;

	oVideoDecodeCreateInfo_.ulNumOutputSurfaces = 2;  // We won't simultaneously map more than 2 surfaces

	oVideoDecodeCreateInfo_.ulCreationFlags	 = m_VideoCreateFlags;

	// NOTE(review): this is the CUvideoctxlock passed in — which main()
	// left NULL; cuvidCreateDecoder may reject or misbehave on a NULL lock.
	oVideoDecodeCreateInfo_.vidLock			 = ctx;

			// create the decoder
	CUresult oResult = cuvidCreateDecoder(&oDecoder_, &oVideoDecodeCreateInfo_);




	// NOTE(review): assert() is compiled out in release (NDEBUG) builds, so
	// a failing CUresult would be silently ignored there — handle the error
	// explicitly in production code.
	  assert(CUDA_SUCCESS == oResult);


I get the error after cuvidCreateDecoder().

What could be the problem?

Many thanks.

Best regards.

If I only want to decode the frames without showing them, but I want to transfer the decoded frames to host memory in order to re-encode them, can I omit the OpenGL part? Is OpenGL only for visualization, or is it also needed for simple decoding? Does anybody know of documentation or anything else that explains the API?

Sure, you can decode the frames without the OpenGL/D3D part!

Here's how (I will show the steps for calling the CUDA functions):












I have been studying CUDA for a few days and I have tested these steps — it works!

Hope this will help you.