Video_Codec_SDK_8.2.16 does not seem to work well with CuvidDecoder

I work with a P4 GPU, referencing the example in Video_Codec_SDK_8.2.16/Samples/AppDecode/AppDec/AppDec.cpp. It takes 2.5 ms to decode one frame of 1920x1080, the GPU utilization rises to 100%, and it takes 5+ GB of GPU memory when 10 streams are being decoded at the same time (my input is 10 IP cameras; I create one thread per camera).

thanks!!

The decode code is as below:

// --- CUDA initialization: one context per camera thread ---
ck(cuInit(0));
int nGpu = 0;
ck(cuDeviceGetCount(&nGpu));
// Fall back to device 0 when the configured GPU id is out of range.
if (m_gpuId >= nGpu)
	m_gpuId = 0;
CUdevice cuDevice = 0;
ck(cuDeviceGet(&cuDevice, m_gpuId));
// NOTE(review): each camera thread creates its own CUDA context; with 10
// threads this multiplies the per-context GPU memory overhead. Sharing one
// context (or using cuDevicePrimaryCtxRetain) across threads would reduce
// total GPU memory use.
ck(cuCtxCreate(&m_cuContext, 0, cuDevice));

// Create the NVDEC decoder for the stream's codec and dimensions.
// On failure: log, tear down the context, and abandon this thread.
try {
	m_dec = new NvDecoder(m_cuContext, m_demuxer->GetWidth(), m_demuxer->GetHeight(), true, FFmpeg2NvCodecId(m_demuxer->GetVideoCodec()));
} catch (std::exception & e) {
	printf("%s, line %d, rtsp %s, err %s\n", __func__, __LINE__, m_rtsp.c_str(), e.what());
	ck(cuCtxDestroy(m_cuContext));
	m_cuContext = NULL;
	return;
}

int nFrame = 0;            // total frames decoded so far (used for the first-frame log)
uint8_t *pVideo = NULL;    // demuxed bitstream packet (buffer owned by the demuxer)
int nVideoBytes = 0;
uint8_t **ppFrame;         // decoded NV12/P016 device frames (owned by NvDecoder)
int nFrameReturned = 0;
double start_time = cvGetTickCount();
// Demux/decode loop: runs until the stream ends or m_state signals a stop.
while (1) {
	if (m_state > 0)
		break;
	double time1 = cvGetTickCount();
	try {
		m_demuxer->Demux(&pVideo, &nVideoBytes);
		m_dec->Decode(pVideo, nVideoBytes, &ppFrame, &nFrameReturned);
	} catch (std::exception & e) {
		// Best-effort: log and retry on transient demux/decode errors.
		printf("%s, line %d, rtsp %s, err %s\n", __func__, __LINE__, m_rtsp.c_str(), e.what());
		continue;
	}
	//printf("decode %f ms, rtsp %s\n", (cvGetTickCount() - time1) / cv_per_count, m_rtsp.c_str());
    if (!nFrame && nFrameReturned) {
        LOG(INFO) << m_dec->GetVideoInfo();
		printf("first frame %f ms\n", (cvGetTickCount() - start_time) / cv_per_count );
		start_time = cvGetTickCount();
	}
	// (Re)allocate the device-side conversion buffers whenever the stream
	// resolution changes. NOTE(review): this relies on m_width/m_height
	// initially differing from the demuxer's dimensions so the first pass
	// allocates m_dpFrame/m_bgr24 — confirm their initial values in the
	// constructor, otherwise the conversions below run on null buffers.
	if (m_demuxer->GetWidth() != m_width || m_demuxer->GetHeight() != m_height) {
		m_width = m_demuxer->GetWidth();
		m_height = m_demuxer->GetHeight();
		if (m_width < 1 || m_height < 1)
			continue;
		if (m_dpFrame)
			cuMemFree(m_dpFrame);
		ck(cuMemAlloc(&m_dpFrame, m_width * m_height * 4 * sizeof(uint8_t)));  // BGRA32 scratch
		if (m_bgr24)
			cuMemFree(m_bgr24);
		ck(cuMemAlloc(&m_bgr24, m_width * m_height * 3 * sizeof(uint8_t)));    // BGR24 output
	}

	//printf("%s, nFrameReturned %d\n", __func__, nFrameReturned);
    nFrame += nFrameReturned;
    for (int i = 0; i < nFrameReturned; i++) {
		double time2 = cvGetTickCount() / cv_per_count ;
        if (m_demuxer->GetBitDepth() == 8) {
            // 8-bit stream: NV12 -> BGRA32 -> BGR24, both kernels on the device.
            Nv12ToBgra32((uint8_t *)ppFrame[i], m_width, (uint8_t *)m_dpFrame, 4 * m_width, m_width, m_height);
			Bgra32ToBgr24((uint8_t *)m_dpFrame, (uint8_t *)m_bgr24, m_width, m_height);
		} else {
            // >8-bit stream: P016 -> BGRA32, then down to BGR24.
            // BUG FIX: the original code omitted Bgra32ToBgr24 in this branch,
            // so the cuMemcpyDtoH below copied stale m_bgr24 contents for
            // 10/12-bit streams.
            P016ToBgra32((uint8_t *)ppFrame[i], 2 * m_width, (uint8_t *)m_dpFrame, 4 * m_width, m_width, m_height);
            Bgra32ToBgr24((uint8_t *)m_dpFrame, (uint8_t *)m_bgr24, m_width, m_height);
		}

		double time3 = cvGetTickCount() / cv_per_count ;
		cv::Mat image(m_height, m_width, CV_8UC3);
		double time4 = cvGetTickCount() / cv_per_count ;
        // Copy the converted BGR frame from device memory into the host cv::Mat.
        ck(cuMemcpyDtoH(image.data, m_bgr24, m_width * m_height * 3));
		double time5 = cvGetTickCount() / cv_per_count ;
		//printf("index %d, nFrameReturn %d, color cvt %f ms, create image %f ms, memcpy %f ms\n", i, nFrameReturned, time3 - time2, time4 - time3, time5 - time4);

		// For live (non-file) sources, drop the oldest queued frames instead
		// of blocking when the consumer falls behind.
		if (m_video_type != VIDEO) {
			while (m_queue->full()) {
				m_queue->pop();
			}
		}
		m_queue->push_back(image);
    }
	// End of stream: the demuxer returned an empty packet and the decoder
	// has flushed its remaining frames.
	if (nVideoBytes < 1)
		break;
}

Hi,
Can you try removing the line ck(cuMemcpyDtoH(image.data, m_bgr24, m_width * m_height * 3))
and see whether "takes 5+GB GPU memory when 10 frames decoding at the same time" still occurs?
If it still occurs, can you provide a sample that can be built and run,
for further reproduction and analysis?

This time it takes 2285 MB of GPU memory, whether or not the color-space conversion code is executed.
The GPU utilization is 4% if I return right after m_dec->Decode(pVideo, nVideoBytes, &ppFrame, &nFrameReturned); but it increases to 100% when Nv12ToBgra32((uint8_t *)ppFrame[i], m_width, (uint8_t *)m_dpFrame, 4 * m_width, m_width, m_height); is included.

Hi
The P4 has a dedicated hardware-accelerated video decode engine; that is why you get 4% GPU utilization if you return right after m_dec->Decode(pVideo, nVideoBytes, &ppFrame, &nFrameReturned); but it increases to 100% when Nv12ToBgra32((uint8_t *)ppFrame[i], m_width, (uint8_t *)m_dpFrame, 4 * m_width, m_width, m_height); is included,
since the color-space conversion runs on the GPU's compute cores.

So this seems normal.
https://images.nvidia.com/content/pdf/tesla/184457-Tesla-P4-Datasheet-NV-Final-Letter-Web.pdf