I am working with a P4 GPU, following the example in Video_Codec_SDK_8.2.16/Samples/AppDecode/AppDec/AppDec.cpp. It takes 2.5 ms to decode one 1920x1080 frame, but GPU-Util rises to 100% and more than 5 GB of GPU memory is used when 10 frames are being decoded at the same time (my input is 10 IP cameras, and I create one thread for each of them).
thanks!!
The decoding code is as follows:
// Per-stream decode routine (body of a worker function; its header is not part
// of this snippet).
//
// Steps:
//   1. Initialise the CUDA driver API, pick device m_gpuId (falling back to
//      device 0 when the index is out of range) and create a context on it.
//   2. Construct an NvDecoder for the demuxer's codec and resolution. On
//      failure the context is destroyed and the routine returns.
//   3. Loop until m_state becomes positive or the demuxer yields no more
//      bytes: demux a packet, decode it, convert every returned frame to
//      packed BGR24 on the device, copy it into a cv::Mat and queue it.
//
// NOTE(review): this routine creates its own CUDA context and its own decoder.
// When it runs once per camera thread, each instance carries that cost
// separately -- presumably a large part of the reported memory footprint.
// Sharing one context per device (e.g. the primary context) is worth
// evaluating; confirm against the CUDA / Video Codec SDK documentation.
ck(cuInit(0));
int nGpu = 0;
ck(cuDeviceGetCount(&nGpu));
if (m_gpuId >= nGpu)
    m_gpuId = 0;
CUdevice cuDevice = 0;
ck(cuDeviceGet(&cuDevice, m_gpuId));
ck(cuCtxCreate(&m_cuContext, 0, cuDevice));
try {
    m_dec = new NvDecoder(m_cuContext, m_demuxer->GetWidth(), m_demuxer->GetHeight(), true, FFmpeg2NvCodecId(m_demuxer->GetVideoCodec()));
} catch (const std::exception &e) {
    printf("%s, line %d, rtsp %s, err %s\n", __func__, __LINE__, m_rtsp.c_str(), e.what());
    // Without a decoder the context is useless; release it before bailing out.
    ck(cuCtxDestroy(m_cuContext));
    m_cuContext = nullptr;
    return;
}
int nFrame = 0;                // total frames decoded so far
uint8_t *pVideo = nullptr;     // current compressed packet (set by Demux)
int nVideoBytes = 0;           // size of the current packet
uint8_t **ppFrame = nullptr;   // frames returned by the last Decode call
int nFrameReturned = 0;        // number of entries in ppFrame
double start_time = cvGetTickCount();
while (1) {
    // A positive m_state stops the loop.
    if (m_state > 0)
        break;
    double time1 = cvGetTickCount();
    try {
        m_demuxer->Demux(&pVideo, &nVideoBytes);
        m_dec->Decode(pVideo, nVideoBytes, &ppFrame, &nFrameReturned);
    } catch (const std::exception &e) {
        printf("%s, line %d, rtsp %s, err %s\n", __func__, __LINE__, m_rtsp.c_str(), e.what());
        // NOTE(review): a persistently failing stream retries here without any
        // back-off until m_state changes.
        continue;
    }
    //printf("decode %f ms, rtsp %s\n", (cvGetTickCount() - time1) / cv_per_count, m_rtsp.c_str());
    if (!nFrame && nFrameReturned) {
        // First decoded frame: log the stream info and the start-up latency.
        LOG(INFO) << m_dec->GetVideoInfo();
        printf("first frame %f ms\n", (cvGetTickCount() - start_time) / cv_per_count );
        start_time = cvGetTickCount();
    }
    if (m_demuxer->GetWidth() != m_width || m_demuxer->GetHeight() != m_height) {
        // Resolution changed (or first pass): resize the device staging buffers.
        m_width = m_demuxer->GetWidth();
        m_height = m_demuxer->GetHeight();
        if (m_width < 1 || m_height < 1)
            continue;
        // Compute sizes in size_t so the products cannot overflow int.
        const size_t nPixels = static_cast<size_t>(m_width) * static_cast<size_t>(m_height);
        if (m_dpFrame) {
            cuMemFree(m_dpFrame);
            // Zero the handle so a failed allocation below cannot leave a
            // dangling pointer to be used or freed again later.
            m_dpFrame = 0;
        }
        ck(cuMemAlloc(&m_dpFrame, nPixels * 4 * sizeof(uint8_t)));
        if (m_bgr24) {
            cuMemFree(m_bgr24);
            m_bgr24 = 0;
        }
        ck(cuMemAlloc(&m_bgr24, nPixels * 3 * sizeof(uint8_t)));
    }
    //printf("%s, nFrameReturned %d\n", __func__, nFrameReturned);
    nFrame += nFrameReturned;
    for (int i = 0; i < nFrameReturned; i++) {
        double time2 = cvGetTickCount() / cv_per_count ;
        // Stage 1: decoder surface -> BGRA32 in m_dpFrame.
        if (m_demuxer->GetBitDepth() == 8) {
            Nv12ToBgra32((uint8_t *)ppFrame[i], m_width, (uint8_t *)m_dpFrame, 4 * m_width, m_width, m_height);
        } else {
            P016ToBgra32((uint8_t *)ppFrame[i], 2 * m_width, (uint8_t *)m_dpFrame, 4 * m_width, m_width, m_height);
        }
        // Stage 2: BGRA32 -> packed BGR24. This must run for BOTH bit depths:
        // previously the high-bit-depth path skipped it, so m_bgr24 held stale
        // data that was then copied to the host and queued.
        Bgra32ToBgr24((uint8_t *)m_dpFrame, (uint8_t *)m_bgr24, m_width, m_height);
        double time3 = cvGetTickCount() / cv_per_count ;
        cv::Mat image(m_height, m_width, CV_8UC3);
        double time4 = cvGetTickCount() / cv_per_count ;
        ck(cuMemcpyDtoH(image.data, m_bgr24, static_cast<size_t>(m_width) * static_cast<size_t>(m_height) * 3));
        double time5 = cvGetTickCount() / cv_per_count ;
        //printf("index %d, nFrameReturn %d, color cvt %f ms, create image %f ms, memcpy %f ms\n", i, nFrameReturned, time3 - time2, time4 - time3, time5 - time4);
        if (m_video_type != VIDEO) {
            // For non-VIDEO sources, make room by popping queued frames
            // while the queue is full.
            while (m_queue->full()) {
                m_queue->pop();
            }
        }
        m_queue->push_back(image);
    }
    // An empty packet ends the loop; it is checked only after the frames
    // returned by this Decode call have been queued.
    if (nVideoBytes < 1)
        break;
}