Dear AastaLLL,
Thanks for your answer! Of course I am aware of MMAPI. Let’s go deeper with a working code example I have so far.
The following code takes the Bayer→RGB→YUV420 image (processed by NVIDIA NPP) from a camera render loop — which can run at up to 100 fps at 2064x1544 resolution — and pushes the frames to the HW video encoder.
void H26xEncoder::pushFrame(uint8_t** framePlanes, int* framePlaneSizes, int Planes)
{
if (Initialized == true && framePlanes!=NULL) {
// GPU time 1: Measure time for buffer processing + cudaMemcpy
const int64 start1 = getTickCount();
struct v4l2_buffer v4l2_buf;
struct v4l2_plane planes[MAX_PLANES];
memset(&v4l2_buf, 0, sizeof(v4l2_buf));
memset(planes, 0, MAX_PLANES * sizeof(struct v4l2_plane));
v4l2_buf.m.planes = planes;
// Check if we need dqBuffer first
if (bufferIndex < MAX_ENCODER_FRAMES &&
ctx.enc->output_plane.getNumQueuedBuffers() <
ctx.enc->output_plane.getNumBuffers())
{
// The queue is not full, no need to dqBuffer
// Prepare buffer index for the following qBuffer
printf("bufferIndex: %d\n",bufferIndex);
v4l2_buf.index = bufferIndex++;
NvBufferCreateParams init_params = {0};
init_params.width = 2064;
init_params.height = 1544;
init_params.layout = NvBufferLayout_Pitch;
init_params.colorFormat = NvBufferColorFormat_YUV420;
if (NvBufferCreateEx(&fd, &init_params)==-1) {
printf("Failed to create dma_buf\n");
} else {
printf("fd=%d\n",fd);
}
}
else
{
ctx.enc->output_plane.dqBuffer(v4l2_buf, NULL, NULL, 10); // 10
fd = v4l2_buf.m.planes[0].m.fd;
}
NvBufferParams params;
NvBufferGetParams(fd, ¶ms);
status = cuGraphicsEGLRegisterImage(&resource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD);
if (status != CUDA_SUCCESS) {
printf("cuGraphicsEGLRegisterImage failed: %d.\n", status);
}
status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, resource, 0, 0);
if (status != CUDA_SUCCESS) {
printf("cuGraphicsResourceGetMappedEglFrame failed: %d.\n", status);
}
eglImage = NvEGLImageFromFd (display, fd);
if(eglImage == NULL) {
cout << "create eglImage failed" << endl;
}
int bytes_to_read;
bytes_to_read = params.pitch[0] * params.height[0];
cudaMemcpy((cudaArray_t)eglFrame.frame.pArray[0],framePlanes[0],bytes_to_read,cudaMemcpyDeviceToDevice);
bytes_to_read = params.pitch[1] * params.height[1];
cudaMemcpy((cudaArray_t)eglFrame.frame.pArray[1],framePlanes[1],bytes_to_read,cudaMemcpyDeviceToDevice);
bytes_to_read = params.pitch[2] * params.height[2];
cudaMemcpy((cudaArray_t)eglFrame.frame.pArray[2],framePlanes[2],bytes_to_read,cudaMemcpyDeviceToDevice);
// Push the frame into V4L2.
v4l2_buf.m.planes[0].m.fd = fd;
v4l2_buf.m.planes[0].bytesused = 1; // byteused must be non-zero
ctx.enc->output_plane.qBuffer(v4l2_buf, NULL);
// Measure time until pushing into buffer
const double timeSec1 = (getTickCount() - start1) / getTickFrequency();
cout << "GPU Time 1 : " << timeSec1 * 1000 << " ms" << endl;
// Measure time for resource unregister function
const int64 start = getTickCount();
status = cuGraphicsUnregisterResource(resource);
if (status != CUDA_SUCCESS)
{
printf("cuGraphicsEGLUnRegisterResource failed: %d\n", status);
}
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "GPU Time 2 : " << timeSec * 1000 << " ms" << endl;
} // end of if initialized==true
} // end of pushFrame function
Getting the processed image via cudaMemcpy and pushing it into the v4l2 buffer takes less than a millisecond, but the “cuGraphicsUnregisterResource” call takes around 15 ms, as shown below.
GPU Time 1 : 0.699259 ms
GPU Time 2 : 15.4409 ms
GPU Time 1 : 0.708219 ms
GPU Time 2 : 16.8548 ms
GPU Time 1 : 0.688091 ms
GPU Time 2 : 15.3245 ms
GPU Time 1 : 0.728859 ms
GPU Time 2 : 14.7805 ms
GPU Time 1 : 0.612476 ms
GPU Time 2 : 17.6632 ms
If I don’t call “cuGraphicsUnregisterResource” function at the end, I get memory leaks and sometimes crashes. I would like to reduce this delay to achieve 100 fps rendering+encoding speed.
I will be happy if you help me to resolve this issue.
Thanks and best regards,
Burak