Please provide complete information as applicable to your setup.
• Hardware Platform (Jetson / GPU) Jetson Orin NX 16 GB
• DeepStream Version 6.3-triton-multiarch docker image
• JetPack Version (valid for Jetson only) 5.1.1
Hello everyone,
I’m developing a GStreamer plugin that stitches 2 frames. I use cuGraphicsEGLRegisterImage to run CUDA operations on NvBufSurface buffers, mapping 2 input frames and 1 output frame per cycle. When I measure the timings, mapping the input frames gets slower with every call, while mapping the output frame does not.
After some time, the plugin’s execution time exceeds the input frame interval (about 33 ms at 30 FPS) and causes frame drops.
I added execution time plots and the relevant part of the gstreamer pipeline at this post’s end. Can someone help me with this, please?
Mentioned code block:
/* Timing probe: t1..t2 and t2..t3 bracket the input-frame EGL register/map
 * steps (the ones observed to slow down), t4..t5 brackets the output frame. */
clock_gettime(CLOCK_MONOTONIC, &t1);
CHECK_CUDA(cuCtxSynchronize(), "cuCtxSynchronize failed");

/* --- Map left input frame --- */
/* NOTE(review): input surfaces typically come from an upstream buffer pool,
 * so a different NvBufSurface can arrive each frame. The NULL check then
 * maps a brand-new EGLImage almost every call, and without a matching
 * NvBufSurfaceUnMapEglImage (added after unregistration below) live
 * EGLImages accumulate — the classic cause of cuGraphicsEGLRegisterImage
 * getting slower over time. The output surface is the plugin's own reused
 * buffer, which is why its t4-t5 timing stays flat. */
if (input_nvbuf_surfs[0]->surfaceList[0].mappedAddr.eglImage == NULL)
    NvBufSurfaceMapEglImage(input_nvbuf_surfs[0], 0);
eglimage_src_left = input_nvbuf_surfs[0]->surfaceList[0].mappedAddr.eglImage;
CHECK_CUDA(cuGraphicsEGLRegisterImage(&pResource_left, eglimage_src_left, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE), "cuGraphicsEGLRegisterImage failed");
/* Error text fixed: this calls cuGraphicsResourceGetMappedEglFrame, not
 * cuGraphicsSubResourceGetMappedArray. */
CHECK_CUDA(cuGraphicsResourceGetMappedEglFrame(&eglFrame_left, pResource_left, 0, 0), "cuGraphicsResourceGetMappedEglFrame failed");
clock_gettime(CLOCK_MONOTONIC, &t2);

/* --- Map right input frame --- */
if (input_nvbuf_surfs[1]->surfaceList[0].mappedAddr.eglImage == NULL)
    NvBufSurfaceMapEglImage(input_nvbuf_surfs[1], 0);
eglimage_src_right = input_nvbuf_surfs[1]->surfaceList[0].mappedAddr.eglImage;
CHECK_CUDA(cuGraphicsEGLRegisterImage(&pResource_right, eglimage_src_right, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE), "cuGraphicsEGLRegisterImage failed");
CHECK_CUDA(cuGraphicsResourceGetMappedEglFrame(&eglFrame_right, pResource_right, 0, 0), "cuGraphicsResourceGetMappedEglFrame failed");
clock_gettime(CLOCK_MONOTONIC, &t3);

CHECK_CUDA(cuCtxSynchronize(), "cuCtxSynchronize failed");

/* Wrap the mapped device memory as pitched GpuMats (zero-copy views).
 * FIX: pass the surface pitch as the GpuMat step — NvBufSurface rows are
 * pitch-aligned (the output path already uses `pitch` for cudaMemcpy2D);
 * the step-less constructor assumes tightly packed rows (cols*4 bytes)
 * and reads skewed data whenever pitch != width*4. */
input_left = cv::cuda::GpuMat(input_nvbuf_surfs[0]->surfaceList[0].height, input_nvbuf_surfs[0]->surfaceList[0].width, CV_8UC4, eglFrame_left.frame.pPitch[0], input_nvbuf_surfs[0]->surfaceList[0].pitch);
input_right = cv::cuda::GpuMat(input_nvbuf_surfs[1]->surfaceList[0].height, input_nvbuf_surfs[1]->surfaceList[0].width, CV_8UC4, eglFrame_right.frame.pPitch[0], input_nvbuf_surfs[1]->surfaceList[0].pitch);
/* Some opencv CUDA operations eg. cv::cuda::add(), cv::cuda::multiply() */
/* Sync processing */
CHECK_CUDA(cuCtxSynchronize(), "cuCtxSynchronize failed");

/* Unregister input resources (error text fixed to match the API called). */
CHECK_CUDA(cuGraphicsUnregisterResource(pResource_left), "cuGraphicsUnregisterResource failed");
CHECK_CUDA(cuGraphicsUnregisterResource(pResource_right), "cuGraphicsUnregisterResource failed");
/* FIX: release the input EGLImages each cycle so handles cannot pile up
 * across pooled surfaces. Re-mapping a reused surface next frame costs a
 * little, but the cost is constant instead of growing without bound. */
NvBufSurfaceUnMapEglImage(input_nvbuf_surfs[0], 0);
NvBufSurfaceUnMapEglImage(input_nvbuf_surfs[1], 0);

if (NvBufSurfaceFromFd(outmem->buf->dmabuf_fd, (void **)(&dst_nvbuf_surf)) != 0)
{
    GST_ERROR("NvBufSurfaceFromFd failed");
    return FALSE;
}
int width = dst_nvbuf_surf->surfaceList[0].width;
int height = dst_nvbuf_surf->surfaceList[0].height;
int pitch = dst_nvbuf_surf->surfaceList[0].pitch;

clock_gettime(CLOCK_MONOTONIC, &t4);
/* --- Map output frame --- */
/* Caching the EGLImage is fine here because this buffer is plugin-owned
 * and reused every frame — presumably; TODO confirm outmem is not pooled. */
if (dst_nvbuf_surf->surfaceList[0].mappedAddr.eglImage == NULL)
    NvBufSurfaceMapEglImage(dst_nvbuf_surf, 0);
eglimage_src_output = dst_nvbuf_surf->surfaceList[0].mappedAddr.eglImage;
CHECK_CUDA(cuGraphicsEGLRegisterImage(&pResource_output, eglimage_src_output, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE), "cuGraphicsEGLRegisterImage failed");
CHECK_CUDA(cuGraphicsResourceGetMappedEglFrame(&eglFrame_output, pResource_output, 0, 0), "cuGraphicsResourceGetMappedEglFrame failed");
clock_gettime(CLOCK_MONOTONIC, &t5);

/* Pitched device-to-device copy of the stitched panorama into the output
 * surface; source step comes from the GpuMat, destination from the surface.
 * memcpy ref: https://forums.developer.nvidia.com/t/copy-opencv-gpumat-data-to-an-nvbuffer/189812/10 */
cudaError_t err = cudaMemcpy2D(
    eglFrame_output.frame.pPitch[0],
    pitch,
    pano_out.ptr<uint8_t>(),
    pano_out.step,
    width * BGRA_BYTES_PER_PIXEL,
    height,
    cudaMemcpyDeviceToDevice);
if (err != cudaSuccess)
{
    g_printerr("ERROR cudaMemcpy2D: %s\n", cudaGetErrorString(err));
}
/* Sync so the copy has completed before the buffer is pushed downstream. */
CHECK_CUDA(cuCtxSynchronize(), "cuCtxSynchronize failed");
CHECK_CUDA(cuGraphicsUnregisterResource(pResource_output), "cuGraphicsUnregisterResource failed");

/* Per-stage timings in milliseconds (t1-t2 left map, t2-t3 right map,
 * t4-t5 output map). */
elapsed_time = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_nsec - t1.tv_nsec) / 1000000.0;
g_print("[stitcher] t1-t2: %f ", elapsed_time);
elapsed_time = (t3.tv_sec - t2.tv_sec) * 1000.0 + (t3.tv_nsec - t2.tv_nsec) / 1000000.0;
g_print("t2-t3: %f ms ", elapsed_time);
elapsed_time = (t5.tv_sec - t4.tv_sec) * 1000.0 + (t5.tv_nsec - t4.tv_nsec) / 1000000.0;
g_print("t4-t5: %f ms\n", elapsed_time);
This is part of the output of this code; I parse it in Python and plot it.
[stitcher] t1-t2: 1.194188 t2-t3: 1.893768 ms t4-t5: 0.822367 ms
[stitcher] t1-t2: 1.199661 t2-t3: 1.203373 ms t4-t5: 0.803198 ms
[stitcher] t1-t2: 1.392917 t2-t3: 1.647422 ms t4-t5: 0.802334 ms
[stitcher] t1-t2: 1.336371 t2-t3: 1.165868 ms t4-t5: 0.991877 ms
[stitcher] t1-t2: 1.340626 t2-t3: 1.761154 ms t4-t5: 0.851744 ms