Performance issue getting CUeglFrame from EGLImageKHR

Hello,

I developed a GStreamer plugin by modifying nvcompositor to process and combine two images.
I mostly modified the code in the do_nvcomposite function, replacing the call to NvBufferComposite with my own CUDA processing. To do so, inspired by this post, I call the following for my 3 images (2 inputs and 1 output):

  • NvEGLImageFromFd to retrieve EGLImageKHR instances
  • then cuGraphicsEGLRegisterImage
  • then cuGraphicsResourceGetMappedEglFrame to get a CUeglFrame instance
  • (apply CUDA processing to combine the 2 inputs into the output)
  • cuGraphicsUnregisterResource
  • then NvDestroyEGLImage

While trying to optimize my CUDA kernels to reduce the processing time, I realized that the calls to cuGraphicsEGLRegisterImage, cuGraphicsResourceGetMappedEglFrame and cuGraphicsUnregisterResource alone take 6 to 8 milliseconds per frame (for the 3 images), which seems like a lot.
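
For reference, here is a minimal sketch of how each of the three calls could be timed separately for a single EGLImageKHR. It is illustrative only: the helper name TimeInteropCalls is made up, and it assumes a CUDA context is already current on the calling thread.

#include <chrono>
#include <cstdio>
#include <cudaEGL.h>

// Illustrative helper: registers one EGLImageKHR with CUDA, maps it to a
// CUeglFrame, unregisters it, and prints how long each call takes.
static void TimeInteropCalls(EGLImageKHR egl_image)
{
	using clock = std::chrono::high_resolution_clock;
	auto ms = [](clock::time_point t0) {
		return std::chrono::duration<double, std::milli>(clock::now() - t0).count();
	};

	CUgraphicsResource resource = NULL;
	CUeglFrame frame;

	auto t0 = clock::now();
	CUresult res = cuGraphicsEGLRegisterImage(&resource, egl_image,
	                                          CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
	printf("cuGraphicsEGLRegisterImage:          %.3f ms (res=%d)\n", ms(t0), (int)res);
	if (res != CUDA_SUCCESS)
		return;

	t0 = clock::now();
	res = cuGraphicsResourceGetMappedEglFrame(&frame, resource, 0, 0);
	printf("cuGraphicsResourceGetMappedEglFrame: %.3f ms (res=%d)\n", ms(t0), (int)res);

	t0 = clock::now();
	res = cuGraphicsUnregisterResource(resource);
	printf("cuGraphicsUnregisterResource:        %.3f ms (res=%d)\n", ms(t0), (int)res);
}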

Is this something expected, or is there something I should do differently?

In case it helps, here is a typical pipeline to reproduce:

gst-launch-1.0 -e  \
filesrc location=$path1  ! qtdemux ! h264parse ! omxh264dec ! nvvidconv ! video/x-raw\(memory:NVMM\),format=RGBA,width=3024,height=2280 ! mix. \
filesrc location=$path2 ! qtdemux ! h264parse ! omxh264dec ! nvvidconv ! video/x-raw\(memory:NVMM\),format=RGBA,width=3024,height=2280 ! mix. \
my_plugin name=mix ! \
video/x-raw\(memory:NVMM\),format=RGBA,width=6048,height=2280 ! \
fakesink

and a few code extracts:

static gboolean
do_nvcomposite (GstVideoAggregator * vagg, gint out_dmabuf_fd)
{
	// Beginning unchanged
	// ...


	EGLImageKHR image1 = NvEGLImageFromFd(egl_display, input_dmabuf_fds[0]);
	if (image1 == EGL_NO_IMAGE_KHR)
	{
		GST_ERROR("NvEGLImageFromFd failed for image 1");
		return FALSE;
	}

	EGLImageKHR image2 = NvEGLImageFromFd(egl_display, input_dmabuf_fds[1]);
	if (image2 == EGL_NO_IMAGE_KHR)
	{
		GST_ERROR("NvEGLImageFromFd failed for image 2");
		return FALSE;
	}

	EGLImageKHR out_image = NvEGLImageFromFd(egl_display, out_dmabuf_fd);
	if (out_image == EGL_NO_IMAGE_KHR)
	{
		GST_ERROR("NvEGLImageFromFd failed for output image");
		return FALSE;
	}

	// Processing
	if (ProcessFrame(image1, image2, out_image) != CUDA_SUCCESS)
	{
		GST_ERROR("ProcessFrame failed");
	}

	// Release EGL images
	NvDestroyEGLImage(egl_display, image1);
	NvDestroyEGLImage(egl_display, image2);
	NvDestroyEGLImage(egl_display, out_image);

	// end of function unchanged: calls to NvReleaseFd
}
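
Side note on the extract above: if NvEGLImageFromFd fails for the second or third image, the EGL images already created are not destroyed before returning. Below is a purely illustrative variant with a shared cleanup path, assuming the surrounding code stays the same:

	/* Illustrative only: same calls as in do_nvcomposite above, but every
	 * error path goes through the same cleanup so already-created EGL
	 * images are always destroyed. */
	gboolean ok = FALSE;

	EGLImageKHR image1 = NvEGLImageFromFd(egl_display, input_dmabuf_fds[0]);
	EGLImageKHR image2 = NvEGLImageFromFd(egl_display, input_dmabuf_fds[1]);
	EGLImageKHR out_image = NvEGLImageFromFd(egl_display, out_dmabuf_fd);

	if (image1 == EGL_NO_IMAGE_KHR || image2 == EGL_NO_IMAGE_KHR ||
	    out_image == EGL_NO_IMAGE_KHR)
		GST_ERROR("NvEGLImageFromFd failed");
	else if (ProcessFrame(image1, image2, out_image) != CUDA_SUCCESS)
		GST_ERROR("ProcessFrame failed");
	else
		ok = TRUE;

	if (image1 != EGL_NO_IMAGE_KHR)
		NvDestroyEGLImage(egl_display, image1);
	if (image2 != EGL_NO_IMAGE_KHR)
		NvDestroyEGLImage(egl_display, image2);
	if (out_image != EGL_NO_IMAGE_KHR)
		NvDestroyEGLImage(egl_display, out_image);

	// ... remaining calls to NvReleaseFd unchanged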

The method ProcessFrame is in a separate .cu file:

CUresult EGLImageToFrame(EGLImageKHR egl_image, CUgraphicsResource* pResource, CUeglFrame* pegl_frame, CUgraphicsMapResourceFlags mapFlags)
{
	CUresult cuResult;

	cuResult = cuGraphicsEGLRegisterImage(pResource, egl_image, mapFlags);
	if (cuResult != CUDA_SUCCESS) {
		HandleError(cuResult, __FILE__, __LINE__, false);
		return cuResult;
	}

	cuResult = HANDLE_ERROR_NO_ABORT(cuGraphicsResourceGetMappedEglFrame(pegl_frame, *pResource, 0, 0));
	if (cuResult != CUDA_SUCCESS) {
		Log::Error("cuGraphicsResourceGetMappedEglFrame failed.");
		return cuResult;
	}

	return cuResult;
}

static bool initialized = false;

DATSTITCH_EXPORT CUresult ProcessFrame(EGLImageKHR image1, EGLImageKHR image2, EGLImageKHR out_image)
{
	auto start = std::chrono::high_resolution_clock::now();
	CUresult res;

	// Get EGL frames from EGLImageKHR images
	cudaFree(0); // Required to use CUDA context in this thread

	CUeglFrame eglFrame1, eglFrame2, eglFrameOut;
	CUgraphicsResource pResource1 = NULL, pResource2 = NULL, pResourceOut = NULL;

	if ((res = EGLImageToFrame(image1, &pResource1, &eglFrame1, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY)) != CUDA_SUCCESS)
	{
		Log::Error("EGLImageToFrame failed for image 1");
		return res;
	}

	if ((res = EGLImageToFrame(image2, &pResource2, &eglFrame2, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY)) != CUDA_SUCCESS)
	{
		Log::Error("EGLImageToFrame failed for image 2");
		return res;
	}

	if ((res = EGLImageToFrame(out_image, &pResourceOut, &eglFrameOut, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE)) != CUDA_SUCCESS)
	{
		Log::Error("EGLImageToFrame failed for output image");
		return res;
	}

	CUresult cuResult = HANDLE_ERROR_NO_ABORT(cuCtxSynchronize());
	if (cuResult != CUDA_SUCCESS) {
		Log::Error("cuCtxSynchronize failed.");
		return cuResult;
	}

	// Initialize if necessary
	if (!initialized)
	{
		// Complete initialization here
		// ...
		initialized = true;
	}

	// Process the frame
	if (eglFrame1.frameType == CU_EGL_FRAME_TYPE_PITCH &&
		eglFrame2.frameType == CU_EGL_FRAME_TYPE_PITCH &&
		eglFrameOut.frameType == CU_EGL_FRAME_TYPE_PITCH)
	{
		if (eglFrame1.eglColorFormat == CU_EGL_COLOR_FORMAT_ABGR &&
			eglFrame2.eglColorFormat == CU_EGL_COLOR_FORMAT_ABGR &&
			eglFrameOut.eglColorFormat == CU_EGL_COLOR_FORMAT_ABGR)
		{
			/* Apply gpu processing here */
			// Even when the processing code is commented out, I still have a total of 6 to 8 ms spent in the ProcessFrame function 
			// ...
		}
		else
			Log::Error("Invalid eglcolorformat");
	}
	else
		Log::Error("Invalid frame type");


	if (cuCtxSynchronize() != CUDA_SUCCESS)
		Log::Error("cuCtxSynchronize failed");

	if (cuGraphicsUnregisterResource(pResource1) != CUDA_SUCCESS)
		Log::Error("cuGraphicsUnregisterResource for image 1 failed");

	if (cuGraphicsUnregisterResource(pResource2) != CUDA_SUCCESS)
		Log::Error("cuGraphicsUnregisterResource for image 2 failed");

	if (cuGraphicsUnregisterResource(pResourceOut) != CUDA_SUCCESS)
		Log::Error("cuGraphicsUnregisterResource for output image failed");

	auto end = std::chrono::high_resolution_clock::now();
	Log::Debug(STR("ProcessFrame done in " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms"));

	return CUDA_SUCCESS;
}

Here is my configuration:

  • Hardware: Jetson Nano Developer Kit, SoC: Tegra210
  • JetPack 4.6.1, L4T 32.7.1, Ubuntu 18.04.6 LTS
  • GStreamer version: 1.14.5

Thanks!

Hi,
The function calls are required for synchronizing the data. This is already the optimal solution and there may not be room for further improvement. Please note this.

OK, thanks for the straightforward answer.
