Hello JerryChang,
Thanks for your answer.
are these two cameras have hardware synchronization pin?
it looks you’re having left/right frame combination use-case, is it out-of-sync scenario? so that it may buffer waiting for left (or right) frames has arrived then rendering output frame.
No, they don’t. However, if I replace my_plugin
with nvcompositor
(the code of which I used to create my_plugin
), I don’t experience such issue.
you may narrow down the issue by using fpsdisplaysink
to report the frame-rate, to confirm which plugin cause the delay.
Thanks for the tip! The weird thing is that, even though my FPS remains good (>26 fps, which should visually be OK), it looks like my_plugin
receives several time the same images in input, thus generating several times the same output image, even though its FPS looks correct.
I could reproduce the problem with a basic CUDA kernel:
__global__ void cuda_process_kernel(
unsigned char* pimg1Data, int32_t img1Width, int32_t img1Height, int32_t img1Pitch,
unsigned char* pimg2Data, int32_t img2Width, int32_t img2Height, int32_t img2Pitch,
unsigned char* pOutData, int32_t outWidth, int32_t outHeight, int32_t outPitch)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x >= outWidth || y >= outHeight)
return;
// copy image 1 left-hand side to output image left-hand side
//if (x > img1Width / 3 && x < 2*img1Width/3 && y > img1Height/3 && y < 2* img1Height / 3) // Runs OK when I handle just a small part of the image
if (x < outWidth / 2) // lags begin to appear when I uncomment this line and comment the previous one, although I still have 30fps.
{
int xSrc = x * img1Width / outWidth;
int ySrc = y * img1Height / outHeight;
pOutData[y * outPitch + x * 4 + 0] = pimg1Data[ySrc * img1Pitch + xSrc * 4 + 0];
pOutData[y * outPitch + x * 4 + 1] = pimg1Data[ySrc * img1Pitch + xSrc * 4 + 1];
pOutData[y * outPitch + x * 4 + 2] = pimg1Data[ySrc * img1Pitch + xSrc * 4 + 2];
pOutData[y * outPitch + x * 4 + 3] = pimg1Data[ySrc * img1Pitch + xSrc * 4 + 3];
}
// copy image 2 right-hand side to output image right-hand side --> it is really worse if I uncomment this block
/* else
{
int xSrc = x * img2Width / outWidth;
int ySrc = y * img2Height / outHeight;
pOutData[y * outPitch + x * 4 + 0] = pimg2Data[ySrc * img2Pitch + xSrc * 4 + 0];
pOutData[y * outPitch + x * 4 + 1] = pimg2Data[ySrc * img2Pitch + xSrc * 4 + 1];
pOutData[y * outPitch + x * 4 + 2] = pimg2Data[ySrc * img2Pitch + xSrc * 4 + 2];
pOutData[y * outPitch + x * 4 + 3] = pimg2Data[ySrc * img2Pitch + xSrc * 4 + 3];
}*/
}
This kernel is launched in function ProcessFrame
as follows:
CUresult EGLImageToFrame(EGLImageKHR egl_image, CUgraphicsResource* pResource, CUeglFrame* pegl_frame, CUgraphicsMapResourceFlags mapFlags)
{
CUresult cuResult;
cuResult = cuGraphicsEGLRegisterImage(pResource, egl_image, mapFlags);
if (cuResult != CUDA_SUCCESS) {
//g_printerr("cuGraphicsEGLRegisterImage failed: %d: %s\n", cuResult, cudaGetErrorString(cuResult));
HandleError(cuResult, __FILE__, __LINE__, false);
return cuResult;
}
cuResult = HANDLE_ERROR_NO_ABORT(cuGraphicsResourceGetMappedEglFrame(pegl_frame, *pResource, 0, 0));
if (cuResult != CUDA_SUCCESS) {
g_printerr("cuGraphicsResourceGetMappedEglFrame failed.\n");
return cuResult;
}
return cuResult;
}
CUresult ProcessFrame(gint dmabuf_fd1, gint dmabuf_fd2, gint out_dmabuf_fd)
{
auto start = std::chrono::high_resolution_clock::now();
CUresult res;
// Get EGL images from dmabuf fds
EGLImageKHR image1 = NvEGLImageFromFd(egl_display, dmabuf_fd1);
if (image1 == EGL_NO_IMAGE_KHR)
{
g_printerr("NvEGLImageFromFd failed for image 1\n");
return CUDA_ERROR_UNKNOWN;
}
EGLImageKHR image2 = NvEGLImageFromFd(egl_display, dmabuf_fd2);
if (image2 == EGL_NO_IMAGE_KHR)
{
g_printerr("NvEGLImageFromFd failed for image 2\n");
return CUDA_ERROR_UNKNOWN;
}
EGLImageKHR out_image = NvEGLImageFromFd(egl_display, out_dmabuf_fd);
if (out_image == EGL_NO_IMAGE_KHR)
{
g_printerr("NvEGLImageFromFd failed for output image\n");
return CUDA_ERROR_UNKNOWN;
}
// Get EGL frames from EGLImageKHR images
cudaFree(0); // Required to use CUDA context in this thread
CUeglFrame eglFrame1, eglFrame2, eglFrameOut;
CUgraphicsResource pResource1 = NULL, pResource2 = NULL, pResourceOut = NULL;
if ((res = EGLImageToFrame(image1, &pResource1, &eglFrame1, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY)) != CUDA_SUCCESS)
{
g_printerr("EGLImageToFrame failed for image 1\n");
return res;
}
if ((res = EGLImageToFrame(image2, &pResource2, &eglFrame2, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY)) != CUDA_SUCCESS)
{
g_printerr("EGLImageToFrame failed for image 2\n");
return res;
}
if ((res = EGLImageToFrame(out_image, &pResourceOut, &eglFrameOut, CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD)) != CUDA_SUCCESS)
{
g_printerr("EGLImageToFrame failed for output image\n");
return res;
}
CUresult cuResult = HANDLE_ERROR_NO_ABORT(cuCtxSynchronize());
if (cuResult != CUDA_SUCCESS) {
g_printerr("cuCtxSynchronize failed.\n");
return cuResult;
}
// Check that all input frames are of the same size
if (eglFrame1.width != eglFrame2.width || eglFrame1.height != eglFrame2.height)
{
g_printerr("Input images have different sizes: %d*%d vs %d*%d\n", eglFrame1.width, eglFrame1.height, eglFrame2.width, eglFrame2.height);
return CUDA_ERROR_INVALID_VALUE;
}
// Process the frame
if (eglFrame1.frameType == CU_EGL_FRAME_TYPE_PITCH &&
eglFrame2.frameType == CU_EGL_FRAME_TYPE_PITCH &&
eglFrameOut.frameType == CU_EGL_FRAME_TYPE_PITCH)
{
if (eglFrame1.eglColorFormat == CU_EGL_COLOR_FORMAT_ABGR &&
eglFrame2.eglColorFormat == CU_EGL_COLOR_FORMAT_ABGR &&
eglFrameOut.eglColorFormat == CU_EGL_COLOR_FORMAT_ABGR)
{
/* Launch CUDA kernel */
// Define the execution configuration
int outWidth = eglFrameOut.width;
int outHeight = eglFrameOut.height;
dim3 blockDim(16, 16);
dim3 gridDim((outWidth + blockDim.x - 1) / blockDim.x, (outHeight + blockDim.y - 1) / blockDim.y);
// Launch the kernel
cuda_process_kernel << <gridDim, blockDim >> > (
(unsigned char*)eglFrame1.frame.pPitch[0], eglFrame1.width, eglFrame1.height, eglFrame1.pitch,
(unsigned char*)eglFrame2.frame.pPitch[0], eglFrame2.width, eglFrame2.height, eglFrame2.pitch,
(unsigned char*)eglFrameOut.frame.pPitch[0], eglFrameOut.width, eglFrameOut.height, eglFrameOut.pitch);
}
else
g_printerr("Invalid eglcolorformat\n");
}
else
g_printerr("Invalid frame type\n");
if (cuCtxSynchronize() != CUDA_SUCCESS)
g_printerr("cuCtxSynchronize failed\n");
if (cuGraphicsUnregisterResource(pResource1) != CUDA_SUCCESS)
g_printerr("cuGraphicsUnregisterResource for image 1 failed\n");
if (cuGraphicsUnregisterResource(pResource2) != CUDA_SUCCESS)
g_printerr("cuGraphicsUnregisterResource for image 2 failed\n");
if (cuGraphicsUnregisterResource(pResourceOut) != CUDA_SUCCESS)
g_printerr("cuGraphicsUnregisterResource for output image failed\n");
// Release EGL images
NvDestroyEGLImage(egl_display, image1);
NvDestroyEGLImage(egl_display, image2);
NvDestroyEGLImage(egl_display, out_image);
// Measure time
// ... (reported times and fps match the ones reported by `fpsdisplaysink`)
return CUDA_SUCCESS;
}
It runs at 30 fps with the cameras, yet there are visible lags.
Can there be something wrong in my CUDA kernel, or in the way I launch it?