• Hardware Platform (Jetson / GPU) : Jetson Xavier NX
• DeepStream Version 6.3
• JetPack Version (valid for Jetson only) 5.3
• TensorRT Version 8.5.2
• CUDA Version 11.4
• My goal is to process frames entirely on the GPU (zero-copy) using CUDA.
• Pipeline nvv4l2camerasrc device=/dev/video0 name=mysource ! video/x-raw(memory:NVMM), width=1920, height=1020, framerate=30/1 ! nvvidconv ! video/x-raw(memory:NVMM), format=RGBA ! tee name=t t. ! queue leaky=2 max-size-buffers=10 ! nvvidconv name=myconv ! video/x-raw(memory:NVMM),format=RGBA ! appsink name=sink emit-signals=true max-buffers=1 drop=true
In my C++ code, I’m pulling the sample like this:
• Approach 1
// Look up the appsink ("sink") and the tee ("t") by the names used in the pipeline description.
// NOTE(review): `sink` is not NULL-checked here, unlike `tee_element` just below — confirm it cannot be NULL.
GstElement *sink = gst_bin_get_by_name(GST_BIN(gst_pipeline), "sink");
tee_element = gst_bin_get_by_name(GST_BIN(gst_pipeline), "t"); // "t" is your tee name
if (!tee_element)
{
std::cerr << "Tee element not found in pipeline!" << std::endl;
return -1;
}
// Free the GError if one was set. Where `error` gets filled in is not shown in this
// snippet (presumably the pipeline parse call — TODO confirm).
if (error)
g_error_free(error);
// Start pipeline
// NOTE(review): the return value of gst_element_set_state() is ignored; failure is only
// detected by the gst_element_get_state() call below.
gst_element_set_state((GstElement *)gst_pipeline, GST_STATE_PLAYING);
// Wait for pipeline to start
// GST_CLOCK_TIME_NONE is passed as the timeout, so this call waits without a time limit
// for the state change to finish.
GstStateChangeReturn ret = gst_element_get_state(GST_ELEMENT(gst_pipeline),
NULL, NULL, GST_CLOCK_TIME_NONE);
if (ret == GST_STATE_CHANGE_FAILURE) {
std::cerr << "Failed to start pipeline" << std::endl;
return -1;
}
while (true)
{
GstSample *sample = gst_app_sink_pull_sample(GST_APP_SINK(sink));
if (!sample) break;
GstBuffer *buffer = gst_sample_get_buffer(sample);
GstMapInfo map = {0};
gst_buffer_map(buffer, &map, GST_MAP_READ);
NvBufSurface *surface = (NvBufSurface *)map.data;
NvBufSurfaceMapEglImage(surface, 0);
CUresult status1;
CUeglFrame eglFrame;
CUgraphicsResource pResource = NULL;
cudaFree(0);
if (!surface->surfaceList[0].mappedAddr.eglImage)
{
std::cerr << "EGL Image is null. Skipping frame." << std::endl;
return -1;
}
status1 = cuGraphicsEGLRegisterImage(&pResource,
surface->surfaceList[0].mappedAddr.eglImage,
CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
if (status1 != CUDA_SUCCESS)
{
printf("cuGraphicsEGLRegisterImage failed: %d \n", status1);
}
status1 = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
status1 = cuCtxSynchronize();
if (undistort_frames)
{
inf.undistort(reinterpret_cast<uchar4 *>(eglFrame.frame.pPitch[0]), rectifiedimg, WIDTH, HEIGHT);
}
...... SOME CUDA OPERATIONS
......
inf.runInference(imgfinal, resize_mem, nchw, tmp, frame_number, mode, pitch, roll);
gst_buffer_unmap(buffer, &map);
gst_sample_unref(sample);
status1 = cuCtxSynchronize();
status1 = cuGraphicsUnregisterResource(pResource);
NvBufSurfaceUnMapEglImage(surface, 0);
}
• Approach 2
/**
 * Buffer probe that runs the CUDA post-processing on each buffer passing the probed pad.
 *
 * The buffer's NvBufSurface is mapped to an EGL image, registered with CUDA, processed,
 * and then every mapping is released in reverse order before the probe returns.
 *
 * Fixes compared to the previous version:
 *  - the "EGL Image is null" path no longer returns with the GstBuffer (and possibly the
 *    EGL image) still mapped;
 *  - processing is skipped when CUDA fails to register or resolve the EGL frame, instead
 *    of running on a NULL resource / uninitialised CUeglFrame;
 *  - gst_buffer_map() and NvBufSurfaceMapEglImage() results are checked.
 *
 * @param pad     Pad the probe is attached to (unused).
 * @param info    Probe info carrying the GstBuffer.
 * @param u_data  AppContext* supplied when the probe was installed.
 * @return GST_PAD_PROBE_REMOVE once stopSignal is set, GST_PAD_PROBE_OK when the frame
 *         was processed, GST_PAD_PROBE_DROP when the frame could not be processed.
 */
static GstPadProbeReturn
conv_src_pad_buffer_probe(GstPad *pad, GstPadProbeInfo *info,
                          gpointer u_data)
{
    // Early exit if stopping - return REMOVE to stop being called
    if (stopSignal.load()) {
        std::cout << "Probe: Stop signal received, removing probe\n";
        return GST_PAD_PROBE_REMOVE;
    }

    AppContext *ctx = (AppContext *)u_data;
    GstBuffer *buffer = GST_PAD_PROBE_INFO_BUFFER(info);
    GstMapInfo map = {0};
    if (!buffer || !gst_buffer_map(buffer, &map, GST_MAP_READ))
    {
        std::cerr << "Failed to map GstBuffer. Skipping frame." << std::endl;
        return GST_PAD_PROBE_DROP;
    }

    // For NVMM buffers the mapped data is the NvBufSurface descriptor, not pixel data.
    NvBufSurface *surface = (NvBufSurface *)map.data;

    // Stays DROP unless the frame makes it through processing.
    GstPadProbeReturn result = GST_PAD_PROBE_DROP;

    // CUDA postprocess
    {
        CUresult status;
        CUeglFrame eglFrame;
        CUgraphicsResource pResource = NULL;

        // Touch the CUDA runtime so the calling thread has a current context before the
        // driver-API calls below.
        cudaFree(0);

        // NvBufSurfaceMapEglImage() returns 0 on success.
        const bool eglMapped = (NvBufSurfaceMapEglImage(surface, 0) == 0);
        if (!eglMapped || !surface->surfaceList[0].mappedAddr.eglImage)
        {
            std::cerr << "EGL Image is null. Skipping frame." << std::endl;
        }
        else
        {
            status = cuGraphicsEGLRegisterImage(&pResource,
                                                surface->surfaceList[0].mappedAddr.eglImage,
                                                CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
            if (status != CUDA_SUCCESS)
            {
                printf("cuGraphicsEGLRegisterImage failed: %d \n", status);
            }
            else
            {
                status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
                if (status == CUDA_SUCCESS)
                {
                    status = cuCtxSynchronize();
                }

                if (status != CUDA_SUCCESS)
                {
                    // eglFrame is not valid here; only the cleanup below runs.
                    printf("cuGraphicsResourceGetMappedEglFrame failed: %d \n", status);
                }
                else
                {
                    if (ctx->undistort_frames)
                    {
                        // NOTE(review): undistort() gets width/height but no row pitch —
                        // confirm the surface pitch equals width * sizeof(uchar4).
                        ctx->inference->undistort(reinterpret_cast<uchar4 *>(eglFrame.frame.pPitch[0]), ctx->rectifiedimg, ctx->width, ctx->height);
                    }
                    ...... SOME CUDA OPERATIONS
                    ......
                    ctx->inference->runInference(ctx->imgfinal, ctx->resize_mem, ctx->nchw, ctx->tmp, ctx->frame_number, ctx->mode, ctx->pitch, ctx->roll);
                    result = GST_PAD_PROBE_OK;
                }

                // Wait for all CUDA work that reads the frame before releasing it.
                status = cuCtxSynchronize();
                if (status != CUDA_SUCCESS)
                {
                    printf("cuCtxSynchronize failed: %d \n", status);
                }
                status = cuGraphicsUnregisterResource(pResource);
                if (status != CUDA_SUCCESS)
                {
                    printf("cuGraphicsUnregisterResource failed: %d \n", status);
                }
            }
        }

        if (eglMapped)
        {
            NvBufSurfaceUnMapEglImage(surface, 0);
        }
    }

    // `surface` lives inside the mapping, so this must come after the EGL unmap.
    gst_buffer_unmap(buffer, &map);
    return result;
}
int main()
{
...
tee_element = gst_bin_get_by_name(GST_BIN(gst_pipeline), "t"); // "t" is your tee name
if (!tee_element)
{
std::cerr << "Tee element not found in pipeline!" << std::endl;
return -1;
}
if (error)
g_error_free(error);
GstElement *conv = gst_bin_get_by_name(GST_BIN(gst_pipeline), "myconv");
GstPad *src_pad = gst_element_get_static_pad(conv, "src");
pad_probe_id = gst_pad_add_probe(src_pad, GST_PAD_PROBE_TYPE_BUFFER, conv_src_pad_buffer_probe, ctx, NULL);
gst_object_unref(src_pad);
// Setup main loop and bus watch
main_loop = g_main_loop_new(NULL, FALSE);
ctx->main_loop2 = main_loop;
// Start pipeline
gst_element_set_state((GstElement *)gst_pipeline, GST_STATE_PLAYING);
// Wait for pipeline to start
GstStateChangeReturn ret = gst_element_get_state(GST_ELEMENT(gst_pipeline),
NULL, NULL, GST_CLOCK_TIME_NONE);
if (ret == GST_STATE_CHANGE_FAILURE) {
std::cerr << "Failed to start pipeline" << std::endl;
return -1;
}
g_main_loop_run(main_loop);
// Shutdown
std::cout << "Graceful shutdown started..." << std::endl;
My questions:
- Am I correctly achieving zero-copy GPU access using this method? (I think I am not.)
- Is there a better or more reliable way to ensure that the buffer never touches CPU memory?
- I need to fetch the pointer as shown below, but directly via GStreamer. (I am trying to add pipelines dynamically, which jetson-utils doesn't support, so I started with DeepStream components.)
Jetson Utils snippet: I want to do this, but directly via GStreamer / the DeepStream SDK.
#include <jetson-utils/videoSource.h>
#include <jetson-utils/cudaMappedMemory.h>
#include <jetson-utils/cudaNormalize.h>
#include <jetson-utils/cudaUtility.h>
#include <jetson-utils/cudaWarp.h>
#include <jetson-utils/cudaColorspace.h>
#include <jetson-utils/videoOptions.h>
int main(){
cudaSetDevice(0);
videoOptions opts;
std::string video_name(argv[1]);
if (video_name == "rear_cam")
{
char *fullpath = realpath("/dev/rear_cam", NULL);
std::string resolvedStr(fullpath);
free(fullpath);
opts.resource = "v4l2://" + resolvedStr;
}
opts.width = 1920;
opts.height = 1024;
// Set framerate
opts.frameRate = 30;
// Set number of ring buffers
opts.numBuffers = 8;
// Enable zero-copy (recommended for Jetson)
opts.zeroCopy = true;
videoSource *video = videoSource::Create(opts);
if (!video)
{
printf("Failed to create video source.\n");
return -1;
}
while (true)
{
if (!video->Capture(&img, 1000))
{ // 1000 = timeout in milliseconds
printf("No frame received, exiting...\n");
break;
.... SEND TO INFERENCE >>>
.....
}
}
I am a newbie and not yet comfortable with this unified-memory stuff :(
Thanks for your time if you have read this far,
Best,
Mohit