Hello,
I am encountering a significant performance regression when migrating a GStreamer-based application from JetPack 5.1.3 to JetPack 6.2 on NVIDIA Jetson hardware, and I am seeking guidance on what has changed in JetPack 6.x or how this pipeline should be adapted.
The application:
- Ingests two 1920×1080 camera streams via `v4l2src`
- Stitches them into a 3840×1080 panoramic space
- Displays only one screen-width (1920×1080) at a time
- Allows interactive panning by dynamically updating `xpos` on compositor sink pads
- Performs reliably with low latency and modest CPU usage on JetPack 5.1.3
The panning behavior is implemented entirely by updating compositor sink pad properties (xpos) at runtime.
Observed Behavior
JetPack 5.1.3
- Very low end-to-end latency
- CPU usage remains low and stable
- Smooth, responsive panning
- Reliable long-term operation
JetPack 6.2
- Noticeable and increasing latency
- Significantly higher CPU utilization
- System appears stressed despite identical application logic
- Overall responsiveness is much worse
The application code and camera configuration are unchanged between versions. CPU usage differences were observed using standard system monitoring tools while running identical workloads.
#include <gst/gst.h>
#include <unistd.h>
#include <termios.h>
#include <fcntl.h>
#include <stdio.h>
/* Per-camera frame size in pixels; also used for the compositor pad
 * geometry and the output caps. */
static const int cam_height = 1080;
static const int cam_width = 1920;

/* Current horizontal pan position of the view; kept within 0 .. cam_width. */
static int pan_offset = 0;
/*
 * Non-blocking keyboard poll.
 *
 * Temporarily puts stdin into non-canonical, no-echo, O_NONBLOCK mode, tries
 * to read one character, then restores the previous settings. A character
 * that was read is pushed back with ungetc() so the next getchar()/getch()
 * call returns it.
 *
 * Returns 1 when a character is available on stdin, 0 otherwise (including
 * when the descriptor flags cannot be queried, because a read could then
 * block).
 */
int kbhit(void) {
    struct termios saved_tio;
    /* tcgetattr() fails when stdin is not a terminal; in that case the
     * struct is never filled in, so it must not be written back. */
    const int have_tty = (tcgetattr(STDIN_FILENO, &saved_tio) == 0);

    if (have_tty) {
        struct termios raw_tio = saved_tio;
        raw_tio.c_lflag &= ~(ICANON | ECHO);
        tcsetattr(STDIN_FILENO, TCSANOW, &raw_tio);
    }

    int ch = EOF;
    const int saved_flags = fcntl(STDIN_FILENO, F_GETFL, 0);
    if (saved_flags != -1) {
        fcntl(STDIN_FILENO, F_SETFL, saved_flags | O_NONBLOCK);
        ch = getchar();
        fcntl(STDIN_FILENO, F_SETFL, saved_flags);
    }

    if (have_tty) {
        tcsetattr(STDIN_FILENO, TCSANOW, &saved_tio);
    }

    if (ch == EOF) {
        /* "Nothing to read yet" leaves an error/EOF indicator on the stream;
         * clear it so it does not linger on stdin between polls. */
        clearerr(stdin);
        return 0;
    }

    ungetc(ch, stdin);
    return 1;
}
/*
 * Read one character from stdin without line buffering or echo.
 *
 * When stdin is a terminal, canonical mode and echo are switched off for the
 * duration of the read and the previous settings are restored afterwards.
 * When it is not a terminal the character is read as-is.
 *
 * Blocks until a character is available. Returns the character, or EOF.
 */
int getch(void) {
    struct termios saved_tio;
    /* Only restore settings that were actually retrieved; tcgetattr() fails
     * on a non-terminal stdin and leaves saved_tio unset. */
    const int have_tty = (tcgetattr(STDIN_FILENO, &saved_tio) == 0);

    if (have_tty) {
        struct termios raw_tio = saved_tio;
        raw_tio.c_lflag &= ~(ICANON | ECHO);
        tcsetattr(STDIN_FILENO, TCSANOW, &raw_tio);
    }

    const int ch = getchar();

    if (have_tty) {
        tcsetattr(STDIN_FILENO, TCSANOW, &saved_tio);
    }
    return ch;
}
/*
 * GLib timeout callback: polls the keyboard and pans the composited view.
 *
 * data is the compositor element whose "sink_0" / "sink_1" pads are moved.
 * 'a'/'A' or Left Arrow pans left, 'd'/'D' or Right Arrow pans right, in
 * steps of 40 pixels; the offset is clamped to 0 .. cam_width.
 *
 * Always returns TRUE so the timeout source keeps firing.
 */
static gboolean animate(gpointer data) {
    enum { PAN_STEP = 40, KEY_ESC = 27 };
    /* Offset last written to the pads; -1 forces a write on the first tick. */
    static int applied_offset = -1;

    GstElement *compositor = GST_ELEMENT(data);

    // keyboard input
    if (kbhit()) {
        int c = getch();
        if (c == 'a' || c == 'A') {
            pan_offset -= PAN_STEP;
        } else if (c == 'd' || c == 'D') {
            pan_offset += PAN_STEP;
        } else if (c == KEY_ESC && kbhit() && getch() == '[' && kbhit()) {
            /* Arrow keys arrive as ESC '[' <letter>. Each follow-up byte is
             * only read when kbhit() reports it is already available, so a
             * lone ESC key press cannot block the main loop in getch(). */
            switch (getch()) {
            case 'D': pan_offset -= PAN_STEP; break;
            case 'C': pan_offset += PAN_STEP; break;
            default: break;
            }
        }
    }

    if (pan_offset < 0) pan_offset = 0;
    if (pan_offset > cam_width) pan_offset = cam_width;

    /* Nothing moved since the last successful update: skip the redundant
     * pad lookups and property writes on this tick. */
    if (pan_offset == applied_offset) return TRUE;

    GstPad *pad1 = gst_element_get_static_pad(compositor, "sink_0");
    GstPad *pad2 = gst_element_get_static_pad(compositor, "sink_1");

    if (pad1 && pad2) {
        /*
          True panoramic pan:
          cam1 moves left
          cam2 follows it without overlap
        */
        g_object_set(pad1,
                     "xpos", -pan_offset,
                     "ypos", 0,
                     "width", cam_width,
                     "height", cam_height,
                     NULL);
        g_object_set(pad2,
                     "xpos", cam_width - pan_offset,
                     "ypos", 0,
                     "width", cam_width,
                     "height", cam_height,
                     NULL);
        applied_offset = pan_offset;
    }

    /* Drop whichever references were obtained, even if only one pad exists. */
    if (pad1) gst_object_unref(pad1);
    if (pad2) gst_object_unref(pad2);
    return TRUE;
}
int main(int argc, char *argv[]) {
gst_init(&argc, &argv);
GstElement *pipeline = gst_pipeline_new("pan-pipeline");
GstElement *src1 = gst_element_factory_make("v4l2src", "cam1");
GstElement *src2 = gst_element_factory_make("v4l2src", "cam2");
GstElement *conv1 = gst_element_factory_make("videoconvert", NULL);
GstElement *conv2 = gst_element_factory_make("videoconvert", NULL);
GstElement *comp = gst_element_factory_make("compositor", "comp");
GstElement *capsf = gst_element_factory_make("capsfilter", NULL);
GstElement *conv3 = gst_element_factory_make("videoconvert", NULL);
GstElement *sink = gst_element_factory_make("autovideosink", NULL);
if (!pipeline || !src1 || !src2 || !conv1 || !conv2 || !comp || !capsf || !conv3 || !sink) {
g_printerr("Failed to create elements\n");
return -1;
}
g_object_set(src1, "device", "/dev/video2", NULL);
g_object_set(src2, "device", "/dev/video0", NULL);
GstCaps *outcaps = gst_caps_new_simple("video/x-raw",
"width", G_TYPE_INT, cam_width,
"height", G_TYPE_INT, cam_height,
NULL);
g_object_set(capsf, "caps", outcaps, NULL);
gst_caps_unref(outcaps);
gst_bin_add_many(GST_BIN(pipeline),
src1, conv1,
src2, conv2,
comp, capsf, conv3, sink, NULL);
gst_element_link(src1, conv1);
gst_element_link(src2, conv2);
gst_element_link_many(comp, capsf, conv3, sink, NULL);
GstPad *sinkpad1 = gst_element_get_request_pad(comp, "sink_%u");
GstPad *sinkpad2 = gst_element_get_request_pad(comp, "sink_%u");
GstPad *srcpad1 = gst_element_get_static_pad(conv1, "src");
GstPad *srcpad2 = gst_element_get_static_pad(conv2, "src");
gst_pad_link(srcpad1, sinkpad1);
gst_pad_link(srcpad2, sinkpad2);
gst_object_unref(srcpad1);
gst_object_unref(srcpad2);
// initial state: only left cam visible
g_object_set(sinkpad1,
"xpos", 0,
"ypos", 0,
"width", cam_width,
"height", cam_height,
NULL);
g_object_set(sinkpad2,
"xpos", cam_width,
"ypos", 0,
"width", cam_width,
"height", cam_height,
NULL);
gst_element_set_state(pipeline, GST_STATE_PLAYING);
GMainLoop *loop = g_main_loop_new(NULL, FALSE);
g_timeout_add(30, animate, comp);
printf("Controls:\n");
printf(" A / Left Arrow = Pan Left\n");
printf(" D / Right Arrow = Pan Right\n");
g_main_loop_run(loop);
gst_element_set_state(pipeline, GST_STATE_NULL);
gst_object_unref(pipeline);
return 0;
}
To isolate the issue, I constructed the following gst-launch pipeline, which achieves the same functional behavior using NVIDIA-accelerated elements:
# Reference pipeline: two MJPEG cameras decoded with nvv4l2decoder, placed
# side by side by nvcompositor, with a 1920x1080 output window.
# Caps strings containing "(memory:NVMM)" are single-quoted because unquoted
# parentheses are shell syntax and would abort the command.
gst-launch-1.0 \
  v4l2src device=/dev/video2 ! \
  image/jpeg,width=1920,height=1080,framerate=30/1 ! \
  nvv4l2decoder mjpeg=1 ! \
  nvvidconv ! 'video/x-raw(memory:NVMM)' ! comp.sink_0 \
  v4l2src device=/dev/video0 ! \
  image/jpeg,width=1920,height=1080,framerate=30/1 ! \
  nvv4l2decoder mjpeg=1 ! \
  nvvidconv ! 'video/x-raw(memory:NVMM)' ! comp.sink_1 \
  nvcompositor name=comp \
  sink_0::xpos=0 sink_0::ypos=0 \
  sink_1::xpos=1920 sink_1::ypos=0 \
  ! 'video/x-raw(memory:NVMM),width=1920,height=1080' \
  ! nvvidconv ! nveglglessink sync=false
Result
- Low latency
- Minimal CPU usage
- Stable behavior on JetPack 6.2
This confirms that hardware decode, NVMM zero-copy, and GPU-based compositing are functioning correctly on JetPack 6.2 when using gst-launch.
I have attempted to convert the C++ application to use the NVIDIA-accelerated equivalents (nvv4l2decoder, nvvidconv, nvcompositor, nveglglessink) to match the working gst-launch pipeline.
However, I have encountered difficulties preserving the interactive panning behavior:
- Dynamic updates to `xpos` on `nvcompositor` sink pads do not behave as expected
- In some cases, panning stops working or triggers renegotiation
- Achieving the same low-latency, GPU-only behavior in C++ has proven difficult despite matching elements and caps
This contrasts with the gst-launch pipeline, where nvcompositor performs well and remains GPU-accelerated.
- Have there been architectural or behavioral changes to `compositor` in JetPack 6.x that could explain the latency and CPU increase?
- Is `compositor` now strictly CPU-based, and if so, is its use discouraged in JetPack 6.x?
- Is `nvcompositor` the recommended replacement for low-latency interactive panning, and are there specific constraints when dynamically updating `xpos`?
- Are there required caps, memory types (NVMM), or properties that must be explicitly enforced in C++ to avoid renegotiation or CPU fallback?
- Is dynamically changing `xpos` on compositor sink pads still considered a supported low-latency approach in JetPack 6, or is there a newer recommended method?
My goal is to restore JetPack 5.1.3–level latency and CPU efficiency on JetPack 6.2 while maintaining interactive panning behavior in a C++ GStreamer application.
Any guidance, best practices, or references to JetPack 6 migration notes would be greatly appreciated.
Thank you for your time and support.
Best regards,
Gabriel