How to avoid stopping capture after showing "tegra194-vi5 15c10000.vi: corr_err: discarding frame 0, flags: 32, err_data 162"

We connect our cameras over FPD-Link, and we found that capture may stop when kernel messages like the following appear:

[  141.640765] tegra194-vi5 15c10000.vi: video_name: vi-output, tevi-ar0144 32-003d, corr_err: discarding frame 0, flags: 32, err_data 162
[  143.470167] tegra194-vi5 15c10000.vi: video_name: vi-output, tevi-ar0144 35-003f, corr_err: discarding frame 0, flags: 96, err_data 4194405
[  144.018509] tegra194-vi5 15c10000.vi: video_name: vi-output, tevi-ar0144 35-003f, corr_err: discarding frame 0, flags: 32, err_data 165
[  144.326463] tegra194-vi5 15c10000.vi: video_name: vi-output, tevi-ar0144 35-003f, corr_err: discarding frame 0, flags: 32, err_data 165
[  145.435507] tegra194-vi5 15c10000.vi: video_name: vi-output, tevi-ar0144 35-003d, corr_err: discarding frame 0, flags: 32, err_data 165
[  146.290995] tegra194-vi5 15c10000.vi: video_name: vi-output, tevi-ar0144 35-003d, corr_err: discarding frame 0, flags: 32, err_data 165
[  465.539995] tegra194-vi5 15c10000.vi: video_name: vi-output, tevi-ar0144 34-003f, corr_err: discarding frame 0, flags: 32, err_data 164
[  465.984803] tegra194-vi5 15c10000.vi: video_name: vi-output, tevi-ar0144 34-003f, corr_err: discarding frame 0, flags: 32, err_data 164

When capture stops, we must relaunch GStreamer to make it resume capturing images from our cameras.

We checked the error codes, and we guess that incomplete data received from our cameras makes it stop capturing:

struct capture_status {
	uint8_t src_stream;
	uint8_t virtual_channel;
	uint16_t frame_id;
	uint32_t status;
...
	uint64_t sof_timestamp;
	uint64_t eof_timestamp;
	uint32_t err_data;

/* Channel encountered uncorrectable error and must be reset */
#define CAPTURE_STATUS_FLAG_CHANNEL_IN_ERROR			U32_C(1U << 0)

/*
 * Spurious data was received before frame start.
 * Can be badly corrupted frame or some random bits.
 * This error doesn't have effect on captured frame
 */
#define CAPTURE_STATUS_FLAG_ERROR_CSIMUX_STREAM_SPURIOUS	U32_C(1U << 1)

/*
 * Illegal data packet was encountered and dropped by CSIMUX.
 * This error may have no effect on capture result or trigger other error if
 * frame got corrupted.
 */
#define CAPTURE_STATUS_FLAG_ERROR_CSIMUX_FIFO_BADPKT		U32_C(1U << 2)

#define CAPTURE_STATUS_FLAG_ERROR_CSIMUX_FRAME_FORCE_FE		U32_C(1U << 3)
#define CAPTURE_STATUS_FLAG_ERROR_CSIMUX_FRAME_ECC_SINGLE_BIT_ERR	U32_C(1U << 4)
#define CAPTURE_STATUS_FLAG_ERROR_CSIMUX_FRAME				U32_C(1U << 5)
#define CAPTURE_STATUS_FLAG_ERROR_CSIMUX_FRAME_CSI_FAULT		U32_C(1U << 6)

/*
 * One or more frames could not be matched and got lost before captured
 * frame.
 * This error doesn't have effect on captured frame
 */
#define CAPTURE_STATUS_FLAG_ERROR_CHANSEL_NO_MATCH		U32_C(1U << 7)

/* Frame not finished */
#define CAPTURE_STATUS_FLAG_ERROR_ATOMP_FRAME_TRUNCATED		U32_C(1U << 8)

/* Frame data not written */
#define CAPTURE_STATUS_FLAG_ERROR_ATOMP_FRAME_TOSSED		U32_C(1U << 9)

	uint32_t flags;
} __CAPTURE_IVC_ALIGN;

We commented out ‘goto done’ in vi5_fops.c to try to ignore the error codes, as shown below:

/*
 * Collect the capture result for @buf from every valid VI port of @chan,
 * timestamp the vb2 buffer from the capture descriptor and hand the buffer
 * to vi5_release_buffer().
 *
 * @chan: channel whose per-port capture requests are being completed
 * @buf:  buffer being completed; its vb2_state is updated here
 *
 * Outcomes visible in this function:
 *  - buf->vb2_state is not VB2_BUF_STATE_ACTIVE: the buffer is released
 *    without any status check.
 *  - vi_capture_status() returns an error, or a later port reports a
 *    frame_id different from the first port: chan->capture_state becomes
 *    CAPTURE_ERROR and the buffer is marked VB2_BUF_STATE_ERROR.
 *  - every other path reaches the end of the loop and the buffer is marked
 *    VB2_BUF_STATE_DONE.
 *
 * NOTE(review): this is a locally modified version; the "goto done" in the
 * corr_err branch and the "done:" label are commented out, so frames
 * reported with a non-success status are no longer skipped (see below).
 */
static void vi5_capture_dequeue(struct tegra_channel *chan,
	struct tegra_channel_buffer *buf)
{
	int err = 0;
	int vi_port = 0;
	int gang_prev_frame_id = 0;
	unsigned long flags;
	struct tegra_mc_vi *vi = chan->vi;
	struct vb2_v4l2_buffer *vb = &buf->buf;
	struct timespec ts;
	struct capture_descriptor *descr = NULL;

	for(vi_port = 0; vi_port < chan->valid_ports; vi_port++) {
		/* Descriptor of the request that was issued for this buffer on this port */
		descr = &chan->request[vi_port][buf->capture_descr_index[vi_port]];

		/* Only buffers still marked ACTIVE are checked; anything else is released as-is */
		if (buf->vb2_state != VB2_BUF_STATE_ACTIVE)
			goto rel_buf;

		/* Dequeue a frame and check its capture status */
		err = vi_capture_status(chan->tegra_vi_channel[vi_port], CAPTURE_TIMEOUT_MS);
		if (err) {
			/* The status call itself failed: log and take the uncorrectable-error path */
			if (err == -ETIMEDOUT) {
				dev_err(vi->dev,
					"video_name: %s, uncorr_err: request timed out after %d ms\n",
					chan->video->name, CAPTURE_TIMEOUT_MS);
			} else {
				dev_err(vi->dev, "uncorr_err: request err %d\n", err);
			}
			goto uncorr_err;
		} else if (descr->status.status != CAPTURE_STATUS_SUCCESS) {
			if ((descr->status.flags
					& CAPTURE_STATUS_FLAG_CHANNEL_IN_ERROR) != 0) {
				/*
				 * Channel-level error: flag the queue and log. There is no
				 * goto here, so execution continues with the bookkeeping
				 * below.
				 */
				chan->queue_error = true;
				dev_err(vi->dev, "uncorr_err: flags %d, err_data %d\n",
					descr->status.flags, descr->status.err_data);
			} else {
				dev_warn(vi->dev,
					"video_name: %s, corr_err: discarding frame %d, flags: %d, "
					"err_data %d\n",
					chan->video->name, descr->status.frame_id, descr->status.flags,
					descr->status.err_data);
				buf->vb2_state = VB2_BUF_STATE_REQUEUEING;
				/*
				 * NOTE(review): with the jump below disabled, execution
				 * falls through. The REQUEUEING state set above is later
				 * overwritten with VB2_BUF_STATE_DONE after the loop, so
				 * the frame is completed rather than discarded even though
				 * the warning still says "discarding".
				 */
				//goto done;
			}
		} else if (!vi_port) {
			/* First port with a successful status: remember its frame_id as the reference */
			gang_prev_frame_id = descr->status.frame_id;
		} else if (descr->status.frame_id != gang_prev_frame_id) {
			/*
			 * A later port reported a different frame_id than the reference.
			 * NOTE(review): the first value printed (labelled "ch2") is the
			 * stored reference and the second (labelled "ch1") is the current
			 * port's id - the labels look swapped; confirm.
			 */
			dev_err(vi->dev, "frame_id out of sync: ch2 %d vs ch1 %d\n",
					gang_prev_frame_id, descr->status.frame_id);
			goto uncorr_err;
		}

		/*
		 * Unless the channel is already in CAPTURE_ERROR, account for one
		 * completed request and mark the capture state as good. Done under
		 * capture_state_lock.
		 */
		spin_lock_irqsave(&chan->capture_state_lock, flags);
		if (chan->capture_state != CAPTURE_ERROR) {
			chan->capture_reqs_enqueued -= 1;
			chan->capture_state = CAPTURE_GOOD;
		}
		spin_unlock_irqrestore(&chan->capture_state_lock, flags);
	}

	wake_up_interruptible(&chan->start_wait);
	/*
	 * From here on descr is the descriptor of the last port iterated.
	 * NOTE(review): descr is still NULL if chan->valid_ports is 0 - this
	 * code assumes at least one valid port; confirm against callers.
	 */
	/* Read SOF from capture descriptor */
	ts = ns_to_timespec((s64)descr->status.sof_timestamp);
	trace_tegra_channel_capture_frame("sof", ts);
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)
	/* update time stamp of the buffer */
	vb->timestamp.tv_sec = ts.tv_sec;
	vb->timestamp.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
#else
	/* Buffer timestamp is taken directly from the SOF timestamp */
	vb->vb2_buf.timestamp = descr->status.sof_timestamp;
#endif

	/* Unconditionally mark the buffer done (this overrides any state set inside the loop) */
	buf->vb2_state = VB2_BUF_STATE_DONE;
	/* Read EOF from capture descriptor */
	ts = ns_to_timespec((s64)descr->status.eof_timestamp);
	trace_tegra_channel_capture_frame("eof", ts);

	/* Label disabled together with the "goto done" above */
//done:
	goto rel_buf;

uncorr_err:
	/* Uncorrectable error: record the error state under the lock and fail the buffer */
	spin_lock_irqsave(&chan->capture_state_lock, flags);
	chan->capture_state = CAPTURE_ERROR;
	spin_unlock_irqrestore(&chan->capture_state_lock, flags);

	buf->vb2_state = VB2_BUF_STATE_ERROR;

rel_buf:
	/* Common exit: release the buffer with whatever vb2_state was set above */
	vi5_release_buffer(chan, buf);
}

We found that this actually reduced how often the problem appears, but capture may still stop, and we cannot keep relaunching GStreamer!

We would like to resolve the problem of capture stopping and make it keep working even if it captures errors. How can we avoid capture stopping?

Did you try boosting the clocks?

sudo nvpmodel -m 2
sudo jetson_clocks
sudo su
echo 1 > /sys/kernel/debug/bpmp/debug/clk/vi/mrq_rate_locked
echo 1 > /sys/kernel/debug/bpmp/debug/clk/isp/mrq_rate_locked
echo 1 > /sys/kernel/debug/bpmp/debug/clk/nvcsi/mrq_rate_locked
echo 1 > /sys/kernel/debug/bpmp/debug/clk/emc/mrq_rate_locked
cat /sys/kernel/debug/bpmp/debug/clk/vi/max_rate |tee /sys/kernel/debug/bpmp/debug/clk/vi/rate
cat /sys/kernel/debug/bpmp/debug/clk/isp/max_rate | tee  /sys/kernel/debug/bpmp/debug/clk/isp/rate
cat /sys/kernel/debug/bpmp/debug/clk/nvcsi/max_rate | tee /sys/kernel/debug/bpmp/debug/clk/nvcsi/rate
cat /sys/kernel/debug/bpmp/debug/clk/emc/max_rate | tee /sys/kernel/debug/bpmp/debug/clk/emc/rate

Hi ShaneCCC,

Yes, I have already tried them, but capture still stops.

root@ubuntu-desktop:/home/ubuntu# nvpmodel -q
NV Fan Mode:quiet
NV Power Mode: MODE_15W_6CORE
2

root@ubuntu-desktop:/home/ubuntu# jetson_clocks
root@ubuntu-desktop:/home/ubuntu# jetson_clocks --show
SOC family:tegra194  Machine:NVIDIA Jetson Xavier NX Developer Kit
Online CPUs: 0-5
cpu0: Online=1 Governor=schedutil MinFreq=1420800 MaxFreq=1420800 CurrentFreq=1420800 IdleStates: C1=0 c6=0
cpu1: Online=1 Governor=schedutil MinFreq=1420800 MaxFreq=1420800 CurrentFreq=1420800 IdleStates: C1=0 c6=0
cpu2: Online=1 Governor=schedutil MinFreq=1420800 MaxFreq=1420800 CurrentFreq=1420800 IdleStates: C1=0 c6=0
cpu3: Online=1 Governor=schedutil MinFreq=1420800 MaxFreq=1420800 CurrentFreq=1420800 IdleStates: C1=0 c6=0
cpu4: Online=1 Governor=schedutil MinFreq=1420800 MaxFreq=1420800 CurrentFreq=1420800 IdleStates: C1=0 c6=0
cpu5: Online=1 Governor=schedutil MinFreq=1420800 MaxFreq=1420800 CurrentFreq=1420800 IdleStates: C1=0 c6=0
GPU MinFreq=1109250000 MaxFreq=1109250000 CurrentFreq=1109250000
EMC MinFreq=204000000 MaxFreq=1600000000 CurrentFreq=1866000000 FreqOverride=1
Fan: PWM=0
NV Power Mode: MODE_15W_6CORE

root@ubuntu-desktop:/home/ubuntu# cat /sys/kernel/debug/bpmp/debug/clk/vi/max_rate |tee /sys/kernel/debug/bpmp/debug/clk/vi/rate
460800000
root@ubuntu-desktop:/home/ubuntu# cat /sys/kernel/debug/bpmp/debug/clk/isp/max_rate | tee  /sys/kernel/debug/bpmp/debug/clk/isp/rate
576000000
root@ubuntu-desktop:/home/ubuntu# cat /sys/kernel/debug/bpmp/debug/clk/nvcsi/max_rate | tee /sys/kernel/debug/bpmp/debug/clk/nvcsi/rate
314000000
root@ubuntu-desktop:/home/ubuntu# cat /sys/kernel/debug/bpmp/debug/clk/emc/max_rate | tee /sys/kernel/debug/bpmp/debug/clk/emc/rate
1866000000

Maybe check the trace log for any clues.

https://elinux.org/Jetson/l4t/Camera_BringUp

Hi ShaneCCC,

I’m sorry, maybe my question was confusing. Our cameras are unstable, so they may lose data while transferring. We have used the unstable cameras on other platforms, and they keep working and do not stop capturing even if a camera loses data.

However, when we use the unstable cameras on Xavier NX, it stops capturing if a camera loses data.

We want to modify the capture driver so that it keeps working even if it receives incomplete MIPI data.

Doesn’t the default NVCSI/VI driver work? The NVCSI/VI driver is supposed to recover when a capture fails.

I think NVCSI/VI did keep working, because I can relaunch GStreamer and it works again after I find that it has stopped capturing.

The driver actually recovers, but it may still stop capturing some time after recovering.

I would suggest fine-tuning the settle time to make the link more stable and avoid repeated recovery.

Thanks