NVCSI - Image corruption with L4T r28.2

We are using TX2 on a custom hardware to capture images from a SDI-CSI2 bridge which can receive input from two different cameras. Camera1 uses 4 lanes whereas Camera2 requires 2 lanes.
I have created a new device tree file, added them as two modules (each module has multiple modes) which gives me two video nodes.

With this configuration everything was working as expected on R28.1.

We wanted to use UART7 which fails to load on R28.1 and it is fixed in R28.2 so I have updated to R28.2.

In this release when we capture video using v4l2-ctl, we are seeing 1 or 2 partial/corrupted frames at the beginning. They are mostly at the beginning afaict. The hardware used is same in both cases, only change is the L4T release.

There are no error messages in kernel logs and the trace looks normal as well except for rtos_queue_peek_from_isr_failed msg in between.

kworker/5:1-105   [005] ...1   235.960389: rtos_queue_peek_from_isr_failed: tstamp:7627865816 queue:0x0b4a3c58
     kworker/5:1-105   [005] ...1   235.960466: rtcpu_start: tstamp:7627866702
     kworker/5:1-105   [005] ...1   236.016114: rtcpu_vinotify_handle_msg: tstamp:7629916548 tag:CSIMUX_STREAM channel:0xff frame:0 vi_tstamp:3334948820 data:0x00000001
     kworker/5:1-105   [005] ...1   236.064117: rtcpu_vinotify_handle_msg: tstamp:7630741111 tag:CHANSEL_PXL_SOF channel:0x00 frame:0 vi_tstamp:3335773264 data:0x00000001
     kworker/5:1-105   [005] ...1   236.064118: rtcpu_vinotify_handle_msg: tstamp:7630741292 tag:ATOMP_FS channel:0x00 frame:0 vi_tstamp:3335773269 data:0x00000000
     kworker/5:1-105   [005] ...1   236.064119: rtcpu_vinotify_handle_msg: tstamp:7631082610 tag:CHANSEL_LOAD_FRAMED channel:0x01 frame:0 vi_tstamp:3336114903 data:0x08000000
     kworker/5:1-105   [005] ...1   236.064120: rtcpu_vinotify_handle_msg: tstamp:7631340824 tag:CHANSEL_PXL_EOF channel:0x00 frame:0 vi_tstamp:3336373107 data:0x04370002
     kworker/5:1-105   [005] ...1   236.064121: rtcpu_vinotify_handle_msg: tstamp:7631343176 tag:ATOMP_FE channel:0x00 frame:0 vi_tstamp:3336375421 data:0x00000000
     kworker/5:1-105   [005] ...1   236.064122: rtcpu_vinotify_handle_msg: tstamp:7631366107 tag:CHANSEL_PXL_SOF channel:0x00 frame:0 vi_tstamp:3336398262 data:0x00000001
     kworker/5:1-105   [005] ...1   236.064123: rtcpu_vinotify_handle_msg: tstamp:7631366252 tag:ATOMP_FS channel:0x00 frame:0 vi_tstamp:3336398269 data:0x00000000
     kworker/5:1-105   [005] ...1   236.116119: rtcpu_vinotify_handle_msg: tstamp:7631706554 tag:CHANSEL_LOAD_FRAMED channel:0x01 frame:0 vi_tstamp:3336738849 data:0x08000000
     kworker/5:1-105   [005] ...1   236.116122: rtcpu_vinotify_handle_msg: tstamp:7631965824 tag:CHANSEL_PXL_EOF channel:0x00 frame:0 vi_tstamp:3336998106 data:0x04370002
     kworker/5:1-105   [005] ...1   236.116123: rtcpu_vinotify_handle_msg: tstamp:7631968176 tag:ATOMP_FE channel:0x00 frame:0 vi_tstamp:3337000420 data:0x00000000
     kworker/5:1-105   [005] ...1   236.116124: rtcpu_vinotify_handle_msg: tstamp:7631991090 tag:CHANSEL_PXL_SOF channel:0x00 frame:0 vi_tstamp:3337023261 data:0x00000001
     kworker/5:1-105   [005] ...1   236.116125: rtcpu_vinotify_handle_msg: tstamp:7631991248 tag:ATOMP_FS channel:0x00 frame:0 vi_tstamp:3337023267 data:0x00000000
     kworker/5:1-105   [005] ...1   236.116126: rtcpu_vinotify_handle_msg: tstamp:7632332106 tag:CHANSEL_LOAD_FRAMED channel:0x01 frame:0 vi_tstamp:3337364398 data:0x08000000
     kworker/5:1-105   [005] ...1   236.116126: rtcpu_vinotify_handle_msg: tstamp:7632590823 tag:CHANSEL_PXL_EOF channel:0x00 frame:0 vi_tstamp:3337623105 data:0x04370002
     kworker/5:1-105   [005] ...1   236.116127: rtcpu_vinotify_handle_msg: tstamp:7632593175 tag:ATOMP_FE channel:0x00 frame:0 vi_tstamp:3337625419 data:0x00000000
     kworker/5:1-105   [005] ...1   236.116128: rtcpu_vinotify_handle_msg: tstamp:7632616100 tag:CHANSEL_PXL_SOF channel:0x00 frame:0 vi_tstamp:3337648260 data:0x00000001
     kworker/5:1-105   [005] ...1   236.116129: rtcpu_vinotify_handle_msg: tstamp:7632616247 tag:ATOMP_FS channel:0x00 frame:0 vi_tstamp:3337648266 data:0x00000000
     kworker/5:1-105   [005] ...1   236.116131: rtos_queue_peek_from_isr_failed: tstamp:7632866660 queue:0x0b4a3c58
     kworker/5:1-105   [005] ...1   236.116132: rtcpu_vinotify_handle_msg: tstamp:7632957138 tag:CHANSEL_LOAD_FRAMED channel:0x01 frame:0 vi_tstamp:3337989432 data:0x08000000
     kworker/5:1-105   [005] ...1   236.116133: rtcpu_vinotify_handle_msg: tstamp:7633215819 tag:CHANSEL_PXL_EOF channel:0x00 frame:0 vi_tstamp:3338248103 data:0x04370002
     kworker/5:1-105   [005] ...1   236.116134: rtcpu_vinotify_handle_msg: tstamp:7633218172 tag:ATOMP_FE channel:0x00 frame:0 vi_tstamp:3338250418 data:0x00000000
     kworker/5:1-105   [005] ...1   236.116135: rtcpu_vinotify_handle_msg: tstamp:7633241087 tag:CHANSEL_PXL_SOF channel:0x00 frame:0 vi_tstamp:3338273258 data:0x00000001
     kworker/5:1-105   [005] ...1   236.116136: rtcpu_vinotify_handle_msg: tstamp:7633241252 tag:ATOMP_FS channel:0x00 frame:0 vi_tstamp:3338273265 data:0x00000000

I noticed quite a few things have changed wrt vi and nvcsi files in kernel especially,

  1. attempting to calculate cil_settle time from the device tree.
  2. changes to mipi calibration sequence
  3. accessing of mode properties access (using different indices as opposed to s_data->mode)

I think these changes are not taking all use-cases into consideration (eg if you have multiple modules in dt file as in my case).

I have tried using default values for these timing parameters (which was the case in R28.1) but still seeing the same issue. My understanding is that it is to do with some timing parameters.

Any suggestions on what could be the issue and how to debug further?

@Arun_work
Did you follow the reference sensor driver to modify your sensor driver for r28.2?

@ShaneCCC,

Yes, I am able to stream video as well with R28.2 except that I see corruption in the first 1 or 2 frames which was not the case with previous release R28.1.
I have updated my driver as per the new release.

In the corrupted frames I see valid data in first few lines and rest of it are zeros.

  1. Did you try the discontinuous_clk in DT and make sure the pix_clk_hz is correct.
  2. It’s bayer sensor? Does the gstreamer/argus worked?

Yes, using discontinuous clock and setup pix_clk_hz correctly in device tree.
It is yuv format and I am using v4l2-ctl.

I am not suspecting any setup issues as I am able to capture video.
Only few frames corrupting may be suggesting sampling issue?
In the corrupted frame, width of each line is correct and after few lines whole frame contains zeros.

Does any kernel timeout message while get the corruption data?

No, there are no error/timeout messages in kernel logs.

[   79.680767] ====> tegra_mipi_wait(), CIL_MIPI_CAL_STATUS=0x00318970, lanes=0x300000
[   79.688474] ==> tegra_mipi_wait(), Calibrated CSI_A
[   79.693442] ==> tegra_mipi_wait(), Calibrated CSI_B
[   79.698676] ====> tegra_mipi_wait(), CIL_MIPI_CAL_STATUS=0x00118970, lanes=0x100000
[   79.706363] ==> tegra_mipi_wait(), Calibrated CSI_A
[   79.734237] ==> vi4_channel_start_streaming()
[   79.743960] ==> tegra_channel_capture_setup(), CSI Port=0, No. Lanes=2, w=1280, h=720,format=0xc8, description=YUV 4:2:2 YUYV
[   79.767580] ==> tegra_channel_capture_frame() from Channel=1
[   79.789418] tegra_csi_s_stream(), Turn ON Streaming
[   79.794305] nvcsi 150c0000.nvcsi: tegra_csi_start_streaming(), Channel=1, port_num=0
[   79.807997] ==> vi_notify_wait()
[   79.825897] ==> tegra_channel_capture_frame() from Channel=1
[   79.831582] ==> tegra_channel_surface_setup(), index=0
[   79.836827] ==> vi_notify_wait()
[   79.845932] ==> tegra_channel_capture_frame() from Channel=1
[   79.851616] ==> tegra_channel_surface_setup(), index=0
...
...
[   80.858959] ==> vi_notify_wait()
[   80.868185] ==> tegra_channel_surface_setup(), index=0
[   80.887725] ==> tegra_channel_set_stream(), stream=OFF
[   80.892953] tegra_csi_s_stream(), Turn OFF Streaming

In this case I captured video using 2 lanes and there are no errors in the log.

I have enabled debug in channel.c and captured frame latencies.

[   34.091231] ==> vi4_channel_start_streaming()
[   34.101046] ==> tegra_channel_capture_setup(), CSI Port=0, No. Lanes=2, w=1280, h=720,format=0xc8, description=YUV 4:2:2 YUYV,
[   34.127349] ==> tegra_channel_capture_frame() from Channel=1
[   34.149404] tegra_csi_s_stream(), Turn ON Streaming
[   34.154350] nvcsi 150c0000.nvcsi: tegra_csi_start_streaming(), Channel=1, port_num=0
[   34.196335] nvcsi 150c0000.nvcsi: csi4_phy_config(), phy_type: D-PHY, phy_num=0, csi_port=0x0, cil_a=true, csi_lanes=2
[   34.260651] ==> vi_notify_wait()
[   34.269862] ==> tegra_channel_capture_frame() from Channel=1
[   34.269870] release_buffer(), capture init latency is 178 ms [frame 1, cap_state 1, vbuf->seq 0]
[   34.284296] ==> tegra_channel_surface_setup(), index=0
[   34.289452] ==> vi_notify_wait()
[   34.292695] ==> tegra_channel_capture_frame() from Channel=1
[   34.292699] release_buffer(), capture init latency is 201 ms [frame 2, cap_state 1, vbuf->seq 1]
[   34.307127] ==> tegra_channel_surface_setup(), index=0
[   34.312278] ==> vi_notify_wait()
[   34.329995] ==> tegra_channel_capture_frame() from Channel=1
[   34.330003] release_buffer(), capture init latency is 238 ms [frame 3, cap_state 1, vbuf->seq 2]
[   34.344450] ==> tegra_channel_surface_setup(), index=0
...
...
[   35.816146] ==> vi_notify_wait()
[   35.833272] release_buffer(), capture init latency is 1742 ms [frame 53, cap_state 1, vbuf->seq 52]
[   35.842927] ==> tegra_channel_surface_setup(), index=0
[   35.852566] release_buffer(), capture init latency is 1761 ms [frame 54, cap_state 0, vbuf->seq 53]
[   35.863098] No vi channel is active
[   35.867761] ==> tegra_channel_set_stream(), stream=OFF
[   35.873295] tegra_csi_s_stream(), Turn OFF Streaming

The frame rate expected is 52 fps which matches with the latency of each frame however it appears it is taking quite a bit of time for the initial frame? I understand that in the first frame latency of 178ms there are many functions being called to configure HW before the actual capture starts but could it be that it is the reason for the corruption at the beginning?

The buffer release mechanism seems different in R28.2 as compared to R28.1, I have not looked into the code in detail but,
In R28.1 - a ringbuffer is used to manage buffers
In R28.2 - looks like buffers are managed individually which may be adding additional latency?

Another thing is in R28.1,

// from drivers/media/platform/tegra/camera/vi/channel.c in R28.1
                /*                                                                                                                                                                                                                    
                 * WAR to force buffer state if capture state is not good                                                                                                                                                             
                 * WAR - After sync point timeout or error frame capture                                                                                                                                                              
                 * the second buffer is intermittently frame of zeros                                                                                                                                                                 
                 * with no error status or padding.                                                                                                                                                                                   
                 */
#if 0
                /* This will drop the first two frames. Disable for now. */
                if (chan->capture_state != CAPTURE_GOOD ||
                        chan->released_bufs < 2)
                        chan->buffer_state[chan->free_index] =
                                                VB2_BUF_STATE_ERROR;
#endif

I see similar snippet in R28.2 and it is enabled, does this condition means is it known issue/expected to have corrupted frames at the beginning? In my case there are no timeouts though.

@Arun_work
Could you add 100ms delay in your sensor initial table.

.
.
.
{0x5e10, 0x0c}'
{OV5693_TABLE_WAIT_MS, 100},
{OV5693_TABLE_END, 0000}

Our driver doesn’t perform any register writes, we use custom hardware and a preconfigured CSI stream is connected to TX2. Register programming is performed outside of the driver through SPI.

Most of the functions in the driver are also stubbed out, does it help if I add this delay in ov5693_s_stream() or please suggest any other suitable place.

@Arun_work
That’s fine to add in the ov5693_s_stream()

It seemed to help but not all the time.

I have noticed one other issue and this is observed with and without adding the delay of 100msec.
This was not seen in R28.1.

Some of the lines at the top are not properly getting updated. I can attach a screenshot but there doesn’t seem to be any way of attaching images to this post.

First few lines (about 20 lines from 0-19) data are getting replaced from the next 20 lines (ie., 20-39). Sometimes this band height varies and it shifts from the top but the signature is same. Some of the lines data is getting replaced.

Please let me know if there is any way to attach an image, that would show the problem clearly.

Attached screenshots.

frame_0 - corruption observed in the first 2 frames.
artifact images - images showing the artifact. In the sequence, chart is moving upward. Data from the subsequent lines are getting replaced at the top (eg in artifat_img1, yellow part which is below is shown at the top. The monitor top edge is also skewed).


artifact_img1.jpg

@Arun_work
The 28.2 got a lot of improve for the capture, I think that catch the problem but 28.1 didn’t. I think you may need more delay for your sensor to ready to output normal frame.

@Arun_work
In case you haven’t solved the problem yet.
We encountered similar problems like you did but with a different camera setup. The reason for the corrupted image in our case was a bug in the VI driver in L4T-28.2, causing the video buffer to be returned to the userspace before the write to the buffer has finished. The following patch should solve the problem.

@ShaneCCC
Is there a place where we can file a bug report and attach the following patch as a possible solution?

From f9f40abc6620416ec9ae83fb0eef3710acf5f8d4 Mon Sep 17 00:00:00 2001
From: gana <gana@zhaw.ch>
Date: Fri, 25 May 2018 13:45:43 +0200
Subject: [PATCH] fix vi4, ATOMP_FE syncpt.

Signed-off-by: gana <gana@zhaw.ch>
---
 drivers/media/platform/tegra/camera/vi/vi4_fops.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/media/platform/tegra/camera/vi/vi4_fops.c b/drivers/media/platform/tegra/camera/vi/vi4_fops.c
index 67763dd..be01176 100644
--- a/drivers/media/platform/tegra/camera/vi/vi4_fops.c
+++ b/drivers/media/platform/tegra/camera/vi/vi4_fops.c
@@ -169,10 +169,10 @@ static bool vi4_check_status(struct tegra_channel *chan)
 }
 
 static bool vi_notify_wait(struct tegra_channel *chan,
-		struct timespec *ts)
+		struct timespec *ts, struct tegra_channel_buffer *buf)
 {
 	int i, err;
-	u32 thresh[TEGRA_CSI_BLOCKS], temp;
+	u32 thresh[TEGRA_CSI_BLOCKS];
 
 	/*
 	 * Increment syncpt for ATOMP_FE
@@ -181,7 +181,7 @@ static bool vi_notify_wait(struct tegra_channel *chan,
 	 * even if we are not waiting for ATOMP_FE here
 	 */
 	for (i = 0; i < chan->valid_ports; i++)
-		temp = nvhost_syncpt_incr_max_ext(chan->vi->ndev,
+		buf->thresh[i] = nvhost_syncpt_incr_max_ext(chan->vi->ndev,
 					chan->syncpt[i][FE_SYNCPT_IDX], 1);
 
 	/*
@@ -535,7 +535,7 @@ static int tegra_channel_capture_frame(struct tegra_channel *chan,
 	}
 
 	/* wait for vi notifier events */
-	vi_notify_wait(chan, &ts);
+	vi_notify_wait(chan, &ts, buf);
 	dev_dbg(&chan->video.dev,
 		"%s: vi4 got SOF syncpt buf[%p]\n", __func__, buf);
 
@@ -585,6 +585,9 @@ static void tegra_channel_release_frame(struct tegra_channel *chan,
 	}
 
 	for (index = 0; index < chan->valid_ports; index++) {
+		/* Set timeout to a reasonable value */
+		if (!chan->timeout)
+			chan->timeout = 250;
 		err = nvhost_syncpt_wait_timeout_ext(chan->vi->ndev,
 			chan->syncpt[index][FE_SYNCPT_IDX], buf->thresh[index],
 			chan->timeout, NULL, &ts);
-- 
1.9.1

@kamm
Could you verify those two paths for 28.2 for corruption image. They will deliver in next release.

https://devtalk.nvidia.com/default/topic/1032825/jetson-tx1/camera-outputs-to-1-buffer-only/post/5254771/#5254771