CSI camera stream causes kernel panic 3 cameras IMX219

Hello,
I have a python script that uses three gstreamer pipelines in OpenCV, the script runs well, I can capture images and process them but running the script for any extended (few min to few hours) period of time results in a kernel panic. Any idea on how I can fix this?

The SOM is a jetson nano production module, on custom carrier board with three CSI cameras (IMX219)

uname -a output:
Linux probe-v3 4.9.253-tegra #1 SMP PREEMPT Tue Aug 16 12:53:21 EDT 2022 aarch64 aarch64 aarch64 GNU/Linux

debug output:

[  204.635156] Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 0
[  204.642383] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.9.253-tegra #1
[  204.648901] Hardware name: Probe V3 (DT)
[  204.653683] Call trace:
[  204.656132] [<ffffff800808ba40>] dump_backtrace+0x0/0x198
[  204.661525] [<ffffff800808c004>] show_stack+0x24/0x30
[  204.666573] [<ffffff8008f62cfc>] dump_stack+0xa0/0xc4
[  204.671618] [<ffffff8008f5fda0>] panic+0x12c/0x2a8
[  204.676405] [<ffffff80081819b4>] watchdog_check_hardlockup_other_cpu+0x11c/0x120
[  204.683791] [<ffffff8008180b28>] watchdog_timer_fn+0x98/0x2c0
[  204.689531] [<ffffff8008138f30>] __hrtimer_run_queues+0xd8/0x360
[  204.695530] [<ffffff8008139880>] hrtimer_interrupt+0xa8/0x1e0
[  204.701271] [<ffffff8008bfc100>] tegra210_timer_isr+0x38/0x48
[  204.707009] [<ffffff8008121960>] __handle_irq_event_percpu+0x68/0x288
[  204.713440] [<ffffff8008121ba8>] handle_irq_event_percpu+0x28/0x60
[  204.719612] [<ffffff8008121c30>] handle_irq_event+0x50/0x80
[  204.725178] [<ffffff8008125ac4>] handle_fasteoi_irq+0xd4/0x1c0
[  204.731003] [<ffffff8008120914>] generic_handle_irq+0x34/0x50
[  204.736741] [<ffffff8008121000>] __handle_domain_irq+0x68/0xc0
[  204.742564] [<ffffff8008080d44>] gic_handle_irq+0x5c/0xb0
[  204.747960] [<ffffff8008082c28>] el1_irq+0xe8/0x194
[  204.752838] [<ffffff8008ba2300>] cpuidle_enter_state+0xb8/0x380
[  204.758754] [<ffffff8008ba263c>] cpuidle_enter+0x34/0x48
[  204.764062] [<ffffff80081113bc>] call_cpuidle+0x44/0x70
[  204.769283] [<ffffff8008111738>] cpu_startup_entry+0x1b0/0x200
[  204.775114] [<ffffff8008091cf8>] secondary_start_kernel+0x190/0x1f8
[  204.781377] [<0000000084f701a8>] 0x84f701a8
[  204.785560] SMP: stopping secondary CPUs
[  205.913616] SMP: failed to stop secondary CPUs 0-1
[  205.918409] Kernel Offset: disabled
[  205.921897] Memory Limit: none
[  205.934821] Rebooting in 5 seconds..
[  210.938972] SMP: stopping secondary CPUs
[  212.066665] SMP: failed to stop secondary CPUs 0-1

minimum python script to reproduce the issue:

#!/usr/bin/python3

import cv2
import json
import numpy as np
import os
import signal
import sys
import threading
import time
import zmq
from zmq_functions import *

from Focuser import *

import gi
gi.require_version('Gst', '1.0')

from gi.repository import GLib, GObject, Gst, GstBase, GstVideo

Gst.init(None)

from subprocess import check_output

class camera:
    def __init__(self, camNum, config, context):
        self.config = config
        self.camNum = camNum
        self.camDev = config['sensor_id'][camNum]
        self.running = True
        self.thread = None    
        self.image = None
        self.gstbuffer = None
        self.output = None
        self.sink_size = None
        self.lens_bus = config['lens_i2c_bus'][camNum]
        self.lens = None

        self.lock = threading.RLock()

        self.ipc = context.socket(zmq.REP)
        self.ipc.bind(config['ipc_file'][camNum])
        print("ZeroMQ IPC Socket bound.")
        self.poller = zmq.Poller()
        self.poller.register(self.ipc, zmq.POLLIN)

    def open_camera(self):
        pipeline = self.define_pipeline_jetson()
        self.pipeline = Gst.parse_launch(pipeline)
        self.lens = Focuser(self.lens_bus)

        # Set up refferace to the camera so that we can change the image settings, 
        self.mipi_camera = self.pipeline.get_by_name('src_camera')
       
        # Set up appsink for image src for opencv 
        self.appsink = self.pipeline.get_by_name('appsink')
        self.appsink.connect("new-sample", self.on_new_sample)
        self.appsinkValve = self.pipeline.get_by_name('appsinkValve')

        # Set up bus to catch messages
        self.bus = self.pipeline.get_bus()
        self.bus.add_signal_watch()
        self.bus.connect('message', self.on_bus_message)

        print("%s: Camera opened." %(self.camDev))
    
    def define_pipeline_jetson(self):
        pipeline = ('nvarguscamerasrc name=src_camera sensor_id=%d tnr-strength=1 tnr-mode=2 ! ' \
        'video/x-raw(memory:NVMM), width=(int)%d, height=(int)%d, framerate=(fraction)%d/1 ! ' \
        'valve name=appsinkValve drop=false ! nvvidconv ! ' \
        'video/x-raw, ' \
        'format=(string)BGRx ! videoconvert ! video/x-raw, ' \
        'format=(string)BGR ! appsink name=appsink emit-signals=True' %(self.camDev, self.config['frameSize'][0], self.config['frameSize'][1], self.config['frameRate']))

        print('Gstreamer pipeline: %s' %pipeline)
        return(pipeline)

    def setup_camera(self):
        self.open_camera()
        self.resume_pipeline()
        self.stop_app_sink()

    def start_app_sink(self):
        print("Enabling appsink stream.")
        if self.mipi_camera is not None:
            self.appsinkValve.set_property("drop", False)

    def stop_app_sink(self):
        print("Disabling appsink stream.")
        if self.mipi_camera is not None:
            self.appsinkValve.set_property("drop", True)

    def shutdown_pipeline(self):
        self.pipeline.set_state(Gst.State.NULL)

    def resume_pipeline(self):
        print("Resuming pipeline.")
        if self.pipeline is not None:
            self.pipeline.set_state(Gst.State.PLAYING)
            self.pipeline.get_state(Gst.CLOCK_TIME_NONE)

    def on_bus_message(self, bus, message):
        t = message.type
        if t == Gst.MessageType.EOS:
            Gtk.main_quit()
        elif t == Gst.MessageType.WARNING:
            err, debug = message.parse_warning()
            sys.stderr.write('Warning: %s: %s\n' % (err, debug))
        elif t == Gst.MessageType.ERROR:
            err, debug = message.parse_error()
            sys.stderr.write('GST_Error: %s: %s\n' % (err, debug))
            Gtk.main_quit()
        return True

    def on_new_sample(self, sink):
        print("GOT NEW SAMPLE")
        sample = sink.emit('pull-sample')
        caps = sample.get_caps()

        # Extract the width and height info from the sample's caps
        height = caps.get_structure(0).get_value("height")
        width = caps.get_structure(0).get_value("width")

        # Get the actual data
        buffer = sample.get_buffer()
        # Get read access to the buffer data
        success, map_info = buffer.map(Gst.MapFlags.READ)
        if not success:
            raise RuntimeError("Could not map buffer data!")

        numpy_frame = np.ndarray(
            shape=(height, width, 3),
            dtype=np.uint8,
            buffer=map_info.data)

        self.image = numpy_frame

        # Clean up the buffer mapping
        buffer.unmap(map_info)
        return Gst.FlowReturn.OK
    
    def poll_for_reqs(self):
        socks = dict(self.poller.poll(0))
        if self.ipc in socks and socks[self.ipc] == zmq.POLLIN:
            message = self.ipc.recv_json(zmq.NOBLOCK)
            print("%s: GOT MESSAGE %s" %(self.camDev, message['request']))
            if message['request'] == 'take_sample':
                result = self.take_sample(message)
                self.ipc.send_json(result)
            elif message['request'] == 'shutdown':
                self.shutdown_pipeline()
                running = False
                self.ipc.send_json({"success": True, "message": "Done"})
            else:
                self.ipc.send_json({"success": False, "message": "request not valid"})

    def start_thread(self):
        while self.running:
            self.poll_for_reqs()
            time.sleep(.05)
 
    def take_sample(self, message):
        #Set up 
        success = False
        data = None
        start_time = time.time()
        frame = None

        file_name = message['name'] + str(self.camNum) + ".jpg"

      
        print("%s: Running sample image capture." % self.camDev)
        
        w, h = self.config['frameSize']
        self.start_app_sink()
        self.image = None 


        print("%s: waititng for camera to focus: center laplace method" % self.camDev)
        
        s = 150
        focus = 0
        max_focus = 0
        max_laplace = 0

        focus_counter = 0
        for focus in range(0,1000,50):
            self.lens.set(Focuser.OPT_FOCUS,focus)
            frame = self.image
            if frame is not None:
                img = frame[int(h/2-s/2):int(h/2+s/2), int(w/2-s/2):int(w/2+s/2)]
                current_laplace = cv2.Laplacian(img,cv2.CV_16U).var()
                max_laplace, max_focus, focus_counter =  (current_laplace, focus, 0) if current_laplace > max_laplace else (max_laplace, max_focus, focus_counter+1)
                print("%s: focus: %d, laplace: %d" %(self.camDev, focus, current_laplace))
                if focus_counter > 5:
                    break
            else:
                print("Fame is none")

        frame_attempt = 0
        while self.running and frame_attempt <=10:
            frame = self.image
            if frame is not None:
                self.stop_app_sink()
                print("%s: Saving image %s" %(self.camDev, file_name))
                cv2.imwrite(file_name, frame)
                success = True
                break
            else:
                print("%s: Frame is None." %(self.camDev))
                time.sleep(1)
                frame_attempt +=1
                success = False

        print("%s: Done with sample capture: %s s" %(self.camDev, time.time()-start_time))
        return {"success": success, "message": file_name}

    def stop(self):
        self.running = False
        self.shutdown_pipeline()

class fastCam:
    def setUpCams(self, camList, config):
        cameras = []
        for camNum, i in enumerate(camList):
            if i != "":
                cam = camera(camNum, config, self.context)
                is_open = cam.setup_camera()
                if True:
                    cameras.append(cam)
                    self.threads.append(threading.Thread(target=cam.start_thread))
                    self.ipc_sockets.append(zmq_req_rep_socket(self.context, config['ipc_file'][camNum]))  
        for t in self.threads:
            t.start()            
        return(cameras)

    def __init__(self, config, context):
        self.config = config
        self.threads = []
        self.ipc_sockets = []
        self.context = context
        print("Camera Server  : Setting up cameras")
        self.cams = tuple(self.setUpCams(self.config['sensor_id'], self.config))

    def __del__(self):
        try:
            print("Camera Server  : Closing cameras.")
            self.stop()
        except Exception as inst:
            app_log.error(inst)

    def stop(self):
        print("Camera Server  : Stopping all threads")
        for cam in self.cams:
            cam.stop()
        if self.threads is not None:
            for t in self.threads:
                t.join()
        print("All threads have joined")
        self.threads = []

    def take_sample(self):
        message = {
            "request" : "take_sample",
            "name" : "./sample-image-"
        }

        for socket in self.ipc_sockets:
            socket.send(message)
        
        success = []
        data = []
        for socket in self.ipc_sockets:
            _success, _data = socket.receive()
            success.append(_success)
            data.append(_data)

        print("Camera Server  : All camera threads have responded")
        print("Camera Server  : Success: %s" %success)
        print("Camera Server  : Data: %s" %data)

        return success, data

class server:
    def receive_signal(self, signum, stack):
        print("got trigger value: %s" %signum)
        if signum == signal.SIGUSR1:
            self.camObj.take_sample()
  
        elif signum == signal.SIGQUIT:
            self.stop = True
            
    def __init__(self):
        signal.signal(signal.SIGUSR1, self.receive_signal)
        signal.signal(signal.SIGQUIT, self.receive_signal)
        print("Signals registered.")
        self.context = zmq.Context()

        self.stop = False
        self.config = {
            "sensor_id": [
                0,
                1,
                2
            ],
             "lens_i2c_bus": [
                7,
                8,
                9
            ],
            "ipc_file": [
                "ipc:///tmp/camera_ipc_0",
                "ipc:///tmp/camera_ipc_1",
                "ipc:///tmp/camera_ipc_2"
            ],
            "frameSize":[
                3264,
                1848
            ],
            "frameRate": 15
        }
        self.camObj = fastCam(self.config, self.context)

    def run(self):
        while not self.stop:
            time.sleep(.25)

        if self.camObj is not None:
            self.camObj.stop()

if __name__ == "__main__":
    server = server()
    server.run()                     
    print("Camera server is done exiting")

in a separate terminal window:
while true; do sudo pkill -USR1 minimum_camera; sleep 10; done

Hi,
It takes significant CPU usage to use OpenCV, so please run CPU cores at maximum clock and try. Please execute sudo nvpmodel-m 0 and sudo jetson_clocks

Hello DaneLLL

I attempted to follow your suggestion, the problem still persists.
The system experiences a kernel panic randomly, as stated in the initial post.

Since posting last I modified by script to only use opencv for saving the image to disk, now the Laplace calculation is carried out using VPI the CUDA backend.

The attached screen shot shows CPU / MEM utilization during repeat image capture.

As you can see CPU utilization does not exceed 50 percent in worst case. These samples are taken every second so I may be missing a major spike.

After shortening / removing CSI cable extenders. I stopped experiencing the crashes.