Deadlock in gst_element_release_request_pad with nvstreammux

Hi,
Please share the patch in a format that we can apply with ‘$ patch -p1 < test.patch’. We are not familiar with the format you posted, sorry.
Or you may simply zip deepstream_test3_app.c and attach it.

Besides, we have to launch 2+ sources to reproduce it, right?

Sorry, I forgot to add -u to my diff command to create the patch. Here’s the patch:

--- deepstream_test3_app.c	2019-09-11 09:49:43.000000000 +0000
+++ /code/deepstream_test3_app.c	2019-10-03 16:25:09.258070943 +0000
@@ -67,6 +67,8 @@
 /* tiler_sink_pad_buffer_probe  will extract metadata received on OSD sink pad
  * and update params for drawing rectangle, object information etc. */
 
+GstElement *pipeline = NULL;
+
 static GstPadProbeReturn
 tiler_src_pad_buffer_probe (GstPad * pad, GstPadProbeInfo * info,
     gpointer u_data)
@@ -134,14 +136,37 @@
     return GST_PAD_PROBE_OK;
 }
 
+GstPadProbeReturn
+pad_probe_cb (GstPad * pad, GstPadProbeInfo * info, gpointer user_data) {
+  g_print("in pad probe callback");
+  gst_pad_remove_probe (pad, GST_PAD_PROBE_INFO_ID (info));
+  
+  GstElement *uri_decode_bin = gst_bin_get_by_name(GST_BIN(pipeline), "uri-decode-bin");
+  gst_element_set_state (uri_decode_bin, GST_STATE_NULL);
+
+  GstElement *streammux = gst_bin_get_by_name(GST_BIN(pipeline), "stream-muxer");
+  gchar pad_name[16] = { };
+  g_snprintf (pad_name, 15, "sink_%u", 0);
+  GstPad *sinkpad;
+  sinkpad = gst_element_get_static_pad (streammux, pad_name);
+  if (!sinkpad) {
+    g_printerr ("Streammux request sink pad failed. Exiting.\n");
+    return -1;
+  }
+  g_print ("Before release\n");
+  gst_element_release_request_pad (streammux, sinkpad);
+  g_print ("After release\n");
+  gst_object_unref(sinkpad);
+}
+
 static gboolean
 bus_call (GstBus * bus, GstMessage * msg, gpointer data)
 {
   GMainLoop *loop = (GMainLoop *) data;
   switch (GST_MESSAGE_TYPE (msg)) {
     case GST_MESSAGE_EOS:
-      g_print ("End of stream\n");
-      g_main_loop_quit (loop);
+      //g_print ("End of stream\n");
+      //g_main_loop_quit (loop);
       break;
     case GST_MESSAGE_WARNING:
     {
@@ -176,7 +201,26 @@
         guint stream_id;
         if (gst_nvmessage_parse_stream_eos (msg, &stream_id)) {
           g_print ("Got EOS from stream %d\n", stream_id);
-        }
+          GstElement *streammux = gst_bin_get_by_name(GST_BIN(pipeline), "stream-muxer");
+          gchar pad_name[16] = { };
+          g_snprintf (pad_name, 15, "sink_%u", 0);
+          GstPad *sinkpad;
+          sinkpad = gst_element_get_static_pad (streammux, pad_name);
+          if (!sinkpad) {
+            g_printerr ("Streammux request sink pad failed. Exiting.\n");
+            return -1;
+	  }
+	  GstPad *srcpad = gst_pad_get_peer(sinkpad);
+	  gst_object_unref(sinkpad);
+	  if (!srcpad) {
+            g_printerr ("Peer request source pad failed. Exiting.\n");
+            return -1;
+	  }
+            
+	  gst_pad_add_probe (srcpad, GST_PAD_PROBE_TYPE_BLOCK_DOWNSTREAM,
+      	      pad_probe_cb, NULL, NULL);
+	  gst_object_unref(srcpad);
+	}
       }
       break;
     }
@@ -283,7 +327,7 @@
 main (int argc, char *argv[])
 {
   GMainLoop *loop = NULL;
-  GstElement *pipeline = NULL, *streammux = NULL, *sink = NULL, *pgie = NULL,
+  GstElement *streammux = NULL, *sink = NULL, *pgie = NULL,
       *nvvidconv = NULL, *nvosd = NULL, *tiler = NULL;
 #ifdef PLATFORM_TEGRA
   GstElement *transform = NULL;
@@ -370,7 +414,7 @@
 #ifdef PLATFORM_TEGRA
   transform = gst_element_factory_make ("nvegltransform", "nvegl-transform");
 #endif
-  sink = gst_element_factory_make ("nveglglessink", "nvvideo-renderer");
+  sink = gst_element_factory_make ("testsink", "nvvideo-renderer");
 
   if (!pgie || !tiler || !nvvidconv || !nvosd || !sink) {
     g_printerr ("One element could not be created. Exiting.\n");

Also, the deadlock is seen by playing a single file. Playing two files causes the longer file to stop being processed after the first one is finished and the pad is released.

Hi,
It does not seem right to skip calling g_main_loop_quit (loop); when EOS is received. Maybe that is the reason for the deadlock?

It isn’t the EOS message change in the above. Here is one with even fewer changes. That other one had my attempt at adding a blocking pad in it. This one is the least number of changes to reproduce it.

--- deepstream_sdk_v4.0.1_x86_64/sources/apps/sample_apps/deepstream-test3/deepstream_test3_app.c	2019-09-11 03:49:43.000000000 -0600
+++ deepstream_test3_app.c	2019-10-07 10:27:00.000000000 -0600
@@ -63,6 +63,7 @@
 //static struct timeval start_time = { };

 //static guint probe_counter = 0;
+GstElement *pipeline = NULL;

 /* tiler_sink_pad_buffer_probe  will extract metadata received on OSD sink pad
  * and update params for drawing rectangle, object information etc. */
@@ -176,6 +177,18 @@
         guint stream_id;
         if (gst_nvmessage_parse_stream_eos (msg, &stream_id)) {
           g_print ("Got EOS from stream %d\n", stream_id);
+          GstElement *streammux = gst_bin_get_by_name(GST_BIN(pipeline), "stream-muxer");
+          gchar pad_name[16] = { };
+          g_snprintf (pad_name, 15, "sink_%u", 0);
+          GstPad *sinkpad;
+          sinkpad = gst_element_get_static_pad (streammux, pad_name);
+          if (!sinkpad) {
+            g_printerr ("Streammux request sink pad failed. Exiting.\n");
+            return -1;
+	  }
+          g_print ("Before release\n");
+	  gst_element_release_request_pad (streammux, sinkpad);
+          g_print ("After release\n");
         }
       }
       break;
@@ -283,7 +296,7 @@
 main (int argc, char *argv[])
 {
   GMainLoop *loop = NULL;
-  GstElement *pipeline = NULL, *streammux = NULL, *sink = NULL, *pgie = NULL,
+  GstElement *streammux = NULL, *sink = NULL, *pgie = NULL,
       *nvvidconv = NULL, *nvosd = NULL, *tiler = NULL;
 #ifdef PLATFORM_TEGRA
   GstElement *transform = NULL;
@@ -370,7 +383,7 @@
 #ifdef PLATFORM_TEGRA
   transform = gst_element_factory_make ("nvegltransform", "nvegl-transform");
 #endif
-  sink = gst_element_factory_make ("nveglglessink", "nvvideo-renderer");
+  sink = gst_element_factory_make ("testsink", "nvvideo-renderer");

   if (!pgie || !tiler || !nvvidconv || !nvosd || !sink) {
     g_printerr ("One element could not be created. Exiting.\n");

My whole goal with this is to be able to keep the stream running forever and keep adding and removing sources. I’m trying to save myself from having to reload the model. I want to make a deepstream pipeline that I can pass various video clips into on-demand and out of which I can get inference results. Most of the examples in deepstream right now are more for having a fixed number of streams that are always up like for surveillance at a grocery store. However, that is not my use case. I have lots of video clips that I want to run through an inferrer. These clips are being uploaded frequently and I want to pass them through the pipeline as they are received. Deepstream looks like a great candidate for this because it has elements for demuxing, parsing, and decoding as well as inference. If this isn’t considered a valid use case for Deepstream, I urge Nvidia to add it.

Hi,
This is a similar request to
[url]https://devtalk.nvidia.com/default/topic/1064141/deepstream-sdk/adding-and-removing-streams-during-runtime/post/5390153/#5390153[/url]

We are checking and will update.

I thought that I’d post another finding. nvstreamdemux needs to be in a Null state to add sink request pads. nvstreammux doesn’t have this requirement. This is separate from the above issue, but something that is a problem if a user wants to split streams out after doing inference for instance.

Here are some more findings:

I updated the reference anomaly app to get it working with version DS 4.0.1. It also deadlocks when it tries to release the pad. I haven’t tried going back to DS 3. Does anyone know if it deadlocks during the release of the nvstream pad back in DS 3?

I tried a different tack that would work for my use case. I tried setting up N number of streams and then tearing down everything except nvinfer. I’ve tried pausing nvinfer, but it seems to go to the null state anyway. My whole goal was to keep nvinfer from loading the model again, but that didn’t work out. Also, the pipeline seems to get stuck when I try to send through the second set of streams. Then again, if nvinfer is reloading the model anyway, I may as well destroy that element and recreate it.

Has anyone found any way to work around these issues? All of my video input streams are of the same length so I’m fine with waiting until a batch of streams are done to move on to the next batch, but I really want to avoid reloading the model. I’ll keep trying different things to work around this.

I’ve made some progress with a workaround for dynamically adding and removing streams. My particular use case involves a nvstreammux → nvinfer → nvstreamdemux middle part of the pipeline. Here are some findings:

  1. All of the request pads need to be set up at the beginning.
  2. The request pads must never be released as part of adding and removing streams. Rather they are linked and unlinked.
  3. When a stream is finished as detected by using gst_nvmessage_is_stream_eos and gst_nvmessage_parse_stream_eos in the message bus when an element is found, start the tear down.
  4. Tearing down involves pausing everything that is running, setting the state of the source and sink that we want to remove to the Null state and then unlinking and removing from the pipeline.
  5. For adding streams it seems to work better to pause anything that is playing in the pipeline first.
  6. Be sure to use a lock to control the manipulation of the pipeline so that only one thread is modifying it at a time. I lock it for all of the add code, and for all of the remove code.

I’m still having some issues though. With two streams being added/removed asynchronously, it works for about 12-15 streams added/removed and then errors out with:

0:00:21.177111588  2961 0x55b338a39e30 WARN                 nvinfer gstnvinfer.cpp:1830:gst_nvinfer_output_loop:<pgie> error: Internal data stream error.
0:00:21.177621422  2961 0x55b338a39e30 WARN                 nvinfer gstnvinfer.cpp:1830:gst_nvinfer_output_loop:<pgie> error: streaming stopped, reason error (-5)

I haven’t found anything in the logs that gives more insight into this error. Can someone from Nvidia let us know what that error means? I have also seen occasionally NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY. The GPU isn’t running out of memory.

However, my code works fine if I limit it to only one stream at a time. For now that looks like what I’ll have to do. At least this way I don’t have to reload the model. I will try queuing up other sources into the playing state, but unlinked. Then I will link up the new source ASAP after the previous source is torn down. I don’t yet know how much this will slow down throughput, but it seems my best option for now.

One other thought that has occurred to me is that there might be a problem with having a request pad that isn’t linked when the middle elements are in the playing state. I’ll investigate this if I find time to do so.

2 Likes

This should be resolved:
https://devtalk.nvidia.com/default/topic/1064141/deepstream-sdk/adding-and-removing-streams-during-runtime/post/5400986/#5400986

Hi @notmuchtotell,

We have reproduced the deadlock issue in streammux release pad function and this will be resolved in a future release. Thank you for sharing your analysis on the problem.

1 Like

I look forward to the next release. I’m having other problems even after the example given in the adding-and-removing-streams-during-runtime thread. I’ll create a new topic for it.

Hello @CJR, is there any update on the next release that resolves this issue? I found that this deadlock happens when there is only one source camera; when there is more than one, the deadlock does not happen. Does the streammux somehow always need data being fed into its buffer?

Is the problem solved now?

yes it has been fixed.

1 Like

@CJR and @DaneLLL, I’m still running into this problem with the latest release…
If using a single RTSP Source connected to the nvstreammux, and if the source times out - i.e. with a message of

ERROR  default gstrtspconnection.c:1004:gst_rtsp_connection_connect_with_response: failed to connect: Could not connect to 192.168.0.254: Socket I/O timed out

the following code and call to gst_element_release_request_pad will result in deadlock

        gst_pad_send_event(requestSinkPad, gst_event_new_flush_stop(FALSE));
        if (!gst_pad_unlink(staticSrcPad, requestSinkPad))
        {
           // log error ---
            return false;
        }
        gst_element_release_request_pad(streammux, requestSinkPad]);
        gst_object_unref(requestSinkPad);

Is there something else that I need to do?

Thanks,
Robert.

Hi,
There is a sample to demonstrate it:

Please take a look.

Hi @DaneLLL yes, I’m familiar with the example… and if you look at my code snippet above it is identical.

Your example only deals with removing sources that have been successfully connected. If you are using multiple RTSP sources and the first source fails to connect, calling the above snippet of code to unlink the failed source from the streammux will result in a deadlock.

Once you have a connection the above code works correctly to unlink and remove.

Hi @prominence_ai
If you use DeepStream SDK 5.0 GA, please make a new post to share your environment (which desktop GPU or Jetson platform you use) and the steps for reproducing the issue. If the default runtime_source_add_delete sample cannot be run to reproduce the issue, please provide a patch for it.

@DaneLLL please see Releasing nvstreammux request pad results in a deadlock