Potential deadlock when opening video decoder and calling cudaSetDevice() concurrently

We have a quite complex application that uses CUDA, the video decoder, and the JPEG encoder, with multiple threads running things concurrently. This has worked well for many years. However, we have now seen an issue in one variation of our application where a deadlock can occur when one thread is creating an Nvidia decoder at the same time that another thread is calling cudaDeviceSynchronize() or cudaSetDevice(). I dug a bit into the decoder creation function and concluded that the issue is within v4l2_open("/dev/nvhost-nvdec"). Inspecting the deadlock with a debugger, we see that both calls hang inside cuInit().

We observe this problem on both a TX2 dev kit using a vanilla L4T 32.7.6 image, and on an embedded TX2i system running our own Yocto system, also based on 32.7.6. We have also tried it on another TX2 devboard running an older L4T, 32.6.1, but there the issue does not seem to ever happen.

We have made the following minimal reproducing example which demonstrates the problem. It does not happen 100% of the time, but about 50% of 30 runs were affected.

#include <cuda_runtime.h>

#include <fcntl.h>
#include <iostream>
#include <libv4l2.h>
#include <thread>
#include <condition_variable>
#include <mutex>
#include <unistd.h>

/// Minimal reproducer: releases two worker threads together, one of which
/// opens the NVDEC V4L2 device while the other calls cudaSetDevice(0), and
/// then joins both. Progress is reported on stdout at each step.
///
/// @return 0 once both worker threads have been joined.
int main()
{
    std::mutex              startMutex;
    std::condition_variable startSignal;
    bool                    started{false};

    // Shared gate: blocks the calling thread until main() has set `started`.
    // The lock is dropped when the lambda returns, i.e. before the worker
    // goes on to make its library call.
    const auto awaitStart = [&]
    {
        std::unique_lock<std::mutex> guard(startMutex);
        startSignal.wait(guard, [&] { return started; });
    };

    // Worker 1: open the decoder device through libv4l2.
    std::thread openDecoderThread(
        [&]
        {
            awaitStart();

            const int decoderFd = v4l2_open("/dev/nvhost-nvdec", O_NONBLOCK | O_RDWR);
            std::cout << "decoder initialized" << std::endl;
            // The descriptor is deliberately unused; only the open call matters here.
            static_cast<void>(decoderFd);
        });

    // Worker 2: select the CUDA device through the runtime API.
    std::thread cudaSetDeviceThread(
        [&]
        {
            awaitStart();

            cudaSetDevice(0);
            std::cout << "after cudaSetDevice" << std::endl;
        });

    std::cout << "setting start to true" << std::endl;
    {
        // Flip the flag under the mutex so the waiting predicates observe it.
        std::lock_guard<std::mutex> guard(startMutex);
        started = true;
    }

    std::cout << "notifying" << std::endl;
    startSignal.notify_all();

    std::cout << "joining" << std::endl;
    openDecoderThread.join();
    cudaSetDeviceThread.join();

    std::cout << "end of main" << std::endl;

    return 0;
}

Here is the callstack of the two threads:

  1. calling cudaSetDevice():
#0  0x0000007f8a1a9690 in __lll_lock_wait (futex=0x7f8a2429d0 <_rtld_global+2440>, private=0) at lowlevellock.c:46
#1  0x0000007f8a1a27d8 in __GI___pthread_mutex_lock (mutex=0x7f8a2429d0 <_rtld_global+2440>) at pthread_mutex_lock.c:115
#2  0x0000007f8a2253d0 in _dl_open (file=0x7f8a1dec40 "", mode=-2147483647, caller_dlopen=0x7f832385dc, nsid=0, argc=1, argv=0x7ff26b7808, env=0x7f89e7e794 <__GI__dl_catch_exception+116>) at dl-open.c:553
#3  0x0000007f8a1dd014 in dlopen_doit (a=0x7f89432368) at dlopen.c:66
#4  0x0000007f89e7e794 in __GI__dl_catch_exception (exception=0x7f8a2417a8 <__stack_chk_guard>, exception@entry=0x7f89432300, operate=0x7f8943215c, 
    operate@entry=0x7f8a1dcfb0 <dlopen_doit>, args=0x7f894322e0, args@entry=0x7f89432368) at dl-error-skeleton.c:196
#5  0x0000007f89e7e838 in __GI__dl_catch_error (objname=objname@entry=0x7f84000b30, errstring=errstring@entry=0x7f84000b38, mallocedp=mallocedp@entry=0x7f84000b28, operate=operate@entry=0x7f8a1dcfb0 <dlopen_doit>, args=args@entry=0x7f89432368) at dl-error-skeleton.c:215
#6  0x0000007f8a1de780 in _dlerror_run (operate=operate@entry=0x7f8a1dcfb0 <dlopen_doit>, args=0x7f89432368, args@entry=0x7f89432378) at dlerror.c:162
#7  0x0000007f8a1dd0e8 in __dlopen (file=<optimized out>, mode=<optimized out>) at dlopen.c:87
#8  0x0000007f832385dc in  () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#9  0x0000007f832044d4 in  () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#10 0x0000007f83173714 in  () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#11 0x0000007f832758e4 in cuInit () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#12 0x00000055678202bc in cudart::__loadDriverInternalUtil() ()
#13 0x0000007f8a1a7c78 in __pthread_once_slow (once_control=0x556788a0c8 <cudart::globalState::loadDriver()::loadDriverControl>, init_routine=0x5567820240 <cudart::__loadDriverInternalUtil()>) at pthread_once.c:116
#14 0x0000005567857144 in cudart::cuosOnce(int*, void (*)()) ()
#15 0x000000556781d6e4 in cudart::globalState::initializeDriver() ()
#16 0x0000005567839f18 in cudaSetDevice ()
#17 0x000000556780d528 in <lambda()>::operator() (__closure=<optimized out>) at /home/user/evs/apps/misc/deadlocktest.cpp:44


  2. calling v4l2_open("/dev/nvhost-nvdec"):
#0  0x0000007f89e2d238 in sched_yield () at ../sysdeps/unix/syscall-template.S:78
#1  0x0000007f832045b8 in  () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#2  0x0000007f83173714 in  () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#3  0x0000007f832758e4 in cuInit () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#4  0x0000007f88a55118 in  () at /usr/lib/aarch64-linux-gnu/tegra/libnvcuvidv4l2.so
#5  0x0000007f8a221a34 in call_init (l=<optimized out>, argc=argc@entry=1, argv=argv@entry=0x7ff26b7808, env=env@entry=0x7ff26b7818) at dl-init.c:72
#6  0x0000007f8a221b38 in call_init (env=0x7ff26b7818, argv=0x7ff26b7808, argc=1, l=<optimized out>) at dl-init.c:118
#7  0x0000007f8a221b38 in _dl_init (main_map=main_map@entry=0x7f7c004050, argc=1, argv=0x7ff26b7808, env=0x7ff26b7818) at dl-init.c:119
#8  0x0000007f8a225cd8 in dl_open_worker (a=0x7f89c32fe8) at dl-open.c:522
#9  0x0000007f89e7e794 in __GI__dl_catch_exception (exception=0xfffffffffffffffe, exception@entry=0x7f89c32fd0, operate=0x7f89c32e0c, 
    operate@entry=0x7f8a2257d8 <dl_open_worker>, args=0x7f89c32fd0, args@entry=0x7f89c32fe8) at dl-error-skeleton.c:196
#10 0x0000007f8a225418 in _dl_open (file=0x7f7c000bc0 "/usr/lib/aarch64-linux-gnu/libv4l/plugins/nv/libv4l2_nvcuvidvideocodec.so", mode=-2147483647, caller_dlopen=0x7f8a08beec, nsid=-2, argc=1, argv=0x7ff26b7808, env=<optimized out>) at dl-open.c:605
#11 0x0000007f8a1dd014 in dlopen_doit (a=0x7f89c332a8) at dlopen.c:66
#12 0x0000007f89e7e794 in __GI__dl_catch_exception (exception=0x7f8a2417a8 <__stack_chk_guard>, exception@entry=0x7f89c33240, operate=0x7f89c3309c, 
    operate@entry=0x7f8a1dcfb0 <dlopen_doit>, args=0x7f89c33220, args@entry=0x7f89c332a8) at dl-error-skeleton.c:196
#13 0x0000007f89e7e838 in __GI__dl_catch_error (objname=objname@entry=0x7f7c000c80, errstring=errstring@entry=0x7f7c000c88, mallocedp=mallocedp@entry=0x7f7c000c78, operate=operate@entry=0x7f8a1dcfb0 <dlopen_doit>, args=args@entry=0x7f89c332a8) at dl-error-skeleton.c:215
#14 0x0000007f8a1de780 in _dlerror_run (operate=operate@entry=0x7f8a1dcfb0 <dlopen_doit>, args=0x7f89c332a8, args@entry=0x7f89c332b8) at dlerror.c:162
#15 0x0000007f8a1dd0e8 in __dlopen (file=<optimized out>, mode=<optimized out>) at dlopen.c:87
#16 0x0000007f8a08beec in  () at /usr/lib/aarch64-linux-gnu/libv4l2.so.0
#17 0x0000007f8a0877d4 in v4l2_fd_open () at /usr/lib/aarch64-linux-gnu/libv4l2.so.0
#18 0x0000007f8a087f4c in v4l2_open () at /usr/lib/aarch64-linux-gnu/libv4l2.so.0
#19 0x000000556780d3b8 in <lambda()>::operator() (__closure=<optimized out>) at /home/user/evs/apps/misc/deadlocktest.cpp:31

We have never had problems with running these things on different threads before, and it is really not an option for us to do everything on one thread. Is this a bug in the runtime? Again, we have not been able to repeat it on previous L4T versions so it seems like a regression.

Hi,
Because JetPack 4 is EoL and there is no plan to release a new version, we suggest adding a delay or a mutex lock in the application to avoid the race condition.

Hi,

We have now tried the same experiment on an AGX Orin system running SW version 36.2. There we seem to get the issue even more frequently, close to 100% of the time. The program is identical, except on Orin we instead open /dev/v4l2-nvdec.

Here are the callstacks for the two threads:

Thread 1 (open nvdec)

#0 0x0000fffff7b3eccc in sched_yield () at ../sysdeps/unix/syscall-template.S:120
#1 0x0000fffff4e2d6d0 in () at /usr/lib/aarch64-linux-gnu/nvidia/libcuda.so.1
#2 0x0000fffff4e89450 in () at /usr/lib/aarch64-linux-gnu/nvidia/libcuda.so.1
#3 0x0000fffff493b0c8 in () at /usr/lib/aarch64-linux-gnu/nvidia/libnvcuvidv4l2.so
#4 0x0000fffff7fc7624 [PAC] in call_init (env=0xfffffffff278, argv=0xfffffffff268, argc=1, l=<optimized out>) at ./elf/dl-init.c:70
#5 call_init (l=<optimized out>, argc=1, argv=0xfffffffff268, env=0xfffffffff278) at ./elf/dl-init.c:26
#6 0x0000fffff7fc772c in _dl_init (main_map=0xfffff0020870, argc=1, argv=0xfffffffff268, env=0xfffffffff278) at ./elf/dl-init.c:117
#7 0x0000fffff7b9d220 in __GI__dl_catch_exception (exception=0x0, operate=0xfffff7fcdd20 <call_dl_init>, args=0xfffff793b540) at ./elf/dl-error-skeleton.c:182
#8 0x0000fffff7fcdf5c in dl_open_worker (a=a@entry=0xfffff793b788) at ./elf/dl-open.c:808
#9 0x0000fffff7b9d1c8 in __GI__dl_catch_exception (exception=0xfffff793b770, operate=0xfffff7fcdeb4 <dl_open_worker>, args=0xfffff793b788) at ./elf/dl-error-skeleton.c:208
#10 0x0000fffff7fce2fc in _dl_open
(file=0xfffff0000bc0 "/usr/lib/aarch64-linux-gnu/libv4l/plugins/nv/libv4l2_nvcuvidvideocodec.so", mode=-2147483647, caller_dlopen=0xfffff7e88830, nsid=-2, argc=1, argv=0xfffffffff268, env=0xfffffffff278) at ./elf/dl-open.c:883
#11 0x0000fffff7ae96e4 in dlopen_doit (a=a@entry=0xfffff793ba78) at ./dlfcn/dlopen.c:56
#12 0x0000fffff7b9d1c8 in __GI__dl_catch_exception (exception=exception@entry=0xfffff793b9d0, operate=0xfffff7ae9680 <dlopen_doit>, args=0xfffff793ba78) at ./elf/dl-error-skeleton.c:208
#13 0x0000fffff7b9d290 in __GI__dl_catch_error (objname=0xfffff793ba48, errstring=0xfffff793ba50, mallocedp=0xfffff793ba47, operate=<optimized out>, args=<optimized out>)
at ./elf/dl-error-skeleton.c:227
#14 0x0000fffff7ae91c0 in _dlerror_run (operate=operate@entry=0xfffff7ae9680 <dlopen_doit>, args=args@entry=0xfffff793ba78) at ./dlfcn/dlerror.c:138
#15 0x0000fffff7ae9784 in dlopen_implementation (dl_caller=<optimized out>, mode=<optimized out>, file=<optimized out>) at ./dlfcn/dlopen.c:71
#16 ___dlopen (file=<optimized out>, mode=<optimized out>) at ./dlfcn/dlopen.c:81
#17 0x0000fffff7e88830 in () at /lib/aarch64-linux-gnu/libv4l2.so.0
#18 0x0000fffff7e83260 [PAC] in v4l2_fd_open () at /lib/aarch64-linux-gnu/libv4l2.so.0
#19 0x0000fffff7e83970 [PAC] in v4l2_open () at /lib/aarch64-linux-gnu/libv4l2.so.0
#20 0x0000aaaaaaaaa0d0 [PAC] in operator()() const (__closure=0xaaaaaab78138) at /home/user/test-samples/test_cuda_v4l2.cu:32
#21 0x0000aaaaaaaaada8 in std::__invoke_impl<void, main()::<lambda()> >(std::__invoke_other, struct {…} &&) (__f=…) at /usr/include/c++/11/bits/invoke.h:61
#22 0x0000aaaaaaaaad1c in std::__invoke<main()::<lambda()> >(struct {…} &&) (__fn=…) at /usr/include/c++/11/bits/invoke.h:96
#23 0x0000aaaaaaaaac50 in std::thread::_Invoker<std::tuple<main()::<lambda()> > >::_M_invoke<0>(std::_Index_tuple<0>) (this=0xaaaaaab78138) at /usr/include/c++/11/bits/std_thread.h:259
#24 0x0000aaaaaaaaabf8 in std::thread::_Invoker<std::tuple<main()::<lambda()> > >::operator()(void) (this=0xaaaaaab78138) at /usr/include/c++/11/bits/std_thread.h:266
#25 0x0000aaaaaaaaabb8 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<main()::<lambda()> > > >::_M_run(void) (this=0xaaaaaab78130)
at /usr/include/c++/11/bits/std_thread.h:211
#26 0x0000fffff7d231fc in () at /lib/aarch64-linux-gnu/libstdc++.so.6
#27 0x0000fffff7aed5c8 in start_thread (arg=0x0) at ./nptl/pthread_create.c:442
#28 0x0000fffff7b55d9c in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79

Thread 2 (cudaSetDevice):

#0 futex_wait (private=0, expected=2, futex_word=0xfffff7ffeac8 <_rtld_global+2696>) at ../sysdeps/nptl/futex-internal.h:146
#1 __GI___lll_lock_wait (futex=futex@entry=0xfffff7ffeac8 <_rtld_global+2696>, private=private@entry=0) at ./nptl/lowlevellock.c:49
#2 0x0000fffff7af0768 in lll_mutex_lock_optimized (mutex=0xfffff7ffeac8 <_rtld_global+2696>) at ./nptl/pthread_mutex_lock.c:48
#3 ___pthread_mutex_lock (mutex=0xfffff7ffeac8 <_rtld_global+2696>) at ./nptl/pthread_mutex_lock.c:128
#4 0x0000fffff7fce2b8 in _dl_open (file=0xfffff7bc2358 "", mode=-2147483647, caller_dlopen=0xfffff4e7c520, nsid=0, argc=1, argv=0xfffffffff268, env=0xfffffffff278)
at ./elf/dl-open.c:830
#5 0x0000fffff7ae96e4 in dlopen_doit (a=a@entry=0xfffff712b4d8) at ./dlfcn/dlopen.c:56
#6 0x0000fffff7b9d1c8 in __GI__dl_catch_exception (exception=exception@entry=0xfffff712b430, operate=0xfffff7ae9680 <dlopen_doit>, args=0xfffff712b4d8) at ./elf/dl-error-skeleton.c:208
#7 0x0000fffff7b9d290 in __GI__dl_catch_error (objname=0xfffff712b4a8, errstring=0xfffff712b4b0, mallocedp=0xfffff712b4a7, operate=<optimized out>, args=<optimized out>)
at ./elf/dl-error-skeleton.c:227
#8 0x0000fffff7ae91c0 in _dlerror_run (operate=operate@entry=0xfffff7ae9680 <dlopen_doit>, args=args@entry=0xfffff712b4d8) at ./dlfcn/dlerror.c:138
#9 0x0000fffff7ae9784 in dlopen_implementation (dl_caller=<optimized out>, mode=<optimized out>, file=<optimized out>) at ./dlfcn/dlopen.c:71
#10 ___dlopen (file=<optimized out>, mode=<optimized out>) at ./dlfcn/dlopen.c:81
#11 0x0000fffff4e7c520 in () at /usr/lib/aarch64-linux-gnu/nvidia/libcuda.so.1
#12 0x0000fffff4e2d334 in () at /usr/lib/aarch64-linux-gnu/nvidia/libcuda.so.1
#13 0x0000fffff4e89450 in () at /usr/lib/aarch64-linux-gnu/nvidia/libcuda.so.1
#14 0x0000aaaaaaad634c in libcudart_static_aa4a6bcb5fce58be20d542d9b467101e0a9360a5 ()
#15 0x0000aaaaaaad6534 [PAC] in libcudart_static_0bf7336e71b5df655f7fe4ef2dea52179e6fcf82 ()
#16 0x0000fffff7af25d4 [PAC] in __pthread_once_slow
(once_control=0xaaaaaab64140 <libcudart_static_08cd9c81021de025f90d77a527f44bb6a85f7117>, init_routine=0xaaaaaaad6498 <libcudart_static_0bf7336e71b5df655f7fe4ef2dea52179e6fcf82>)
at ./nptl/pthread_once.c:116
#17 0x0000aaaaaab25c50 in libcudart_static_5887a27cefafb4cd438bdc166b0a6f874b079d4b ()
#18 0x0000aaaaaaaca524 [PAC] in libcudart_static_418eebf4e9b7463362b8385a31d08da131d0ea88 ()
#19 0x0000aaaaaaaf27e8 [PAC] in cudaSetDevice ()
#20 0x0000aaaaaaaaa1e0 [PAC] in operator()() const (__closure=0xaaaaaab782a8) at /home/user/test-samples/test_cuda_v4l2.cu:47
#21 0x0000aaaaaaaaad64 in std::__invoke_impl<void, main()::<lambda()> >(std::__invoke_other, struct {…} &&) (__f=…) at /usr/include/c++/11/bits/invoke.h:61
#22 0x0000aaaaaaaaacb4 in std::__invoke<main()::<lambda()> >(struct {…} &&) (__fn=…) at /usr/include/c++/11/bits/invoke.h:96
#23 0x0000aaaaaaaaac24 in std::thread::_Invoker<std::tuple<main()::<lambda()> > >::_M_invoke<0>(std::_Index_tuple<0>) (this=0xaaaaaab782a8) at /usr/include/c++/11/bits/std_thread.h:259
#24 0x0000aaaaaaaaabd8 in std::thread::_Invoker<std::tuple<main()::<lambda()> > >::operator()(void) (this=0xaaaaaab782a8) at /usr/include/c++/11/bits/std_thread.h:266
#25 0x0000aaaaaaaaab94 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<main()::<lambda()> > > >::_M_run(void) (this=0xaaaaaab782a0)
at /usr/include/c++/11/bits/std_thread.h:211
#26 0x0000fffff7d231fc in () at /lib/aarch64-linux-gnu/libstdc++.so.6
#27 0x0000fffff7aed5c8 in start_thread (arg=0x0) at ./nptl/pthread_create.c:442
#28 0x0000fffff7b55d9c in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79

Hi,
R36.2 is not a stable release. Please test again on JetPack 6.2.2 (r36.5) and let us know whether the issue still occurs.