We have a fairly complex application that uses CUDA, the video decoder, and the JPEG encoder, and runs them concurrently on multiple threads. This has worked well for many years. However, in one variation of our application we now see a deadlock that can occur when one thread is creating an Nvidia decoder at the same time that another thread is calling cudaDeviceSynchronize() or cudaSetDevice(). I dug a bit into the decoder creation function and concluded that the issue is within v4l2_open("/dev/nvhost-nvdec"). Inspecting the deadlock with a debugger, we see that both calls hang inside cuInit().
We observe this problem on both a TX2 dev kit using a vanilla L4T 32.7.6 image, and on an embedded TX2i system running our own Yocto system, also based on 32.7.6. We have also tried it on another TX2 devboard running an older L4T, 32.6.1, but there the issue does not seem to ever happen.
We have made the following minimal reproducing example which demonstrates the problem. It does not happen 100% of the time, but about 50% of 30 runs were affected.
#include <cuda_runtime.h>
#include <fcntl.h>
#include <iostream>
#include <libv4l2.h>
#include <thread>
#include <condition_variable>
#include <mutex>
#include <unistd.h>
int main()
{
std::mutex mut;
std::condition_variable cv;
bool start{false};
std::thread openDecoderThread(
[&]
{
{
std::unique_lock lk(mut);
cv.wait(lk, [&] { return start; });
}
int fd = v4l2_open("/dev/nvhost-nvdec", O_NONBLOCK | O_RDWR);
std::cout << "decoder initialized" << std::endl;
(void)fd;
});
std::thread cudaSetDeviceThread(
[&]
{
{
std::unique_lock lk(mut);
cv.wait(lk, [&] { return start; });
}
cudaSetDevice(0);
std::cout << "after cudaSetDevice" << std::endl;
});
std::cout << "setting start to true" << std::endl;
{
std::unique_lock lk(mut);
start = true;
}
std::cout << "notifying" << std::endl;
cv.notify_all();
std::cout << "joining" << std::endl;
openDecoderThread.join();
cudaSetDeviceThread.join();
std::cout << "end of main" << std::endl;
return 0;
}
Here is the callstack of the two threads:
- calling cudaSetDevice():
#0 0x0000007f8a1a9690 in __lll_lock_wait (futex=0x7f8a2429d0 <_rtld_global+2440>, private=0) at lowlevellock.c:46
#1 0x0000007f8a1a27d8 in __GI___pthread_mutex_lock (mutex=0x7f8a2429d0 <_rtld_global+2440>) at pthread_mutex_lock.c:115
#2 0x0000007f8a2253d0 in _dl_open (file=0x7f8a1dec40 "", mode=-2147483647, caller_dlopen=0x7f832385dc, nsid=0, argc=1, argv=0x7ff26b7808, env=0x7f89e7e794 <__GI__dl_catch_exception+116>) at dl-open.c:553
#3 0x0000007f8a1dd014 in dlopen_doit (a=0x7f89432368) at dlopen.c:66
#4 0x0000007f89e7e794 in __GI__dl_catch_exception (exception=0x7f8a2417a8 <__stack_chk_guard>, exception@entry=0x7f89432300, operate=0x7f8943215c,
operate@entry=0x7f8a1dcfb0 <dlopen_doit>, args=0x7f894322e0, args@entry=0x7f89432368) at dl-error-skeleton.c:196
#5 0x0000007f89e7e838 in __GI__dl_catch_error (objname=objname@entry=0x7f84000b30, errstring=errstring@entry=0x7f84000b38, mallocedp=mallocedp@entry=0x7f84000b28, operate=operate@entry=0x7f8a1dcfb0 <dlopen_doit>, args=args@entry=0x7f89432368) at dl-error-skeleton.c:215
#6 0x0000007f8a1de780 in _dlerror_run (operate=operate@entry=0x7f8a1dcfb0 <dlopen_doit>, args=0x7f89432368, args@entry=0x7f89432378) at dlerror.c:162
#7 0x0000007f8a1dd0e8 in __dlopen (file=<optimized out>, mode=<optimized out>) at dlopen.c:87
#8 0x0000007f832385dc in () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#9 0x0000007f832044d4 in () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#10 0x0000007f83173714 in () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#11 0x0000007f832758e4 in cuInit () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#12 0x00000055678202bc in cudart::__loadDriverInternalUtil() ()
#13 0x0000007f8a1a7c78 in __pthread_once_slow (once_control=0x556788a0c8 <cudart::globalState::loadDriver()::loadDriverControl>, init_routine=0x5567820240 <cudart::__loadDriverInternalUtil()>) at pthread_once.c:116
#14 0x0000005567857144 in cudart::cuosOnce(int*, void (*)()) ()
#15 0x000000556781d6e4 in cudart::globalState::initializeDriver() ()
#16 0x0000005567839f18 in cudaSetDevice ()
#17 0x000000556780d528 in <lambda()>::operator() (__closure=<optimized out>) at /home/user/evs/apps/misc/deadlocktest.cpp:44
- calling v4l2_open("/dev/nvhost-nvdec"):
#0 0x0000007f89e2d238 in sched_yield () at ../sysdeps/unix/syscall-template.S:78
#1 0x0000007f832045b8 in () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#2 0x0000007f83173714 in () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#3 0x0000007f832758e4 in cuInit () at /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#4 0x0000007f88a55118 in () at /usr/lib/aarch64-linux-gnu/tegra/libnvcuvidv4l2.so
#5 0x0000007f8a221a34 in call_init (l=<optimized out>, argc=argc@entry=1, argv=argv@entry=0x7ff26b7808, env=env@entry=0x7ff26b7818) at dl-init.c:72
#6 0x0000007f8a221b38 in call_init (env=0x7ff26b7818, argv=0x7ff26b7808, argc=1, l=<optimized out>) at dl-init.c:118
#7 0x0000007f8a221b38 in _dl_init (main_map=main_map@entry=0x7f7c004050, argc=1, argv=0x7ff26b7808, env=0x7ff26b7818) at dl-init.c:119
#8 0x0000007f8a225cd8 in dl_open_worker (a=0x7f89c32fe8) at dl-open.c:522
#9 0x0000007f89e7e794 in __GI__dl_catch_exception (exception=0xfffffffffffffffe, exception@entry=0x7f89c32fd0, operate=0x7f89c32e0c,
operate@entry=0x7f8a2257d8 <dl_open_worker>, args=0x7f89c32fd0, args@entry=0x7f89c32fe8) at dl-error-skeleton.c:196
#10 0x0000007f8a225418 in _dl_open (file=0x7f7c000bc0 "/usr/lib/aarch64-linux-gnu/libv4l/plugins/nv/libv4l2_nvcuvidvideocodec.so", mode=-2147483647, caller_dlopen=0x7f8a08beec, nsid=-2, argc=1, argv=0x7ff26b7808, env=<optimized out>) at dl-open.c:605
#11 0x0000007f8a1dd014 in dlopen_doit (a=0x7f89c332a8) at dlopen.c:66
#12 0x0000007f89e7e794 in __GI__dl_catch_exception (exception=0x7f8a2417a8 <__stack_chk_guard>, exception@entry=0x7f89c33240, operate=0x7f89c3309c,
operate@entry=0x7f8a1dcfb0 <dlopen_doit>, args=0x7f89c33220, args@entry=0x7f89c332a8) at dl-error-skeleton.c:196
#13 0x0000007f89e7e838 in __GI__dl_catch_error (objname=objname@entry=0x7f7c000c80, errstring=errstring@entry=0x7f7c000c88, mallocedp=mallocedp@entry=0x7f7c000c78, operate=operate@entry=0x7f8a1dcfb0 <dlopen_doit>, args=args@entry=0x7f89c332a8) at dl-error-skeleton.c:215
#14 0x0000007f8a1de780 in _dlerror_run (operate=operate@entry=0x7f8a1dcfb0 <dlopen_doit>, args=0x7f89c332a8, args@entry=0x7f89c332b8) at dlerror.c:162
#15 0x0000007f8a1dd0e8 in __dlopen (file=<optimized out>, mode=<optimized out>) at dlopen.c:87
#16 0x0000007f8a08beec in () at /usr/lib/aarch64-linux-gnu/libv4l2.so.0
#17 0x0000007f8a0877d4 in v4l2_fd_open () at /usr/lib/aarch64-linux-gnu/libv4l2.so.0
#18 0x0000007f8a087f4c in v4l2_open () at /usr/lib/aarch64-linux-gnu/libv4l2.so.0
#19 0x000000556780d3b8 in <lambda()>::operator() (__closure=<optimized out>) at /home/user/evs/apps/misc/deadlocktest.cpp:31
We have never had problems with running these things on different threads before, and it is really not an option for us to do everything on one thread. Is this a bug in the runtime? Again, we have not been able to repeat it on previous L4T versions so it seems like a regression.