I’m hitting the same issue, or at least a very similar one, with CUDA 10. Like the OP, I’m making CUDA calls from multiple threads; unlike the OP, I’m using PyTorch. Here are the tops of my backtraces:
Thread #1
#0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
#1 0x00007f5d2f7eb098 in __GI___pthread_mutex_lock (mutex=0x559660afa6d0) at ../nptl/pthread_mutex_lock.c:113
#2 0x00007f5cb1d98d55 in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#3 0x00007f5cb1d98f2e in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#4 0x00007f5cb1f1b060 in cuLaunchKernel () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#5 0x00007f5d002d323d in ?? () from /opt/conda/lib/python3.7/site-packages/torch/lib/../../../../libcudart.so.10.0
#6 0x00007f5d002d32c7 in ?? () from /opt/conda/lib/python3.7/site-packages/torch/lib/../../../../libcudart.so.10.0
#7 0x00007f5d0030746b in cudaLaunchKernel () from /opt/conda/lib/python3.7/site-packages/torch/lib/../../../../libcudart.so.10.0
#8 0x00007f5cd216df6d in void at::native::gpu_kernel_impl<__nv_hdl_wrapper_t<false, false, __nv_dl_tag<void (*)(at::TensorIterator&, __nv_hdl_wrapper_t<false, false, __nv_dl_tag<void (*)(at::TensorIterator&, c10::Scalar), &at::native::add_kernel_cuda, 4u>, float (float, float), float> const&), &(void at::native::gpu_kernel_with_scalars<__nv_hdl_wrapper_t<false, false, __nv_dl_tag<void (*)(at::TensorIterator&, c10::Scalar), &at::native::add_kernel_cuda, 4u>, float (float, float), float> >(at::TensorIterator&, __nv_hdl_wrapper_t<false, false, __nv_dl_tag<void (*)(at::TensorIterator&, c10::Scalar), &at::native::add_kernel_cuda, 4u>, float (float, float), float> const&)), 2u>, float (float), __nv_hdl_wrapper_t<false, false, __nv_dl_tag<void (*)(at::TensorIterator&, c10::Scalar), &at::native::add_kernel_cuda, 4u>, float (float, float), float> const, float> >(at::TensorIterator&, __nv_hdl_wrapper_t<false, false, __nv_dl_tag<void (*)(at::TensorIterator&, __nv_hdl_wrapper_t<false, false, __nv_dl_tag<void (*)(at::TensorIterator&, c10::Scalar), &at::native::add_kernel_cuda, 4u>, float (float, float), float> const&), &(void at::native::gpu_kernel_with_scalars<__nv_hdl_wrapper_t<false, false, __nv_dl_tag<void (*)(at::TensorIterator&, c10::Scalar), &at::native::add_kernel_cuda, 4u>, float (float, float), float> >(at::TensorIterator&, __nv_hdl_wrapper_t<false, false, __nv_dl_tag<void (*)(at::TensorIterator&, c10::Scalar), &at::native::add_kernel_cuda, 4u>, float (float, float), float> const&)), 2u>, float (float), __nv_hdl_wrapper_t<false, false, __nv_dl_tag<void (*)(at::TensorIterator&, c10::Scalar), &at::native::add_kernel_cuda, 4u>, float (float, float), float> const, float> const&) () from /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch.so
Thread #2
#0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
#1 0x00007f5d2f7eb098 in __GI___pthread_mutex_lock (mutex=0x559660afa6d0) at ../nptl/pthread_mutex_lock.c:113
#2 0x00007f5cb1d98d55 in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#3 0x00007f5cb1d98f2e in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#4 0x00007f5cb1f1b060 in cuLaunchKernel () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#5 0x00007f5d002d323d in ?? () from /opt/conda/lib/python3.7/site-packages/torch/lib/../../../../libcudart.so.10.0
#6 0x00007f5d002d32c7 in ?? () from /opt/conda/lib/python3.7/site-packages/torch/lib/../../../../libcudart.so.10.0
#7 0x00007f5d0030746b in cudaLaunchKernel () from /opt/conda/lib/python3.7/site-packages/torch/lib/../../../../libcudart.so.10.0
#8 0x00007f5cd1a99ce7 in void THC_transformReduceInnermostDimIndex<float, long, c10::TensorImpl, c10::TensorImpl, MaxValuePair<float, long> >(THCState*, c10::TensorImpl*, c10::TensorImpl*, c10::TensorImpl*, thrust::pair<float, long> const&, MaxValuePair<float, long>) () from /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch.so
Thread #3
#0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
#1 0x00007f5d2f7eb098 in __GI___pthread_mutex_lock (mutex=0x559660afa6d0) at ../nptl/pthread_mutex_lock.c:113
#2 0x00007f5cb1d98d55 in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#3 0x00007f5cb1d98f2e in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#4 0x00007f5cb1f1b060 in cuLaunchKernel () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#5 0x00007f5d002d323d in ?? () from /opt/conda/lib/python3.7/site-packages/torch/lib/../../../../libcudart.so.10.0
#6 0x00007f5d002d32c7 in ?? () from /opt/conda/lib/python3.7/site-packages/torch/lib/../../../../libcudart.so.10.0
#7 0x00007f5d0030746b in cudaLaunchKernel () from /opt/conda/lib/python3.7/site-packages/torch/lib/../../../../libcudart.so.10.0
#8 0x00007f5cd2239089 in void at::native::gpu_kernel_impl<__nv_hdl_wrapper_t<false, true, __nv_dl_tag<void (*)(at::TensorIterator&), &(void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)), 1u>, float (float)> >(at::TensorIterator&, __nv_hdl_wrapper_t<false, true, __nv_dl_tag<void (*)(at::TensorIterator&), &(void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)), 1u>, float (float)> const&) () from /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch.so
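For what it’s worth, the general shape of what each thread is doing is roughly the sketch below. This is a hand-written illustration, not my actual code and not a minimal reproducer; the model, tensor sizes and iteration count are placeholders, chosen only so the kernels match the frames above (an elementwise add, a max reduction, a copy):

```python
import threading

import torch

def worker(model, n_iters):
    # Each thread launches its own kernels on the shared default device:
    # an elementwise add and a max reduction, similar to the frames above.
    for _ in range(n_iters):
        x = torch.randn(64, 512, device="cuda")
        with torch.no_grad():
            y = model(x) + 1.0
        _ = y.max(dim=1)
    torch.cuda.synchronize()

model = torch.nn.Linear(512, 512).cuda().eval()
threads = [threading.Thread(target=worker, args=(model, 1000)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```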
This is happening in fairly complex code, so I’m afraid I don’t have a minimal example. I also can’t use cuda-gdb due to a bug. I’ll try serializing the launches with a mutex and using separate handles (if PyTorch supports an equivalent), roughly as sketched below, but I suspect the answer is going to be ‘don’t multithread PyTorch CUDA’.
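The mutex workaround I plan to try is just a process-wide lock around every block of CUDA work. Again a sketch (same placeholder worker as above), not something I’ve verified against the hang yet:

```python
import threading

import torch

# One process-wide lock so only one thread is issuing CUDA work at a time.
cuda_launch_lock = threading.Lock()

def worker(model, n_iters):
    # Used exactly like the worker in the earlier sketch, just with the lock added.
    for _ in range(n_iters):
        with cuda_launch_lock:
            # Do the allocation, the launches and the sync while holding the lock,
            # so no two threads are inside the driver at once.
            x = torch.randn(64, 512, device="cuda")
            with torch.no_grad():
                y = model(x) + 1.0
            _ = y.max(dim=1)
            torch.cuda.synchronize()
```

The closest thing I can find to ‘separate handles’ on the PyTorch side is giving each thread its own torch.cuda.Stream and wrapping its work in `with torch.cuda.stream(stream):`; I’ll report back if either approach helps.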