I ran into this weird hang where if I do the following in a loop on the same stream, it will run into a hang:
- an empty kernel on this stream
- cudaLaunchKernel on this stream that waits for an host event to be signaled
Eventually I will signal all the host events to let all the cudaLaunchKernel to be unblocked. However the code doesn’t even get to that stage, but instead hangs at some noop kernel launch when we do a lot of iterations(gdb shows the hang is inside cudaLaunchKernel itself)
Below is a simple program to reproduce this problem. For me when num_events is 50 the program still runs fine and completes. But when I try something like 60 or more it will hang.
This is run on CUDA11.3, A100 GPU
#include <iostream>
#include <vector>
#include <mutex>
#include <condition_variable>
#include "cuda_runtime.h"
#include "errorcheck.h"
// empty kernel
__global__ void NoOpKernel() {}
// for blocking stream to wait for host signal
class Event {
private:
std::mutex mtx_condition_;
std::condition_variable condition_;
bool signalled = false;
public:
void Signal() {
{
std::lock_guard<decltype(mtx_condition_)> lock(mtx_condition_);
signalled = true;
}
condition_.notify_all();
}
void Wait() {
std::unique_lock<decltype(mtx_condition_)> lock(mtx_condition_);
while (!signalled) {
condition_.wait(lock);
}
}
};
void CUDART_CB block_op_host_fn(void* arg) {
Event* evt = (Event*)arg;
evt->Wait();
}
int main() {
cudaStream_t stream;
CUDACHECK(cudaStreamCreate(&stream));
int num_events = 60; // 50 is okay, 60 will hang
std::vector<std::shared_ptr<Event>> event_vec;
for (int i = 0; i < num_events; i++) {
std::cout << "Queuing NoOp " << i << std::endl;
NoOpKernel<<<1, 128, 0, stream>>>(); // HERE : is where it hangs
std::cout << "Queued NoOp " << i << std::endl;
event_vec.push_back(std::make_shared<Event>());
cudaLaunchHostFunc(stream, block_op_host_fn, event_vec.back().get());
std::cout << "Queued block_op " << i << std::endl;
}
for (int i = 0; i < num_events; i++) {
event_vec[i]->Signal();
}
// clean up
CUDACHECK(cudaDeviceSynchronize());
CUDACHECK(cudaStreamDestroy(stream));
return 0;
}