The following code spins forever. Here is a minimal example:
// Minimal reproduction: `stream` waits on a device flag (a_d), then writes a
// second device flag (b_d). A separate stream (stream2) is used to observe b_d
// and later to set a_d so that the wait can complete.

// Host staging buffers (allocated with cudaMallocHost) and device flags.
int *a_h, *b_h, *a_d, *b_d;
cudaMallocHost(&a_h, sizeof(int));
cudaMallocHost(&b_h, sizeof(int));
cudaMalloc(&a_d, sizeof(int));
cudaMalloc(&b_d, sizeof(int));

// Both flags start at 0, so the wait below is initially unsatisfied.
cudaMemset(a_d, 0, sizeof(int));
cudaMemset(b_d, 0, sizeof(int));

cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

// Hold back subsequent work on `stream` until *a_d >= 1.
cuStreamWaitValue32(stream, reinterpret_cast<CUdeviceptr>(a_d), 1,
                    CU_STREAM_WAIT_VALUE_GEQ);
// Some cuda kernels
// Enqueued after the wait on the same stream: store 2 into *b_d.
cuStreamWriteValue32(stream, reinterpret_cast<CUdeviceptr>(b_d), 2,
                     CU_STREAM_WRITE_VALUE_DEFAULT);
std::this_thread::sleep_for(std::chrono::seconds(1));

// Second stream used to read/write the flags while `stream` is still waiting.
cudaStream_t stream2;
cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);

// a_d is still 0, so the write to b_d is expected not to have happened yet.
cudaMemcpyAsync(b_h, b_d, sizeof(int), cudaMemcpyDeviceToHost, stream2);
cudaStreamSynchronize(stream2);
assert(*b_h == 0);

// Release the wait by setting *a_d = 1. The source is host memory and the
// destination is device memory, so the copy kind must be HostToDevice; a kind
// that does not match the pointers is undefined behavior per the runtime docs.
*a_h = 1;
cudaMemcpyAsync(a_d, a_h, sizeof(int), cudaMemcpyHostToDevice, stream2);
std::this_thread::sleep_for(std::chrono::seconds(1));

// With the wait released, `stream` should have stored 2 into *b_d.
cudaMemcpyAsync(b_h, b_d, sizeof(int), cudaMemcpyDeviceToHost, stream2);
cudaStreamSynchronize(stream2);
assert(*b_h == 2);
However, if I delete the kernels between cuStreamWaitValue32 and cuStreamWriteValue32, it works as expected.
Why does this happen?