cuStreamWaitValue32 and cuStreamWriteValue32 blocking issue

The following code spins forever. Here is a minimal example.

  int *a_h, *b_h, *a_d, *b_d;
  cudaMallocHost(&a_h, sizeof(int));
  cudaMallocHost(&b_h, sizeof(int));
  cudaMalloc(&a_d, sizeof(int));
  cudaMalloc(&b_d, sizeof(int));
  cudaMemset(a_d, 0, sizeof(int));
  cudaMemset(b_d, 0, sizeof(int));

  cudaStream_t stream;
  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

  cuStreamWaitValue32(stream, reinterpret_cast<CUdeviceptr>(a_d), 1,
                      CU_STREAM_WAIT_VALUE_GEQ);
  // Some cuda kernels
  cuStreamWriteValue32(stream, reinterpret_cast<CUdeviceptr>(b_d), 2,
                       CU_STREAM_WRITE_VALUE_DEFAULT);

  std::this_thread::sleep_for(std::chrono::seconds(1));
  cudaStream_t stream2;
  cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
  cudaMemcpyAsync(b_h, b_d, sizeof(int), cudaMemcpyDeviceToHost, stream2);
  cudaStreamSynchronize(stream2);
  assert(*b_h == 0);

  *a_h = 1;
  cudaMemcpyAsync(a_d, a_h, sizeof(int), cudaMemcpyHostToDevice, stream2);
  std::this_thread::sleep_for(std::chrono::seconds(1));

  cudaMemcpyAsync(b_h, b_d, sizeof(int), cudaMemcpyDeviceToHost, stream2);
  cudaStreamSynchronize(stream2);
  assert(*b_h == 2);

However, if I delete the kernel between cuStreamWaitValue32 and cuStreamWriteValue32, it works as expected.

Why does this happen?

Looks like a lazy loading issue.

Try repeating the test, running your previously failing app as follows:

CUDA_MODULE_LOADING=EAGER ./my_app
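
If setting the environment variable on the command line is inconvenient, here is a minimal sketch of two programmatic alternatives. my_kernel is a hypothetical stand-in for the "Some cuda kernels" in your snippet, and this is only a sketch, not a drop-in fix:

#include <cstdlib>
#include <cuda_runtime.h>

__global__ void my_kernel() {}  // placeholder for the real kernel

int main() {
  // Option 1: request eager loading from inside the process. This must run
  // before the very first CUDA call, since the variable is read when CUDA
  // initializes (POSIX setenv shown; this is a Linux-oriented sketch).
  setenv("CUDA_MODULE_LOADING", "EAGER", 1);

  // Option 2: keep lazy loading, but warm the kernel up before any
  // cuStreamWaitValue32 blocks a stream, so its module is already loaded
  // by the time the wait is issued.
  my_kernel<<<1, 1>>>();
  cudaDeviceSynchronize();

  // ... original test code ...
  return 0;
}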

Thanks, that solves it.

Could I ask a follow-up question?

When I substitute cublasCgemmStridedBatched for the in-between CUDA kernel, it still blocks, even with lazy loading disabled. Here is the code:

  int *a_h, *b_h, *a_d, *b_d;
  cudaMallocHost(&a_h, sizeof(int));
  cudaMallocHost(&b_h, sizeof(int));
  cudaMalloc(&a_d, sizeof(int));
  cudaMalloc(&b_d, sizeof(int));
  cudaMemset(a_d, 0, sizeof(int));
  cudaMemset(b_d, 0, sizeof(int));

  cublasHandle_t blas_handle;
  cublasCreate_v2(&blas_handle);
  cuComplex *in, *buffer;
  cudaMalloc(&in, 16 * 16 * 10 * sizeof(cuComplex));
  cudaMalloc(&buffer, 16 * 16 * 10 * sizeof(cuComplex));
  int stride_in = 16 * 16, stride_out = 16 * 16;
  int lda = 16, ldb = 16, ldc = 16;
  cuComplex alpha = make_cuComplex(1, 0), beta = make_cuComplex(0, 0);

  cudaStream_t stream;
  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

  cuStreamWaitValue32(stream, reinterpret_cast<CUdeviceptr>(a_d), 1,
                      CU_STREAM_WAIT_VALUE_GEQ);
  // Some cuda kernels
  cublasSetStream(blas_handle, stream);
  cublasCgemmStridedBatched(blas_handle, CUBLAS_OP_N, CUBLAS_OP_C, 16,
                            16, 16, &alpha, in, lda, stride_in,
                            in, ldb, stride_in, &beta, buffer, ldc,
                            stride_out, batch);
  cuStreamWriteValue32(stream, reinterpret_cast<CUdeviceptr>(b_d), 2,
                       CU_STREAM_WRITE_VALUE_DEFAULT);

  std::this_thread::sleep_for(std::chrono::seconds(1));
  cudaStream_t stream2;
  cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
  cudaMemcpyAsync(b_h, b_d, sizeof(int), cudaMemcpyDeviceToHost, stream2);
  cudaStreamSynchronize(stream2);
  assert(*b_h == 0);

  *a_h = 1;
  cudaMemcpyAsync(a_d, a_h, sizeof(int), cudaMemcpyHostToDevice, stream2);
  std::this_thread::sleep_for(std::chrono::seconds(1));

  cudaMemcpyAsync(b_h, b_d, sizeof(int), cudaMemcpyDeviceToHost, stream2);
  cudaStreamSynchronize(stream2);
  assert(*b_h == 2);

Kernels other than cublasCgemmStridedBatched worked fine in my testing.

When I run the code you have now shown (with batch set to 1), I observe normal completion when I run it on CUDA 12.2, linux, L4 GPU, and with CUDA_MODULE_LOADING=EAGER.

So I’m unable to reproduce the hang under those conditions. I probably won’t be able to help further unless I can witness the hang. At this point I might ask for things like: “What CUDA version? Which GPU driver version? What is your nvcc compile command line? What GPU are you running on? What OS?” Providing that data might help me, or it might not. You should also probably provide a complete test case. To be clear, that is code that I can copy, paste, compile, and run, and observe the issue, without having to add anything or change anything. The code you have provided so far is not that. By complete I do not mean your whole code; I mean a suitably reduced version, complete according to my definition, that does reproduce the issue.
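
For what it’s worth, some of that information can be gathered programmatically. A small sketch using standard runtime API queries (note that cudaDriverGetVersion reports the CUDA version supported by the driver, not the 5xx.xx driver build number):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int runtimeVer = 0, driverVer = 0;
  cudaRuntimeGetVersion(&runtimeVer);  // CUDA runtime linked into the app
  cudaDriverGetVersion(&driverVer);    // CUDA version supported by the driver
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);   // properties of device 0
  printf("runtime %d, driver %d, GPU %s (sm_%d%d)\n",
         runtimeVer, driverVer, prop.name, prop.major, prop.minor);
  return 0;
}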

As an aside, may I offer some advice? (It’s not applicable here. At the moment I’m unaware of anything wrong with the code you have now posted, and I am unable to witness the hang.)

When I am teaching CUDA concurrency with streams, after the concepts are understood and reinforced, I usually make a suggestion: “Leave the legacy stream behind. Anything you want to do can be done purely using created streams.”

Your cuBLAS call is being launched into the legacy default stream. You are creating other streams that are designed to overlap with the legacy default stream (cudaStreamNonBlocking). The legacy stream has lots of variations, behavior specifics, and exceptions that require a very holistic view to sort out (including things like compile commands, environment variables, etc.). On the other hand, created-stream behavior is generally easier to predict with a more local view of things.

If you want to avoid having to run through a complex decision tree every time you are trying to sort out a concurrency issue, my suggestion is: don’t use the default stream, whether legacy or otherwise. If you follow this, you immediately never need to specify the cudaStreamNonBlocking flag. Your code becomes simpler, IMHO, and its behavior easier to understand and predict.

Using only created streams, the concurrency rules are quite simple:

  1. Items issued into the same stream execute in issue order.
  2. Items issued into separate streams have no ordering imposed by CUDA.
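
To illustrate, here is a minimal sketch (the stream and event names are purely illustrative) of how ordering between two created streams can be imposed with an event, without ever touching the default stream:

#include <cuda_runtime.h>

int main() {
  cudaStream_t producer, consumer;
  cudaStreamCreate(&producer);
  cudaStreamCreate(&consumer);

  cudaEvent_t done;
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

  // ... issue work into producer here; rule 1 orders it within the stream ...
  cudaEventRecord(done, producer);         // mark a point in producer
  cudaStreamWaitEvent(consumer, done, 0);  // consumer waits for that point
  // ... work issued into consumer after this call begins only once the
  //     producer work recorded above has completed ...

  cudaStreamSynchronize(consumer);
  cudaEventDestroy(done);
  cudaStreamDestroy(producer);
  cudaStreamDestroy(consumer);
  return 0;
}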

(Perhaps I am just being persnickety because I hate having to run through that decision tree myself when looking at others’ code.)

Thanks for your advice. I have improved my code and uploaded a complete version.

#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include <cassert>
#include <chrono>
#include <thread>

int main() {
  int *a_h, *b_h, *a_d, *b_d;
  cudaMallocHost(&a_h, sizeof(int));
  cudaMallocHost(&b_h, sizeof(int));
  cudaMalloc(&a_d, sizeof(int));
  cudaMalloc(&b_d, sizeof(int));
  cudaMemset(a_d, 0, sizeof(int));
  cudaMemset(b_d, 0, sizeof(int));

  cublasHandle_t blas_handle;
  cublasCreate(&blas_handle);
  cuComplex *in, *buffer;
  cudaMalloc(&in, 16 * 16 * 10 * sizeof(cuComplex));
  cudaMalloc(&buffer, 16 * 16 * 10 * sizeof(cuComplex));
  int stride_in = 16 * 16, stride_out = 16 * 16;
  int lda = 16, ldb = 16, ldc = 16;
  cuComplex alpha = make_cuComplex(1, 0), beta = make_cuComplex(0, 0);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  cuStreamWaitValue32(stream, reinterpret_cast<CUdeviceptr>(a_d), 1,
                      CU_STREAM_WAIT_VALUE_GEQ);
  cublasSetStream(blas_handle, stream);
  cublasCgemmStridedBatched(blas_handle, CUBLAS_OP_N, CUBLAS_OP_C, 16, 16, 16,
                            &alpha, in, lda, stride_in, in, ldb, stride_in,
                            &beta, buffer, ldc, stride_out, 10);
  cuStreamWriteValue32(stream, reinterpret_cast<CUdeviceptr>(b_d), 2,
                       CU_STREAM_WRITE_VALUE_DEFAULT);

  std::this_thread::sleep_for(std::chrono::seconds(1));
  cudaStream_t stream2;
  cudaStreamCreate(&stream2);
  cudaMemcpyAsync(b_h, b_d, sizeof(int), cudaMemcpyDeviceToHost, stream2);
  cudaStreamSynchronize(stream2);
  assert(*b_h == 0);

  *a_h = 1;
  cudaMemcpyAsync(a_d, a_h, sizeof(int), cudaMemcpyHostToDevice, stream2);
  std::this_thread::sleep_for(std::chrono::seconds(1));

  cudaMemcpyAsync(b_h, b_d, sizeof(int), cudaMemcpyDeviceToHost, stream2);
  cudaStreamSynchronize(stream2);
  assert(*b_h == 2);
}

The compilation command I used is nvcc test.cc -lcublas -lcuda. OS: Ubuntu 20.04, CUDA 12.4, driver 550.54.15, with CUDA_MODULE_LOADING=EAGER.

I tested it on different kinds of GPUs. On a Tesla V100 it works, but on an NVIDIA A30 it blocks.

Is there something I need to configure, similar to the lazy loading setting, or is it simply that the A30 does not support this?

Please post code inline, not as an attachment.

Sorry, updated.

I doubt the explanation is that the A30 does not support this; however, I cannot explain it at the moment. I don’t have an A30 to test on. I have tried two or three different machines, and the code runs without deadlock in each case. So there might be something unique or broken in the setup of that A30 machine.
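
One thing that might be worth checking on the A30 machine is whether eager loading is actually in effect there. A diagnostic sketch, assuming CUDA 11.7 or newer (where cuModuleGetLoadingMode is available), linked with -lcuda as in your compile command:

#include <cstdio>
#include <cuda.h>
#include <cuda_runtime.h>

int main() {
  cudaFree(0);  // force CUDA initialization so the loading mode is decided
  CUmoduleLoadingMode mode;
  cuModuleGetLoadingMode(&mode);
  printf("module loading mode: %s\n",
         mode == CU_MODULE_EAGER_LOADING ? "eager" : "lazy");
  return 0;
}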
