Does cudaStreamWaitEvent(stream2, event1, 0) also block the stream that records event1?

I want to achieve:

  • launch kernel 1 on stream 1, record event 1
  • let stream 2 wait for event 1, then launch kernel 2 on stream 2
  • launch kernel 3 on stream 1

The idea is that I want kernel 2 and kernel 3 to execute concurrently, while making sure kernel 2 executes after kernel 1.

However, with the following code, I find that kernel 2 and kernel 3 still execute sequentially:

#include <iostream>
#include <cuda_runtime.h>

// Kernel functions to perform computation
__global__ void kernel1(int64_t *data, int64_t repeat) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    for (int64_t i = 0; i < repeat; i++) {
        data[idx] += 1;
    }
}

__global__ void kernel2(int64_t *data, int64_t repeat) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    for (int64_t i = 0; i < repeat; i++) {
        data[idx] += 2;
    }
}

__global__ void kernel3(int64_t *data, int64_t repeat) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    for (int64_t i = 0; i < repeat; i++) {
        data[idx] -= 1;
    }
}

int main() {
    const int dataSize = 1024;
    const int printSize = 10;
    int64_t *h_data = new int64_t[dataSize]; // Host data
    int64_t *d_data1, *d_data2; // Device data

    // Initialize host data
    for (int i = 0; i < dataSize; i++) {
        h_data[i] = 0;
    }

    // Allocate memory on the device
    cudaMalloc((void**)&d_data1, dataSize * sizeof(int64_t));
    cudaMalloc((void**)&d_data2, dataSize * sizeof(int64_t));

    // Transfer data from host to device
    cudaMemcpy(d_data1, h_data, dataSize * sizeof(int64_t), cudaMemcpyHostToDevice);
    cudaMemcpy(d_data2, h_data, dataSize * sizeof(int64_t), cudaMemcpyHostToDevice);

    // Define grid and block dimensions
    dim3 blockDim(256);
    dim3 gridDim((dataSize + blockDim.x - 1) / blockDim.x);

    // Create streams and event
    cudaStream_t stream1, stream2;
    cudaEvent_t event1;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    cudaEventCreate(&event1);

    const int64_t repeat = 1000 * 100;

    // Execute kernel1 in stream1
    kernel1<<<gridDim, blockDim, 0, stream1>>>(d_data1, repeat);
    cudaEventRecord(event1, stream1); // Record event1 after kernel1 execution in stream1

    // Execute kernel2 in stream2, waiting for event1
    cudaStreamWaitEvent(stream2, event1, 0);
    kernel2<<<gridDim, blockDim, 0, stream2>>>(d_data1, repeat);

    // Execute kernel3 in stream1 on a different array
    kernel3<<<gridDim, blockDim, 0, stream1>>>(d_data2, repeat);

    // Synchronize streams
    cudaStreamSynchronize(stream1);
    cudaStreamSynchronize(stream2);

    // Transfer data back from device to host
    cudaMemcpy(h_data, d_data1, dataSize * sizeof(int64_t), cudaMemcpyDeviceToHost);

    // Display the result for d_data1
    std::cout << "Data after kernel1 and kernel2:" << std::endl;
    for (int i = 0; i < printSize; i++) {
        std::cout << h_data[i] << " ";
    }
    std::cout << std::endl;

    // Transfer data back from device to host for d_data2
    cudaMemcpy(h_data, d_data2, dataSize * sizeof(int64_t), cudaMemcpyDeviceToHost);

    // Display the result for d_data2
    std::cout << "Data after kernel3:" << std::endl;
    for (int i = 0; i < printSize; i++) {
        std::cout << h_data[i] << " ";
    }
    std::cout << std::endl;

    // Free device memory and destroy streams and event
    cudaFree(d_data1);
    cudaFree(d_data2);
    delete[] h_data;
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaEventDestroy(event1);

    return 0;
}

It seems kernel 3 is blocked by kernel 2, which is why I asked: does cudaStreamWaitEvent(stream2, event1, 0) also block the stream that records event1?

To answer the title question directly: no. cudaStreamWaitEvent(stream2, event1, 0) only makes work subsequently issued to stream2 wait until event1 completes; it places no dependency on the stream that recorded the event.

That said, even if you have satisfied all necessary conditions, CUDA provides no guarantees of any sort of kernel concurrency.

As a practical matter, to witness kernel concurrency, you should first verify that the GPU has sufficient resources to run both kernels at the same time. If you want help with this aspect, it’s a good idea to let others know what GPU you are running on.
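
If it helps, here is a minimal, self-contained sketch (my illustration, not code from the original post) that uses cudaOccupancyMaxActiveBlocksPerMultiprocessor to gauge how many blocks of a kernel the device can keep resident at once; with the 4-block grids used above, resources are unlikely to be the limiter on most GPUs:

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Stand-in with the same signature and resource usage as kernel2 above.
__global__ void kernel2(int64_t *data, int64_t repeat) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    for (int64_t i = 0; i < repeat; i++) data[idx] += 2;
}

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // How many 256-thread blocks of kernel2 fit on one SM
    // (0 bytes of dynamic shared memory, as in the post)?
    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, kernel2, 256, 0);

    // Each grid in the post is only 4 blocks, so concurrency is plausible
    // whenever the device can hold more than one such grid at a time.
    printf("%s: up to %d resident blocks (%d per SM x %d SMs)\n",
           prop.name, blocksPerSM * prop.multiProcessorCount,
           blocksPerSM, prop.multiProcessorCount);
    return 0;
}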

If you are running on CUDA 12.2 or newer, my guess would be that you are running into CUDA lazy module loading. In your test case, you call each kernel only once, so each kernel's first (and only) launch forces a module load, with an implicit device synchronization, if lazy loading is in effect. That would prevent any sort of kernel concurrency, even if you have otherwise properly provided for it.

You could test this by running (i.e. profiling) your code with:

CUDA_MODULE_LOADING=EAGER nsys profile ...
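
Alternatively, you can rule lazy loading out programmatically by warming up each kernel before the section you care about, since it is the first launch of a kernel that triggers the module load. A minimal sketch, reusing the kernels and device buffers from the code above:

// Warm-up launches: under lazy module loading, the first launch of each
// kernel triggers its module load (and the associated synchronization).
// Doing this once up front keeps those loads out of the region of interest.
kernel1<<<1, 1>>>(d_data1, 0);  // repeat = 0, so the warm-up does no work
kernel2<<<1, 1>>>(d_data1, 0);
kernel3<<<1, 1>>>(d_data2, 0);
cudaDeviceSynchronize();

// ...then issue the real kernel1 / event1 / kernel2 / kernel3 sequence as before.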

Thanks for the reply! Yes, it was caused by lazy module loading. Adding CUDA_MODULE_LOADING=EAGER helps. Thanks again!
