How to make a kernel's execution wait for a signal from another thread

tee cuda_event_test.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#include <iostream>
#include <chrono>
#include <thread>
#include <unistd.h>
#include <stdlib.h>

#define CUDA_CHECK(call) \
    do { \
        cudaError_t error = call; \
        if (error != cudaSuccess) { \
            fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(error)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

__global__ void dummyKernel(float *data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    atomicAdd(&data[idx], idx);
    if(idx==0)
    {
        printf("dummyKernel run\n");
    }
}

int main() {
    int devID0 = 0;
    int block_size=32;
    size_t dataSize = block_size * sizeof(float);
    float *data0_dev;

    CUDA_CHECK(cudaSetDevice(devID0));
    CUDA_CHECK(cudaMalloc(&data0_dev, dataSize));
    
    cudaStream_t stream0;
    CUDA_CHECK(cudaStreamCreate(&stream0));
    
    cudaEvent_t event;
    CUDA_CHECK(cudaEventCreate(&event));

    std::thread t([&]() {
            sleep(10);
            CUDA_CHECK(cudaEventRecord(event, stream0)); 
            printf("cudaEventRecord\n");
        });
    
    CUDA_CHECK(cudaStreamWaitEvent(stream0, event, 0));
    dummyKernel<<<1, block_size,0,stream0>>>(data0_dev); 

    CUDA_CHECK(cudaStreamSynchronize(stream0));
    CUDA_CHECK(cudaFree(data0_dev));
    t.join();
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o cuda_event_test cuda_event_test.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
./cuda_event_test

Please be a bit more clear:

“Make a kernel’s execution wait”, you mean the whole kernel should wait or one thread within the kernel should wait?

“from another thread”, you mean another CPU thread or another GPU thread within that kernel or another GPU thread within another simultaneous kernel?

the whole dummyKernel should wait thread ‘t’ done

You could just use t.join() before invoking the dummyKernel?

A few ideas in addition that the one already suggested:

  1. you could use an event based system similar to what you have shown. It can’t be constructed the way you have shown, for at least 2 reasons. First, an event recorded into an “empty” stream will immediately complete. Second, cudaStreamWaitEvent can’t be issued (properly) until the event has been recorded. This effectively imposes need for additional host-based synchronization between your various threads.
  2. you could use cuStreamWaitValue32 from the driver API.
  3. you could use a callback before the kernel that you want to wait. The callback would use ordinary host-based methods to wait on a signal from the thread t that it can complete. The kernel, launched after the callback, would not begin until the callback got that signal.

Sorry, I don’t have worked example/recipes/sample codes for all these ideas right at the moment, but all of them have been discussed in forum threads, which you can find with a bit of searching.

When people are working on cross-stream dependencies, I usually suggest they see if a refactoring can allow for the stream semantics to provide the ordering they are looking for. Although that description doesn’t exactly fit the suggestion already given by curefab, its in the same vein.