I have a question about communicating with a long-running kernel.
Suppose I have GPU threads running that just spin, checking whether a variable in global memory has been set to a particular value.
Can I signal those threads to start doing work by changing that value with cudaMemcpyAsync in another stream?
I have put together a toy example that tries to do exactly that, but so far without success:
#include <iostream>
#include <chrono>
#include <cstdio>
#include <thread>

#include "helper_cuda.h"

int* flag;
int* flag_gpu;

__global__ void start_threads(int* flag_addr){
    int global_id = blockDim.x * blockIdx.x + threadIdx.x;
    // read the flag once into a local copy
    volatile int flag = *flag_addr;
    // spin until that local copy equals 42
    while(flag != 42);
    printf("Thread[%d] doing work.\n", global_id);
}

int main(int argc, char const *argv[]){
    checkCudaErrors(cudaSetDevice(0));

    cudaStream_t stream1, stream2;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    checkCudaErrors(cudaMallocHost(&flag, sizeof(int)));
    checkCudaErrors(cudaMalloc(&flag_gpu, sizeof(int)));

    // set flag to 0
    *flag = 0;
    checkCudaErrors(cudaMemcpyAsync(flag_gpu, flag, sizeof(int), cudaMemcpyHostToDevice, stream1));

    start_threads<<<1, 32, 0, stream1>>>(flag_gpu);
    std::cout << "Started threads in stream 1.\n";
    checkCudaErrors(cudaGetLastError());

    std::this_thread::sleep_for(std::chrono::milliseconds(2000));

    // set flag to 42
    *flag = 42;
    checkCudaErrors(cudaMemcpyAsync(flag_gpu, flag, sizeof(int), cudaMemcpyHostToDevice, stream2));
    std::cout << "Wrote to flag in stream 2.\n";

    cudaStreamSynchronize(stream1);
    return 0;
}
I have some questions about this scenario:
- It seems to me that the threads will never see the change to the variable. Is that the case here, and is it the case in general? (I sketched a volatile-pointer variant after these questions.)
- Would I be able to reduce the overhead of launching the threads (or any other overhead) by letting the threads always run “in the background” and just signaling them when there is work available? (See the second sketch below.)
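
For the first question, here is a minimal variant of the kernel I was considering, where the spin loop reads global memory through a volatile pointer on every iteration instead of spinning on a local copy made once. The name start_threads_volatile is just for illustration, and I am not sure this is the recommended way to poll from device code; the host code would stay the same, only the kernel changes:

__global__ void start_threads_volatile(int* flag_addr){
    int global_id = blockDim.x * blockIdx.x + threadIdx.x;
    // the pointer (not a one-time local copy) is volatile, so every loop
    // iteration re-reads the value from global memory
    volatile int* flag = flag_addr;
    while(*flag != 42);
    printf("Thread[%d] doing work.\n", global_id);
}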
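
For the second question, what I have in mind is roughly a persistent kernel: the threads stay resident in one long-running launch and wait on a flag in mapped (zero-copy) pinned host memory, so the host can signal work with a plain store instead of relaunching a kernel or issuing a memcpy. The names persistent_worker and work_flag, the negative value as a shutdown signal, and the cudaHostAllocMapped approach are just my assumptions about how this could look; I am also aware that a spin loop like this could trip the display watchdog on a desktop GPU:

// same includes as above (iostream, chrono, thread, cstdio, helper_cuda.h)

__global__ void persistent_worker(volatile int* work_flag){
    int global_id = blockDim.x * blockIdx.x + threadIdx.x;
    while(true){
        while(*work_flag == 0);              // wait for the host to post work
        if(*work_flag < 0) return;           // negative value = shut down
        printf("Thread[%d] doing work.\n", global_id);
        __syncthreads();                     // all threads finished this item
        if(threadIdx.x == 0) *work_flag = 0; // acknowledge by clearing the flag
        __syncthreads();                     // nobody loops back before the clear
    }
}

int main(){
    checkCudaErrors(cudaSetDevice(0));
    // allow mapping pinned host memory into the device address space
    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));

    int* host_flag;
    int* dev_flag;
    checkCudaErrors(cudaHostAlloc((void**)&host_flag, sizeof(int), cudaHostAllocMapped));
    checkCudaErrors(cudaHostGetDevicePointer((void**)&dev_flag, host_flag, 0));
    *host_flag = 0;

    persistent_worker<<<1, 32>>>(dev_flag);
    checkCudaErrors(cudaGetLastError());

    std::this_thread::sleep_for(std::chrono::milliseconds(2000));
    *host_flag = 1;    // post one "work item" by writing directly to mapped memory
    std::this_thread::sleep_for(std::chrono::milliseconds(2000));
    *host_flag = -1;   // tell the kernel to exit
    checkCudaErrors(cudaDeviceSynchronize());
}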
Thanks in advance!