Hi all,
I have a molecular simulation that involves energy calculation a few thousand times per second. Since launching a kernel with a set of threads takes a lot of overhead, is it possible to start the threads at the beginning of the process and then have them pick up the input as it becomes available and then notify the host code when the processing is done?
I’m trying to do this with flags set in zero-copy memory by the host and the GPU, which indicate whether to wait, execute or exit (flags[0]). Similarly, flags[1] tells the host to pick up the results or wait. At this point, I have some sample code that doesn’t work (goes into infinite loop until Ctrl-C), but is pretty much a starting point.
Any suggestions would be very much appreciated.
Sasha
#include <iostream>
#define BLOCKS 1
#define THREADS 8
#define SIZE 10
__global__ void kernel(double* input, double* output, int size, int* flags){
while (1){
//check flags
if (flags[0] == 0) continue;
if (flags[0] == -1) break;
//flag == 1 - process input
int pass = 0;
while (1){
int step = THREADS;
int index = pass * step + threadIdx.x;
if (index >= size) break;
double v = input[index];
for (int i = 0; i < 10; i++){
v *= 2;
}
output[index] = v;
pass++;
}
//completed processing
flags[0] = 0;//reset back to wait
flags[1] = 1;//ready for the cpu to pick up
}
}
int main(){
//allocate GPU memory, set initial flags and launch the kernel
double* dev_input;
double* dev_output;
cudaMalloc((void**)&dev_input, SIZE * sizeof(double));
cudaMalloc((void**)&dev_output, SIZE * sizeof(double));
int* flags;
int* dev_flags;
cudaHostAlloc((void**)&flags, 2 * sizeof(int), cudaHostAllocMapped);
cudaHostGetDevicePointer(&dev_flags, flags, 0);
flags[0] = 0;
flags[1] = 0;
double* input = new double;
double* output = new double;
cudaMemcpy(dev_input, input, SIZE * sizeof(double), cudaMemcpyHostToDevice);
kernel<<<BLOCKS,THREADS>>>(dev_input, dev_output, SIZE, dev_flags);
//now go into a loop to use the launched threads
for (int i = 0; i < 10; i++){
//populate input data
for (int j = 0; j < SIZE; j++){
input[j] = i * j;
}
flags[1] = 0;//not done yet
flags[0] = 1;//execute command
while (1){
if (flags[1] >= 0) break;//wait for the threads to finish
sleep(10);
}
//copy results
cudaMemcpy(output, dev_output, SIZE * sizeof(double), cudaMemcpyDeviceToHost);
//check output
for (int j = 0; j < SIZE; j++){
printf("%lf\t%lf\n", input[j], output[j]);
}
}
free(input);
free(output);
cudaFree(dev_input);
cudaFree(dev_output);
cudaFree(flags);
}