Hi all,
I’m trying to run long running kernels on multiple devices in parallel, however I’m seeing a serialization of these kernels (i.e. the kernel to be run on device 1 waits for the kernel on device 0 to complete before starting). The link below shows the profiler output, illustrating this behavior.
I’ve built a very simple application which generated the profile output to demonstrate this behavior, included below. Can anyone explain why this could be happening? The CUDA docs state that there is no implied synchronization in cudaSetDevice, so I don’t understand why there should be any in this scenario.
Thanks for any pointers!
__global__ void kernel(int *in)
{
int i;
for(i = 0; i < 100000000; i++) {
atomicAdd(in+threadIdx.x, 1);
}
}
int main(int argc, char **argv)
{
int *d_in;
int *d_in2;
cudaSetDevice(0);
cudaMalloc((void **)&d_in, sizeof(int) * 256);
cudaMemset(d_in,0x00,sizeof(int)*256);
cudaSetDevice(1);
cudaMalloc((void **)&d_in2, sizeof(int) * 256);
cudaMemset(d_in2,0x00,sizeof(int)*25666666);
cudaSetDevice(0);
kernel<<<1,256>>>(d_in);
cudaSetDevice(1);
kernel<<<1,256>>>(d_in2);
cudaSetDevice(0);
cudaDeviceSynchronize();
cudaSetDevice(1);
cudaDeviceSynchronize();
cudaSetDevice(0);
cudaFree(d_in);
cudaSetDevice(1);
cudaFree(d_in);
}