Suppose there are two devices:
// Per-iteration driver: launch one kernel on each of the two devices, wait
// until both kernels have finished, then exchange data between the devices.
while (someCond())
{
    for (dev = 0; dev < 2; dev++)
    {
        // A kernel launch must be issued while the device that owns the
        // target stream is the current device.
        cudaSetDevice(dev);
        launch_kernel<<<grid, block, 0, stream[dev]>>>();
    }

    // cudaDeviceSynchronize() would only wait on the *current* device
    // (device 1 after the loop above), leaving device 0's kernel possibly
    // still running. Wait on each device's stream explicitly instead.
    for (dev = 0; dev < 2; dev++)
    {
        cudaStreamSynchronize(stream[dev]);
    }

    // This communication is required after every iteration.
    someCommunicationBtwDevice0andDevice1();
}
Now the question: at every iteration I have to call "cudaSetDevice()", which is very slow and takes around 320 ms. Is there a way to avoid calling "cudaSetDevice()" at every iteration?