Hello,
I use two 9800 GX2 cards, so four GPUs in total (Windows XP).
I launch four threads with similar length of calculation.
I measure the elapsed time for each GPU (see code below); every one of them finishes in less than 0.4 s.
However, the call “cutWaitForThreads(threadID, nb_gpu);” always takes 0.9–1.1 s to return, instead of something close to 0.4 s.
I have another machine with two 8800 GTX cards, where I do not have this problem.
Thanks in advance. :)
// Host-thread entry point: runs the computation for one card and stores how
// long it took.
//
// psio: per-card input/output struct. `index_card` is read to select the
//       device; `temps` receives the elapsed time in seconds, computed from
//       two clock() samples taken inside this function.
//
// NOTE(review): the timed interval starts after the thread is already running
// and stops before CUT_THREADEND, so thread start-up and anything that happens
// while the thread exits are NOT included in `temps`. The caller's total
// (measured around cutStartThread/cutWaitForThreads) does include them.
static CUT_THREADPROC GpuThread(SINPUT_OUTPUT * psio)
{
    clock_t start = clock();

    // Select the card this host thread will drive.
    CUDA_SAFE_CALL(cudaSetDevice(psio->index_card));
    // Synchronize once before the real work.
    // NOTE(review): presumably here to force any one-time device set-up to
    // happen before do_GPU — confirm; either way it is inside the timed span.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    // Launches the kernel and ends with a CUDA_SAFE_CALL( cudaThreadSynchronize() ).
    do_GPU(psio);

    clock_t end = clock();
    // Observed result: 0.35 sec / 0.40 sec :)
    psio->temps = (float)(end - start) / (CLOCKS_PER_SEC);

    CUT_THREADEND;
}
// Program entry point (argument list and early set-up are elided in the post).
// Starts one host thread per detected GPU, waits for all of them to finish and
// prints the total elapsed time measured with clock().
main (…)
{
…
    SINPUT_OUTPUT sinput_output[4]; // one struct per card
    CUTThread threadID[4];
    int gpuIndex,nb_gpu;
    // Capacity of the two fixed-size arrays above.
    const int max_gpu = (int)(sizeof(threadID) / sizeof(threadID[0]));

    CUT_DEVICE_INIT(argc,argv);
    CUDA_SAFE_CALL(cudaGetDeviceCount(&nb_gpu));
    // The device count comes from the runtime; never start more threads than
    // the arrays can hold, otherwise the loop below writes past their end.
    if (nb_gpu > max_gpu)
        nb_gpu = max_gpu;

    clock_t begin = clock();
    // Start one CPU thread per GPU.
    // NOTE(review): sinput_output[gpuIndex].index_card is not assigned anywhere
    // in the visible code, yet GpuThread passes it to cudaSetDevice — confirm
    // it is initialised before the thread is started.
    for(gpuIndex = 0; gpuIndex < nb_gpu; gpuIndex++)
        threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE) GpuThread, &sinput_output[gpuIndex]);

    // Blocks until every started thread has finished.
    cutWaitForThreads(threadID, nb_gpu);
    clock_t end = clock();

    // This interval covers thread creation, the per-thread work and the wait
    // for the threads to finish, so it is expected to exceed the per-thread
    // `temps` values, which are measured inside GpuThread only.
    printf("calculation in %f s\n",(float)(end - begin) / CLOCKS_PER_SEC);
    // Observed result: 0.90/1.1 sec!!! :( :( :(
}