Hi,
Here’s the scenario:
1st thread performs some device operations and then calls cudaEventRecord(event); (NOTE: This event was created with flag ‘cudaEventBlockingSync’)
2nd thread waits for this event with cudaEventSynchronize(event) and then performs some CPU operations.
Code outline:
for(int i=0;i<5;i++)
cudaEventCreateWithFlags(&event[i], cudaEventBlockingSync);
omp_set_num_threads(2);
#pragma omp parallel shared(event)
{
int ti = omp_get_thread_num();
if(ti == 0)
{
for(int i=0;i<5;i++)
{
//Some device operations
cudaEventRecord(event[i]);
}
}
else
{
for(int i=0;i<5;i++)
{
cudaEventSynchronize(event[i]); //Doesn't wait till completion of event[i] :(
//Some CPU operations
}
}
}
As shown in the code, the cudaEventSynchronize(event[i]) doesn’t block the CPU thread 1 till completion of device operations done by CPU thread 0 for ‘i’ (As seen in Visual Profiler).
Can anybody tell what I might be doing wrong?
Thanks in Advance! :)