I'm using a function I read about in a blog to synchronize blocks. When the grid size was small, my program worked well, but when the grid size is large (more than 200 blocks in my program), the program gets stuck. How can I solve this problem? Here is my code.
/**
 * Software grid-wide barrier: returns only after every block of the grid has
 * called it with the same goalVal.
 *
 * Protocol: each block publishes goalVal into its slot of Arrayin; block 0
 * waits until every slot of Arrayin holds goalVal and then writes goalVal into
 * every slot of Arrayout; each block waits for its own Arrayout slot.
 *
 * @param goalVal   Value identifying this barrier episode. Because completion
 *                  is detected by equality with goalVal, it must differ from
 *                  the values left in the arrays by the previous call
 *                  (e.g. increment it on every call).
 * @param Arrayin   Arrival flags, gridDim.x * gridDim.y elements.
 * @param Arrayout  Release flags, gridDim.x * gridDim.y elements.
 *
 * Preconditions:
 *  - Must be called by ALL threads of ALL blocks (it contains __syncthreads()).
 *  - Only gridDim.x and gridDim.y are used to index the flags, so the grid is
 *    expected to have gridDim.z == 1.
 *  - Every block of the grid must be resident on the GPU at the same time.
 *    Blocks spin while waiting, so if the grid is larger than what the device
 *    can keep resident, the waiting blocks never free an SM for the blocks that
 *    have not started yet and the kernel hangs. This function cannot fix that:
 *    size the grid to the device's occupancy limit, or use
 *    cooperative_groups::this_grid().sync() with cudaLaunchCooperativeKernel,
 *    which rejects grids that are too large instead of deadlocking.
 */
__device__ void __gpu_sync(int goalVal, volatile int *Arrayin, volatile int *Arrayout)
{
    // Flattened thread index / thread count, so 1D, 2D and 3D blocks all have
    // exactly one "thread 0" and a well-defined stride for the loops below.
    const int tid_in_blk = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
    const int nThreads = blockDim.x * blockDim.y * blockDim.z;
    const int nBlockNum = gridDim.x * gridDim.y;
    const int bid = blockIdx.y * gridDim.x + blockIdx.x;

    // Make this thread's earlier global writes visible device-wide, then wait
    // for the whole block, so that arrival is only announced once every thread
    // of the block has actually reached the barrier.
    __threadfence();
    __syncthreads();

    // One thread per block announces the block's arrival.
    if (tid_in_blk == 0)
    {
        Arrayin[bid] = goalVal;
    }

    // Block 0 is the collector (it exists for every grid size, including a
    // single-block grid). The condition is uniform across the block, so the
    // __syncthreads() inside is reached by all of its threads.
    if (bid == 0)
    {
        // Block-stride loop: covers every slot even when the grid has more
        // blocks than this block has threads.
        for (int b = tid_in_blk; b < nBlockNum; b += nThreads)
        {
            while (Arrayin[b] != goalVal)
            {
                // Spin until block b has arrived.
            }
        }
        // Order the arrival reads before the release writes, and make sure
        // all slots have been observed before anyone is released.
        __threadfence();
        __syncthreads();
        for (int b = tid_in_blk; b < nBlockNum; b += nThreads)
        {
            Arrayout[b] = goalVal;
        }
    }

    // One thread per block waits for the release flag of its block.
    if (tid_in_blk == 0)
    {
        while (Arrayout[bid] != goalVal)
        {
            // Spin until the collector releases this block.
        }
        // Order the flag read before any post-barrier reads of other blocks' data.
        __threadfence();
    }
    // Hold the remaining threads of the block until thread 0 has been released.
    __syncthreads();
}