I am writing an algorithm that needs to execute its tasks in a specific order. To work within this restriction, I have been calling my kernel with 1 block and X threads. I then just increment a counter in the kernel to deal with the fact that I have fewer threads than vector elements.
In some tests, I found that if I increase the number of blocks to allow for 1 thread per element, the computation time is cut nearly in half (obviously…). Is there any way to make the blocks execute sequentially?
You can use atomicAdd(), described in Appendix B.11 of the CUDA C Programming Guide, to hand out block IDs in the order blocks are scheduled.
For example:
// Kernel skeleton: each thread block atomically claims a logical block ID,
// so IDs are handed out in the order the hardware actually schedules blocks.
// d_atomicBlockID must point to a device int initialized to 0 before launch.
__global__ void foo( int *d_atomicBlockID, int num_blocks,...)
{
__shared__ int s_blockId ;
if ( threadIdx.x == 0 ){
// One thread per block draws the next ID from the global counter.
s_blockId = atomicAdd( d_atomicBlockID, 1 ) ;
}
__syncthreads(); // barrier: make s_blockId visible to every thread in the block
int bid = s_blockId; // all threads in this thread block work on logical block bid
if (bid >= num_blocks ){
return ; // no work left for this block
}
// do your computation on block bid
}
// Host driver: allocates and zeroes the block-ID counter, launches the
// kernel, checks for both launch and asynchronous execution errors, and
// releases the allocation before exiting.
int main(void)
{
cudaError_t status ;
int *d_atomicBlockID = NULL ;
status = cudaMalloc((void**)&d_atomicBlockID, sizeof(int));
assert( cudaSuccess == status );
// Counter must start at zero so the first scheduled block receives ID 0.
status = cudaMemset(d_atomicBlockID, 0, sizeof(int));
assert( cudaSuccess == status );
// ... prepare other data, and set grid, threads, num_blocks ...
foo<<< grid, threads>>>( d_atomicBlockID, num_blocks /*, ... other args */ ); // fix: missing ';' in original
// cudaGetLastError() only catches launch-configuration errors.
status = cudaGetLastError();
assert( cudaSuccess == status );
// Synchronize to surface asynchronous errors from the kernel itself.
status = cudaDeviceSynchronize();
assert( cudaSuccess == status );
// fix: original leaked this allocation.
status = cudaFree(d_atomicBlockID);
assert( cudaSuccess == status );
// cudaThreadExit() is deprecated; cudaDeviceReset() is its replacement.
cudaDeviceReset();
return 0 ;
}