Appendix B.5 (3.2 programming guide) has some literature about synchronizing across blocks - that is, having the last block done do something. I implemented the code, but I don’t think it is working properly. I decided to have the last block print out its number - the printf statement prints out about 23-24 times, but it should be printing 33 times based on the data.
Here is my code
__device__ unsigned int count = 0;
__shared__ bool isLastBlockDone;
__global__ void U(cuDoubleComplex *a, cuDoubleComplex* b, cuDoubleComplex *c, cuDoubleComplex *newX){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int num = N-1;
while(num >= 0){
if(a[tid].py==num && a[tid].px > num){
newX[a[tid].px] = a[tid] * c[a[tid].px];
__threadfence(); // testing...
}
__syncthreads();
if(threadIdx.x==0){
__threadfence(); // code in apendix
unsigned int value = atomicInc(&count, gridDim.x);
isLastBlockDone = (value == (gridDim.x - 1));
}
__syncthreads();
if(isLastBlockDone){
b[num] = SUM(b,c,newX,num);
if(threadIdx.x==0){
c[num] = b[num];
count = 0;
printf("Block: %d\n", blockIdx.x);
}
}
__syncthreads();
num--;
}
}
I’m calling the kernel:
int nTU = 16;
int numBlocks = (int)ceil((float)ne/nTU); // ne = number of array elements
U<<<numBlocks,nTU>>>(d_a, d_b, d_c, d_newX);
The output is:
Block: 1
Block: 12
Block: 13
Block: 16
Block: 17
Block: 22
Block: 22
Block: 25
Block: 26
Block: 30
Block: 34
Block: 36
Block: 4
Block: 40
Block: 41
Block: 43
Block: 43
Block: 46
Block: 48
Block: 5
Block: 55
Block: 57
Block: 58
Block: 8
Any help would be greatly appreciated.