Memory fence functions

Appendix B.5 (3.2 programming guide) has some literature about synchronizing across blocks - that is, having the last block done do something. I implemented the code, but I don’t think it is working properly. I decided to have the last block print out its number - the printf statement prints out about 23-24 times, but it should be printing 33 times based on the data.

Here is my code

__device__ unsigned int count = 0;

__shared__ bool isLastBlockDone;

__global__ void U(cuDoubleComplex *a, cuDoubleComplex* b, cuDoubleComplex *c, cuDoubleComplex *newX){

	int tid = blockIdx.x * blockDim.x + threadIdx.x;

	int num = N-1;

	while(num >= 0){

		if(a[tid].py==num && a[tid].px > num){

			newX[a[tid].px] = a[tid] * c[a[tid].px];

			__threadfence(); // testing...

		}

		__syncthreads();

		if(threadIdx.x==0){

			__threadfence();  // code in apendix

			unsigned int value = atomicInc(&count, gridDim.x);

			isLastBlockDone = (value == (gridDim.x - 1));	

		}

		__syncthreads();

		if(isLastBlockDone){

			b[num] = SUM(b,c,newX,num);

				

			if(threadIdx.x==0){

				c[num] = b[num];

				count = 0;

				printf("Block: %d\n", blockIdx.x);

			}

		}

		__syncthreads();

		num--;

	}

}

I’m calling the kernel:

int nTU = 16;

int numBlocks = (int)ceil((float)ne/nTU); // ne = number of array elements

U<<<numBlocks,nTU>>>(d_a, d_b, d_c, d_newX);

The output is:

Block: 1

Block: 12

Block: 13

Block: 16

Block: 17

Block: 22

Block: 22

Block: 25

Block: 26

Block: 30

Block: 34

Block: 36

Block: 4

Block: 40

Block: 41

Block: 43

Block: 43

Block: 46

Block: 48

Block: 5

Block: 55

Block: 57

Block: 58

Block: 8

Any help would be greatly appreciated.

No need to double post in different forum sections.