It works!
There is no need to divide it into 4 blocks.
// Counts the elements of A that are equal to 5 and adds that count to *a.
//
// Each thread inspects exactly one element, A[x][y], where x and y are the
// thread's global indices along the x and y launch dimensions.
//
// Parameters:
//   A - array of row pointers; both the pointer array and every row it points
//       to are dereferenced on the device, so they must be device-accessible.
//   a - device-accessible counter. The kernel only ever increments it, so the
//       caller is responsible for setting it to the desired starting value
//       (normally 0) before the launch.
//
// Preconditions:
//   There is no bounds check (the signature carries no dimensions), so the
//   launch configuration must cover exactly the valid index range:
//   gridDim.x * blockDim.x valid first indices and gridDim.y * blockDim.y
//   valid second indices.
__global__ void nupspin(double **A, int *a)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (A[x][y] == 5.0) {
        // atomicAdd serialises concurrent increments from all matching
        // threads, so no additional synchronisation is needed here.
        // NOTE: the previous __syncthreads() inside this data-dependent branch
        // was removed: a block barrier that only some threads of the block
        // reach is undefined behaviour and can hang the kernel.
        atomicAdd(a, 1);
    }
}
Result
============GPU===============
0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0
1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0
2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0
3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0
4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0
5.0 6.0 7.0 8.0 9.0 10.0 11.0 12.0
6.0 7.0 8.0 9.0 10.0 11.0 12.0 13.0
7.0 8.0 9.0 10.0 11.0 12.0 13.0 14.0
We have 5, 6 elements.
:]