I am just a beginner to the world of CUDA.
I read that the scope of device shared variable is only block. But in my code I am observing that all the blocks can access it. The program output is 90, but I am confused why it is not 45? The 10 threads of one block will add (0+1+…+9)=45. But 20 threads from 2 different blocks can access this shared variable. Would anyone please expalin?
#include <stdio.h>
#define block 2
#define thread 10
__device__ __shared__ int sum;
__global__ static void hello(int *N)
{
int tx=threadIdx.x;
sum+=N[tx];
}
int main()
{
int data[10]={0,1,2,3,4,5,6,7,8,9};
int *num;
cudaMalloc((void**)&num,sizeof(int)*10);
cudaMemcpy(num,data,sizeof(int)*10,cudaMemcpyHostToDevice);
hello <<<block ,thread>>>(num);
printf("%d",sum);
return 0;
}