Hello,
…
let:
// *****************************
global void cuda_sum(int *dev_A,int *dev_sum)
{
int i = blockIdx.x;
dev_sum[0]+=dev_A[i];
}
// ******************************
Why dev_sum[0] not give the sum of the elements of the dev_A.
thank you.
//+++++++++++++++++++++++++++++
// you can find a test here
//+++++++++++++++++++++++++++++
int main()
{
int A[10]={1,2,3,4,5,6,7,8,9,10};
int sum[1]={0};
int *dev_A, *dev_sum;
cudaMalloc((void**)&dev_A, 10 * sizeof(int));
cudaMemcpy(dev_A, A, 10 * sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**)&dev_sum, sizeof(int));
cudaMemcpy(dev_sum, sum, 1 * sizeof(int), cudaMemcpyHostToDevice);
cuda_sum<<<10,1>>>(dev_A,dev_sum); // dev_sum[0] = dev_A[0]+dev_A[0]+ ... +dev_A[9]
cudaMemcpy(sum, dev_sum, 1 * sizeof(int), cudaMemcpyDeviceToHost);
printf(" 55 ... %5d \n",sum[0]);
return 0;
}