problem of a simple CUDA program cuda

I wrote a simple CUDA program

#include <stdio.h>

#include <stdlib.h>

#include <cutil_inline.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <math.h>

__global__ void thread_kernel(int *pixel,int *poids,int *sum,int *asum,int *ret)

{

  int i=blockIdx.x*blockDim.x+threadIdx.x;

  int n=8;

*sum=0;

  *asum=0;

// for(j=0;j<8;j++){

  if(i<n)

  {

	*sum=*sum+pixel[i]*poids[i];

	*asum=*asum+poids[i];

   // for(j=0;j<8;j++) 

	ret[i]=i;

  }

}

int main()

{

int n=8;

  int i;

  int h_pixel[]={23,21,34,2,2,3,4,45};

  int h_poids[]={20,23,56,8,7,4,9,77};

  int *h_sum, *h_asum;

int *pixel;

  int *poids;

  int *sum;

  int *asum;

  int *ret,*h_ret;

dim3 grid(1,1,1);

  dim3 block(8,1,1);

cudaMalloc((void**) &pixel,sizeof(int)*n);

  cudaMalloc((void**) &poids,sizeof(int)*n);

  cudaMalloc((void**) &sum,sizeof(int));

  cudaMalloc((void**) &asum,sizeof(int));

  cudaMalloc((void**) &ret,sizeof(int)*8);

  h_sum=(int *)malloc(sizeof(int));

  h_asum=(int *)malloc(sizeof(int));

  h_ret=(int *)malloc(sizeof(int)*8);

cudaMemcpy(pixel,h_pixel,sizeof(int)*n,cudaMemcpyHostToDevice);

  cudaMemcpy(poids,h_poids,sizeof(int)*n,cudaMemcpyHostToDevice);

thread_kernel<<<grid,block>>>(pixel,poids,sum,asum,ret);

cudaMemcpy(h_sum,sum,sizeof(int),cudaMemcpyDeviceToHost);

  cudaMemcpy(h_asum,asum,sizeof(int),cudaMemcpyDeviceToHost);

  cudaMemcpy(h_ret,ret,sizeof(int)*8,cudaMemcpyDeviceToHost);

printf("sum=%d\nasum=%d\n",*h_sum,*h_asum);

for(i=0;i<8;i++)  printf("%d\n",h_ret[i]);

cudaFree(pixel);

  cudaFree(poids);

  cudaFree(sum);

  cudaFree(asum);

  cudaFree(ret);

  free(h_sum);

  free(h_asum);

  free(h_ret);

}

but the result is:

sum=3465

asum=77

0

1

2

3

4

5

6

7

“sum” is just the product of the last element of “pixels” and “poids”, “asum” is the sum of the last element of “pixel” and “poids”, but what i need is that the sum of all the product of “pixel” and “poids”. I don’t know why…

and another question is that if I change the "dim3 grid(1,1,1); dim3 block(8,1,1); " to " dim3 grid(n/3+n%3==0?0:1,1,1); dim3 block(3,1,1) " , the result becomes

sum=1904

asum=56

0

1

2

3

4

5

6

7

“sum” is just the product of the third element of “pixel” and “poids”, “asum” is also the sum of the third element of “pixel” and “poids”

I really have no idea…

You have a race condition on sum and asum. Everything running thread of every active block will attempt to simultaneously update the values of those two variables, but one one thread per warp of thirty two threads is actually guaranteed by the hardware to update that location. The results will be totally meaningless. If you want to perform parallel summation, then you need to use a form of reduction algorithm. The SDK contains a whitepaper and about 6 different reduction implementations of differing complexity and performance.

Thanks a lot :) I will look at the SDK samples

I used the sum reduction. when I set the block_size=1,i.e there is one block, it works well. but when I use several blocks, it just works on the first block. Why?

Thanks

You need to call the kernel twice. The second time the input will be the output of the first kernel. You need to add the partial sum for each block to get the final result! Look at SDK sample Reduction for more details.