I wrote a simple CUDA program
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
__global__ void thread_kernel(int *pixel,int *poids,int *sum,int *asum,int *ret)
{
int i=blockIdx.x*blockDim.x+threadIdx.x;
int n=8;
*sum=0;
*asum=0;
// for(j=0;j<8;j++){
if(i<n)
{
*sum=*sum+pixel[i]*poids[i];
*asum=*asum+poids[i];
// for(j=0;j<8;j++)
ret[i]=i;
}
}
int main()
{
int n=8;
int i;
int h_pixel[]={23,21,34,2,2,3,4,45};
int h_poids[]={20,23,56,8,7,4,9,77};
int *h_sum, *h_asum;
int *pixel;
int *poids;
int *sum;
int *asum;
int *ret,*h_ret;
dim3 grid(1,1,1);
dim3 block(8,1,1);
cudaMalloc((void**) &pixel,sizeof(int)*n);
cudaMalloc((void**) &poids,sizeof(int)*n);
cudaMalloc((void**) &sum,sizeof(int));
cudaMalloc((void**) &asum,sizeof(int));
cudaMalloc((void**) &ret,sizeof(int)*8);
h_sum=(int *)malloc(sizeof(int));
h_asum=(int *)malloc(sizeof(int));
h_ret=(int *)malloc(sizeof(int)*8);
cudaMemcpy(pixel,h_pixel,sizeof(int)*n,cudaMemcpyHostToDevice);
cudaMemcpy(poids,h_poids,sizeof(int)*n,cudaMemcpyHostToDevice);
thread_kernel<<<grid,block>>>(pixel,poids,sum,asum,ret);
cudaMemcpy(h_sum,sum,sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(h_asum,asum,sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(h_ret,ret,sizeof(int)*8,cudaMemcpyDeviceToHost);
printf("sum=%d\nasum=%d\n",*h_sum,*h_asum);
for(i=0;i<8;i++) printf("%d\n",h_ret[i]);
cudaFree(pixel);
cudaFree(poids);
cudaFree(sum);
cudaFree(asum);
cudaFree(ret);
free(h_sum);
free(h_asum);
free(h_ret);
}
but the result is:
sum=3465
asum=77
0
1
2
3
4
5
6
7
“sum” is just the product of the last element of “pixels” and “poids”, “asum” is the sum of the last element of “pixel” and “poids”, but what i need is that the sum of all the product of “pixel” and “poids”. I don’t know why…
and another question is that if I change the "dim3 grid(1,1,1); dim3 block(8,1,1); " to " dim3 grid(n/3+n%3==0?0:1,1,1); dim3 block(3,1,1) " , the result becomes
sum=1904
asum=56
0
1
2
3
4
5
6
7
“sum” is just the product of the third element of “pixel” and “poids”, “asum” is also the sum of the third element of “pixel” and “poids”
I really have no idea…