Hello!
I’m trying to summ all elements of array a_h with dimension arr_size using CUDA.
As a sample, all array elements are 1.0, so an answer must be exactly arr_size. But it shows just small inadequate values which depends of number of blocks per grid.
Why it could be so? I’ve tried everything including device variables etc, but it doesn’t work! Can anyone help me?
Every thread in block is executed and also block have a proper size. May be result_d pointer is different for ech block or something about it?..
Thanks very much/ External Image
The code is following:
/////////////////////////////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
//summing kernel
global void test_kernel(float *a_d, float *result_d, int array_size){
int index=blockDim.x*blockIdx.x+threadIdx.x;
if(index<array_size){
*result_d+=a_d[index];
};
};
int main(void){
int arr_size=1000;
float *a_h,*result_h,*a_d,*result_d;
//allocation of arrays and result variables in both memories
cudaMalloc(&a_d,sizeof(float)*arr_size);
cudaMalloc(&result_d,sizeof(float));
a_h=new float[arr_size];
result_h=new float;
*result_h=(float)0.0;
for(int i=0;i<arr_size;i++){
a_h[i]=(float)1.0;
};
//coping array and zero result variable to device memory
cudaMemcpy(a_d,a_h,sizeof(float)*arr_size,cudaMemcpyHostToDe
vice);
cudaMemcpy(result_d,result_h,sizeof(float),cudaMemcpyHostToD
evice);
int threads_p_block=256;
int blocks_count=arr_size/threads_p_block+1;
test_kernel<<<blocks_count,threads_p_block>>>(a_d,result_d,arr_size);
cudaMemcpy(result_h,result_d,sizeof(float),cudaMemcpyDeviceT
oHost);
printf("result=%f\n",*result_h);
return 1;
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////