Reduction operation returns incorrect result


The sum of an array (with 1681 elements) is computed using reduction approach. This approach works fine for arrays with number of elements less than 1024. But when, the number of elements is >1024 then it gives wrong results. The code snippet is provided below and the sample array is attached herewith. Can anyone suggest how to resolve this problem?

//const int N1 = 25;
const int N1 = 1681;
const int BLOCKS = 1;
const int halfblock = 512;

void global Compute_Sum(double * const out1,
double const * const in){
shared double sdata[N1];
sdata[threadIdx.x] = in[threadIdx.x + blockIdx.x*N1];
for (int i = halfblock; i>0; i>>=1){
if ((threadIdx.x < i) && (threadIdx.x+i < N1))
sdata[threadIdx.x] += sdata[threadIdx.x+i];
if (!threadIdx.x){
out1[blockIdx.x] = sdata[0];

int main(int argc,char **argv) {

//FILE *f1=fopen("data_25.txt","r");
FILE *f1=fopen("data_1681.txt","r");
	double *h_data_in = (double*) malloc(N1*sizeof(double));
double *h_sum = ((double*) malloc(BLOCKS*sizeof(double)));

if(f1==NULL) return 1;
for(int i = 0; i < N1; ++i) {fscanf(f1, "%lf",&h_data_in[i]);}
    h_sum[0] = 0.0f;
for (int jj = 0; jj < N1; jj++){printf("\n h_data_in[%d] = %0.15f \n",jj,h_data_in[jj]);}

    double *d_data_in;
double *d_sum;

cudaMalloc((void **) &d_data_in,N1*sizeof(double));
cudaMalloc((void **) &d_sum,BLOCKS*sizeof(double));

  	Compute_Sum <<<BLOCKS,N1>>> (d_sum,d_data_in); 
cudaMemcpy(h_sum, d_sum,BLOCKS*sizeof(double), cudaMemcpyDeviceToHost);
printf("\n Sum of %d elements = %0.15f \n",N1,h_sum[0]);

return 0;



data_1681.txt (17.4 KB)

I suggest using proper CUDA error checking (google that), and run your code with cuda-memcheck before asking for help here.

In CUDA it is illegal to have more than 1024 threads per block:

const int N1 = 1681;
Compute_Sum <<<BLOCKS,N1>>> (d_sum,d_data_in);