Thanks txbob for your valuable inputs. I tried to run the block reduction approach for sample dataset and it works fine. To begin with I tested for ngroups = 1 and groupsz = 441.

To measure the kernel run time, I use cudaEventRecord() API as below. Using this approach, the

run time is measured as 0.207648 ms.

[b]

But for the example code above, when I ran it on a Tesla K20c card I got the results below:

441,441,441,441,441,

thrust time: 0.000807 seconds

441,441,441,441,441,

kernel time: 0.000396 seconds[/b]

I would like to know the reason behind this sudden increase in run time. Any suggestions would be helpful.

Thanks.

const int ngroups = 1;     // number of independent groups; one block per group
const int groupsz = 441;   // elements per group; must equal threads per block
const int halfblock = 256; // minimum power of 2 >= half of groupsz (ceil(441/2)=221 -> 256)

// Sum-reduction of one group per block, entirely in shared memory.
//
// Launch contract: <<<ngroups, groupsz>>> — blockDim.x == groupsz, so every
// thread loads exactly one element. Shared memory: groupsz doubles (static).
// Writes one partial sum per block to out[blockIdx.x].
// Requires SM13+ for double-precision arithmetic.
//
// NOTE: original forum paste mangled "__global__"/"__shared__" into
// "**global**"/"**shared**"; restored here.
__global__ void blockReduction(double *out, const double * __restrict__ in){
    __shared__ double sdata[groupsz];
    // Stage this block's contiguous slice of the input (coalesced load).
    sdata[threadIdx.x] = in[threadIdx.x + blockIdx.x*groupsz];
    __syncthreads();
    // Pairwise tree reduction. The second guard (threadIdx.x+i < groupsz)
    // makes the first pass safe when groupsz is not a power of two: lanes
    // whose partner index falls past the end simply keep their own value.
    for (int i = halfblock; i > 0; i >>= 1){
        if ((threadIdx.x < i) && (threadIdx.x + i < groupsz))
            sdata[threadIdx.x] += sdata[threadIdx.x + i];
        // Barrier is outside the divergent if, so every thread reaches it.
        __syncthreads();
    }
    // Thread 0 publishes the block's total.
    if (!threadIdx.x) out[blockIdx.x] = sdata[0];
}

// Reads groupsz doubles from a text file, sums them on the GPU with
// blockReduction, and reports the kernel time measured with CUDA events.
// Returns 0 on success, 1 on any I/O or CUDA failure.
int main()
{
    int i;

    // Host buffers: full input group + one partial sum per block.
    double *h_A = (double*)malloc(groupsz * sizeof(double));
    double *h_sum = (double*)malloc(ngroups * sizeof(double));
    if (h_A == NULL || h_sum == NULL) { return 1; }

    FILE *A_f = fopen("/home/…/file.txt", "r");
    if (A_f == NULL) { free(h_A); free(h_sum); return 1; }

    // Check each conversion: a short or malformed file would otherwise leave
    // uninitialized memory feeding the reduction.
    for (i = 0; i < groupsz; ++i) {
        if (fscanf(A_f, "%lf", &h_A[i]) != 1) {
            fclose(A_f); free(h_A); free(h_sum); return 1;
        }
    }
    fclose(A_f);

    for (i = 0; i < groupsz; ++i) {
        printf("\n h_A[%d]=%0.15f \n", i, h_A[i]);
    }

    for (i = 0; i < ngroups; ++i) {
        h_sum[i] = 0.0;  // double buffer — use a double literal
    }

    // Device buffers. (Fixed from the forum paste: declaration was garbled to
    // "double *d_A, **d_sum;" and both cudaMalloc sizes had lost the '*'.)
    double *d_A, *d_sum;
    if (cudaMalloc(&d_A, groupsz * sizeof(double)) != cudaSuccess) { return 1; }
    if (cudaMalloc(&d_sum, ngroups * sizeof(double)) != cudaSuccess) { return 1; }
    cudaMemcpy(d_A, h_A, groupsz * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_sum, h_sum, ngroups * sizeof(double), cudaMemcpyHostToDevice);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float elapsedTime1 = 0.0f;

    // GPU sum. Record both events on the same (default) stream around the
    // launch only — the original inserted cudaDeviceSynchronize() BEFORE
    // recording 'stop', which adds host-sync latency to the measured time
    // and is the likely cause of the inflated 0.2 ms reading.
    cudaEventRecord(start, 0);
    blockReduction<<<ngroups, groupsz>>>(d_sum, d_A);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);  // waits for kernel + stop event to complete
    if (cudaGetLastError() != cudaSuccess) { return 1; }  // launch/exec errors
    cudaEventElapsedTime(&elapsedTime1, start, stop);

    printf("\n\n T1: Measured Time for sum of a sub-image = %f ms\n\n", elapsedTime1);

    // Blocking copy, so h_sum is valid to read immediately afterwards.
    cudaMemcpy(h_sum, d_sum, ngroups * sizeof(double), cudaMemcpyDeviceToHost);

    for (i = 0; i < ngroups; ++i) {
        printf("\n h_sum[%d]=%0.15f \n", i, h_sum[i]);
    }

    // Release device and host resources (the original leaked all of them).
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_A);
    cudaFree(d_sum);
    free(h_A);
    free(h_sum);
    return 0;
}