Thanks txbob for your valuable inputs. I tried to run the block reduction approach for sample dataset and it works fine. To begin with I tested for ngroups = 1 and groupsz = 441.
To measure the kernel run time, I use cudaEventRecord() API as below. Using this approach, the
run time is measured as 0.207648 ms.
[b]
But for the example code above, when i ran on Tesla K20C card I got the results below:
441,441,441,441,441,
thrust time: 0.000807 seconds
441,441,441,441,441,
kernel time: 0.000396 seconds[/b]
I would like to know the reason behind this sudden increase in run time. Any suggestions shall be useful.
Thanks.
const int ngroups = 1;
const int groupsz = 441;
const int halfblock = 256; // choose as the minimum power of 2 greater than or equal to half of the groupsz
// for this kernel we assume that we launch with block size (threads per block) == groupsz
global void blockReduction(double *out, double *in){
shared double sdata[groupsz];
sdata[threadIdx.x] = in[threadIdx.x + blockIdx.x*groupsz];
__syncthreads();
for (int i = halfblock; i>0; i>>=1){
if ((threadIdx.x < i) && (threadIdx.x+i < groupsz))
sdata[threadIdx.x] += sdata[threadIdx.x+i];
__syncthreads();}
if (!threadIdx.x) out[blockIdx.x] = sdata[0];
}
int main()
{
FILE *A_f = fopen(“/home/…/file.txt”, “r”);
int i;
double h_A = (double)malloc(groupsz* sizeof(double));
double h_sum = (double)malloc(ngroups* sizeof(double));
if (A_f == NULL) { return 1; }
for (i = 0;i<groupsz;++i) {
fscanf(A_f, “%lf”, &h_A[i]);
}
for (i = 0;i<groupsz;++i) {
printf(“\n h_A[%d]=%0.15f \n”,i,h_A[i]);
}
for (i = 0;i<ngroups;++i) {
h_sum[i]=0.0f;
}
double d_A, d_sum;
cudaMalloc(&d_A, groupsz sizeof(double));
cudaMalloc(&d_sum, ngroups sizeof(double));
cudaMemcpy(d_A, h_A, groupsz* sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_sum, h_sum, ngroups* sizeof(double), cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float elapsedTime1=0.0f;
// GPU sum
cudaEventRecord(start, 0);
blockReduction<<<ngroups, groupsz>>> (d_sum,d_A);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime1 , start, stop);
printf(“\n\n T1: Measured Time for sum of a sub-image = %f ms\n\n”, elapsedTime1);
cudaMemcpy(h_sum,d_sum, ngroups* sizeof(double), cudaMemcpyDeviceToHost);
for (i = 0;i<ngroups;++i) {
printf(“\n h_sum[%d]=%0.15f \n”,i,h_sum[i]);
}
return 0;
}