Hello,
I want to measure Pure FLOPS and used the code below (16 statements with1 add+mul is fastest). I get 220 GIGAFLOPS meaning 64% of Peak perf.
On a usual CPU I get with such a simple code almost100% peak perf.!
What is wrong ?:
//====================================
global void Kernel( int nx, int tend, float* f1)
{
// number of threads
int num_threads = blockDim.x;
// Thread index
int tx = threadIdx.x;
// Block index x
int bx = blockIdx.x;
// x-Index
int x = tx + bx*num_threads;
// Block index y = y-Index
int y = blockIdx.y;
int k = nx*y + x;
int l,o;
float tmp0=0;
float tmp1=0;
float tmp2=0;
float tmp3=0;
float tmp4=0;
float tmp5=0;
float tmp6=0;
float tmp7=0;
float tmp8=0;
float tmp9=0;
float tmp10=0;
float tmp11=0;
float tmp12=0;
float tmp13=0;
float tmp14=0;
float tmp15=0;
//float tmp16=0;
//float tmp17=0;
//float tmp18=0;
//float tmp19=0;
//float tmp20=0;
//float tmp21=0;
//float tmp22=0;
//float tmp23=0;
//float tmp24=0;
//float tmp25=0;
//float tmp26=0;
//float tmp27=0;
//float tmp28=0;
//float tmp29=0;
//float tmp30=0;
//float tmp31=0;
float c0=1*0.01;
float c1=2*0.01;
float c2=3*0.01;
float c3=4*0.01;
float c4=5*0.01;
float c5=6*0.01;
float c6=7*0.01;
float c7=8*0.01;
float c8=9*0.01;
float c9=10*0.01;
float c10=11*0.01;
float c11=12*0.01;
float c12=13*0.01;
float c13=14*0.01;
float c14=15*0.01;
float c15=16*0.01;
//float c16=16*0.01;
//float c17=17*0.01;
//float c18=18*0.01;
//float c19=19*0.01;
//float c20=20*0.01;
//float c21=21*0.01;
//float c22=22*0.01;
//float c23=23*0.01;
//float c24=24*0.01;
//float c25=25*0.01;
//float c26=26*0.01;
//float c27=27*0.01;
//float c28=28*0.01;
//float c29=29*0.01;
//float c30=30*0.01;
//float c31=31*0.01;
for(l=0; l<tend ; l++){
for(o=0; o<tend ; o++){
tmp0 = tmp0*c0+c0;
tmp1 = tmp1*c1+c1;
tmp2 = tmp2*c2+c2;
tmp3 = tmp3*c3+c3;
tmp4 = tmp4*c4+c4;
tmp5 = tmp5*c5+c5;
tmp6 = tmp6*c6+c6;
tmp7 = tmp7*c7+c7;
tmp8 = tmp8*c8+c8;
tmp9 = tmp9*c9+c9;
tmp10 = tmp10*c10+c10;
tmp11 = tmp11*c11+c11;
tmp12 = tmp12*c12+c12;
tmp13 = tmp13*c13+c13;
tmp14 = tmp14*c14+c14;
tmp15 = tmp15*c15+c15;
//tmp16 = tmp16*c16+c16;
//tmp17 = tmp17*c17+c17;
//tmp18 = tmp18*c18+c18;
//tmp19 = tmp19*c19+c19;
//tmp20 = tmp20*c20+c20;
//tmp21 = tmp21*c21+c21;
//tmp22 = tmp22*c22+c22;
//tmp23 = tmp23*c23+c23;
//tmp24 = tmp24*c24+c24;
//tmp25 = tmp25*c25+c25;
//tmp26 = tmp26*c26+c26;
//tmp27 = tmp27*c27+c27;
//tmp28 = tmp28*c28+c28;
//tmp29 = tmp29*c29+c29;
//tmp30 = tmp30*c30+c30;
//tmp31 = tmp31*c31+c31;
}
}
f1[k]= tmp0+tmp1+tmp2+tmp3+tmp4+tmp5+tmp6+tmp7+
tmp8+tmp9+tmp10+tmp11+tmp12+tmp13+tmp14+tmp15; //+
//tmp16+tmp17+tmp18+tmp19+tmp20+tmp21+tmp22+tmp23+
//tmp24+tmp25+tmp26+tmp27+tmp28+tmp29+tmp30+tmp31;
}
//============================
main function:
…
CUT_SAFE_CALL(cutStartTimer(timer));
Kernel<<< grid, threads >>> ( nx, tend, f0);
CUT_SAFE_CALL(cutStopTimer(timer));
//printf("Processing time: %f (ms)\n", cutGetTimerValue(timer));
ftime = cutGetTimerValue(timer);
ftime /= 1000.0;
flops = pow(1.0*tend,2.0)*(2*16.0*nx*ny) / (1.0E9*ftime);
printf("Processing time: %f sec\n", ftime);
printf("GigaFlops: %f\n", flops);
…
//====================================