I coded the math function called axpy and I have compared the timing of my version and the version of cublas. Doing and optimizing it I have seen that the function is slower (3 times) when I compiled with O3 or O2 than when I compiled with O1 or O0. This optimization it’s for -ptxa compiler. So I will show u the code, the timing with 2 version the compile lines and the ptxa code
I think there is an error, what do u think??
axpy code:
...
template <unsigned int oper, unsigned int res, unsigned int numblock>
__global__ void My_Daxpy (TIPO alpha, TIPO *vec1, TIPO *vec2){
__shared__ long int inc_bloc;
long int pos;
TIPO *ptr_vec;
if (numblock > MAX_BLOCK_AXE){
if (threadIdx.x == 0 && threadIdx.y == 0)
inc_bloc = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x;
}
else{
if (threadIdx.x == 0)
inc_bloc = blockIdx.x * blockDim.x;
}
__syncthreads();
pos = (inc_bloc + threadIdx.x);
if (oper == ADD){
if(res == VECT1){
ptr_vec = vec1 + pos;
*ptr_vec = *ptr_vec * alpha + vec2[pos];
// vec1[pos] = vec1[pos] * alpha + vec2[pos];
}
else if(res == VECT2){
ptr_vec = vec2 + pos;
*ptr_vec = vec1[pos] * alpha + *ptr_vec;
// vec2[pos] = vec1[pos] * alpha + vec2[pos];
}
else{
printf ("error: opcion de resultado no valida %d \n", res);
}
}
else if (oper == SUB){
if (res == VECT1){
ptr_vec = vec1 + pos;
*ptr_vec = *ptr_vec * alpha - vec2[pos];
// vec1[pos] = vec1[pos] * alpha - vec2[pos];
}
else if(res == VECT2){
ptr_vec = vec2 + pos;
*ptr_vec = vec1[pos] * alpha - *ptr_vec;
// vec2[pos] = vec1[pos] * alpha- vec2[pos];
}
else{
printf ("error: opcion de resultado no valida %d \n", res);
}
}
else{
printf ("error: opcion de operacion no valida %d \n", oper);
}
}
...
ptxa vO2 code:
.entry _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_ (
.param .f64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__alpha,
.param .u64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec1,
.param .u64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec2)
{
.reg .u32 %r<19>;
.reg .u64 %rd<13>;
.reg .f64 %fd<6>;
.reg .pred %p<3>;
.shared .s64 __cuda_local_var_34816_36_non_const_inc_bloc;
// __cuda_local_var_34818_12_non_const_pos = 16
.loc 17 115 0
$LDWbegin__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_:
mov.u32 %r1, %tid.x;
mov.u32 %r2, %tid.y;
mov.u32 %r3, 0;
set.eq.u32.u32 %r4, %r2, %r3;
neg.s32 %r5, %r4;
mov.u32 %r6, 0;
set.eq.u32.u32 %r7, %r1, %r6;
neg.s32 %r8, %r7;
and.b32 %r9, %r5, %r8;
mov.u32 %r10, 0;
setp.eq.s32 %p1, %r9, %r10;
@%p1 bra $Lt_0_5634;
.loc 17 123 0
mov.u32 %r11, %ntid.x;
mov.u32 %r12, %ctaid.x;
mov.u32 %r13, %nctaid.x;
mov.u32 %r14, %ctaid.y;
mul.lo.u32 %r15, %r13, %r14;
add.u32 %r16, %r12, %r15;
mul.lo.u32 %r17, %r11, %r16;
cvt.u64.u32 %rd1, %r17;
st.shared.s64 [__cuda_local_var_34816_36_non_const_inc_bloc], %rd1;
$Lt_0_5634:
.loc 17 129 0
bar.sync 0;
.loc 17 131 0
ld.shared.s64 %rd2, [__cuda_local_var_34816_36_non_const_inc_bloc];
cvt.u64.u32 %rd3, %r1;
add.s64 %rd4, %rd2, %rd3;
mov.s64 %rd5, %rd4;
.loc 17 141 0
mov.s64 %rd6, %rd5;
mul.lo.u64 %rd7, %rd6, 8;
ld.param.u64 %rd8, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec2];
add.u64 %rd9, %rd8, %rd7;
ldu.global.f64 %fd1, [%rd9+0];
ld.param.u64 %rd10, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec1];
add.u64 %rd11, %rd10, %rd7;
ldu.global.f64 %fd2, [%rd11+0];
ld.param.f64 %fd3, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__alpha];
mad.rn.f64 %fd4, %fd2, %fd3, %fd1;
st.global.f64 [%rd9+0], %fd4;
.loc 17 166 0
exit;
$LDWend__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_:
} // _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_
...
...
...
code for sm_21
Function : _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x84001c042c000000*/ S2R R0, SR_Tid_X;
/*0010*/ /*0xfc01dc03190e0000*/ ISETP.EQ.U32.AND P0, pt, R0, RZ, pt;
/*0018*/ /*0x88009c042c000000*/ S2R R2, SR_Tid_Y;
/*0020*/ /*0xfc21dc0319000000*/ ISETP.EQ.U32.AND P0, pt, R2, RZ, P0;
/*0028*/ /*0xc000a1e740000000*/ @!P0 BRA.U 0x60;
/*0030*/ /*0x940080042c000000*/ @P0 S2R R2, SR_CTAid_X;
/*0038*/ /*0x9800c0042c000000*/ @P0 S2R R3, SR_CTAid_Y;
/*0040*/ /*0x5030800320044000*/ @P0 IMAD.U32.U32 R2, R3,c[0x0][0x14],R2;
/*0048*/ /*0xfc00c1e428000000*/ @P0 MOV R3, RZ;
/*0050*/ /*0x2020800350004000*/ @P0 IMUL.U32.U32 R2, R2, c [0x0] [0x8];
/*0058*/ /*0x03f080a5c9000000*/ @P0 STS.64 [0x0], R2;
/*0060*/ /*0xffffdc0450ee0000*/ BAR.RED.POPC RZ, RZ;
/*0068*/ /*0x20015de218000000*/ MOV32I R5, 0x8;
/*0070*/ /*0x03f09ca5c1000000*/ LDS.64 R2, [0x0];
/*0078*/ /*0x00201c0348010000*/ IADD R0.CC, R2, R0;
/*0080*/ /*0x74011c035800c000*/ SHR.U32.W R4, R0, 0x1d;
/*0088*/ /*0xfc309c4348000000*/ IADD.X R2, R3, RZ;
/*0090*/ /*0xa0021c03200b8000*/ IMAD.U32.U32 R8.CC, R0,R5,c[0x0] [0x28];
/*0098*/ /*0x1020dc6340000000*/ ISCADD R3, R2, R4, 0x3;
/*00a0*/ /*0xb0325c4348004000*/ IADD.X R9, R3, c [0x0] [0x2c];
/*00a8*/ /*0x83f29c8614000000*/ LDC R10, c [0x0] [0x20];
/*00b0*/ /*0xc0009c03200b8000*/ IMAD.U32.U32 R2.CC, R0, R5,c[0x0][0x30];
/*00b8*/ /*0xd030dc4348004000*/ IADD.X R3, R3, c [0x0] [0x34];
/*00c0*/ /*0x00811ca58c000000*/ LDU.E.64 R4, [R8];
/*00c8*/ /*0x00219ca58c000000*/ LDU.E.64 R6, [R2];
/*00d0*/ /*0x93f2dc8614000000*/ LDC R11, c [0x0] [0x24];
/*00d8*/ /*0x28411c01200c0000*/ DFMA R4, R4, R10, R6;
/*00e0*/ /*0x00211ca594000000*/ ST.E.64 [R2], R4;
/*00e8*/ /*0x00001de780000000*/ EXIT;
......................................................
ptxa vO1 code:
.entry _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_ (
.param .f64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__alpha,
.param .u64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec1,
.param .u64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec2)
{
.reg .u32 %r<19>;
.reg .u64 %rd<18>;
.reg .f64 %fd<6>;
.reg .pred %p<3>;
.shared .s64 __cuda_local_var_34816_36_non_const_inc_bloc;
.local .align 8 .b8 __cuda___cuda__temp__valist_array_41_328[4];
.param .u64 __cudaparma1_vprintf;
.param .u64 __cudaparma2_vprintf;
// __cuda_local_var_34818_12_non_const_pos = 16
// __cuda_local_var_34819_11_non_const_ptr_vec = 24
// _temp__valist_args_42 = 40
.loc 17 115 0
$LDWbegin__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_:
.loc 17 122 0
mov.u32 %r1, %tid.x;
mov.u32 %r2, 0;
set.eq.u32.u32 %r3, %r1, %r2;
neg.s32 %r4, %r3;
mov.u32 %r5, %tid.y;
mov.u32 %r6, 0;
set.eq.u32.u32 %r7, %r5, %r6;
neg.s32 %r8, %r7;
and.b32 %r9, %r4, %r8;
mov.u32 %r10, 0;
setp.eq.s32 %p1, %r9, %r10;
@%p1 bra $L_0_5890;
.loc 17 123 0
mov.u32 %r11, %ntid.x;
mov.u32 %r12, %ctaid.x;
mov.u32 %r13, %nctaid.x;
mov.u32 %r14, %ctaid.y;
mul.lo.u32 %r15, %r13, %r14;
add.u32 %r16, %r12, %r15;
mul.lo.u32 %r17, %r11, %r16;
cvt.u64.u32 %rd1, %r17;
st.shared.s64 [__cuda_local_var_34816_36_non_const_inc_bloc], %rd1;
$L_0_5890:
$L_0_5122:
.loc 17 129 0
bar.sync 0;
.loc 17 131 0
ld.shared.s64 %rd2, [__cuda_local_var_34816_36_non_const_inc_bloc];
cvt.u64.u32 %rd3, %tid.x;
add.s64 %rd4, %rd2, %rd3;
mov.s64 %rd5, %rd4;
$L_0_6914:
.loc 17 140 0
ld.param.u64 %rd6, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec2];
mov.s64 %rd7, %rd5;
mul.lo.u64 %rd8, %rd7, 8;
add.u64 %rd9, %rd6, %rd8;
mov.s64 %rd10, %rd9;
.loc 17 141 0
mov.s64 %rd11, %rd10;
ld.global.f64 %fd1, [%rd11+0];
ld.param.u64 %rd12, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec1];
mov.s64 %rd13, %rd5;
mul.lo.u64 %rd14, %rd13, 8;
add.u64 %rd15, %rd12, %rd14;
ld.global.f64 %fd2, [%rd15+0];
ld.param.f64 %fd3, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__alpha];
mad.rn.f64 %fd4, %fd2, %fd3, %fd1;
mov.s64 %rd16, %rd10;
st.global.f64 [%rd16+0], %fd4;
$L_0_7682:
$L_0_6146:
.loc 17 166 0
exit;
$LDWend__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_:
} // _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_
...
...
...
code for sm_21
Function : _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x00001de440000000*/ NOP CC.T;
/*0010*/ /*0x20105d034800c000*/ IADD R1, R1, -0x8;
/*0018*/ /*0x88009c042c000000*/ S2R R2, SR_Tid_Y;
/*0020*/ /*0xfc21dc03190e0000*/ ISETP.EQ.U32.AND P0, pt, R2, RZ, pt;
/*0028*/ /*0x84001c042c000000*/ S2R R0, SR_Tid_X;
/*0030*/ /*0xfc01dc0319000000*/ ISETP.EQ.U32.AND P0, pt, R0, RZ, P0;
/*0038*/ /*0xc000a1e740000000*/ @!P0 BRA.U 0x70;
/*0040*/ /*0xfc00c1e428000000*/ @P0 MOV R3, RZ;
/*0048*/ /*0x940000042c000000*/ @P0 S2R R0, SR_CTAid_X;
/*0050*/ /*0x980080042c000000*/ @P0 S2R R2, SR_CTAid_Y;
/*0058*/ /*0x5020000320004000*/ @P0 IMAD.U32.U32 R0, R2, c[0x0][0x14],R0;
/*0060*/ /*0x2000800350004000*/ @P0 IMUL.U32.U32 R2, R0, c [0x0] [0x8];
/*0068*/ /*0x03f080a5c9000000*/ @P0 STS.64 [0x0], R2;
/*0070*/ /*0xffffdc0450ee0000*/ BAR.RED.POPC RZ, RZ;
/*0078*/ /*0x03f09ca5c1000000*/ LDS.64 R2, [0x0];
/*0080*/ /*0x20015de218000000*/ MOV32I R5, 0x8;
/*0088*/ /*0x84001c042c000000*/ S2R R0, SR_Tid_X;
/*0090*/ /*0x00201c0348010000*/ IADD R0.CC, R2, R0;
/*0098*/ /*0x83f29c8614000000*/ LDC R10, c [0x0] [0x20];
/*00a0*/ /*0x74011c035800c000*/ SHR.U32.W R4, R0, 0x1d;
/*00a8*/ /*0xfc309c4348000000*/ IADD.X R2, R3, RZ;
/*00b0*/ /*0xa0021c03200b8000*/ IMAD.U32.U32 R8.CC, R0, R5,c[0x0] [0x28];
/*00b8*/ /*0x1020dc6340000000*/ ISCADD R3, R2, R4, 0x3;
/*00c0*/ /*0xb0325c4348004000*/ IADD.X R9, R3, c [0x0] [0x2c];
/*00c8*/ /*0x93f2dc8614000000*/ LDC R11, c [0x0] [0x24];
/*00d0*/ /*0xc0009c03200b8000*/ IMAD.U32.U32 R2.CC, R0, R5, c[0x0][0x30];
/*00d8*/ /*0xd030dc4348004000*/ IADD.X R3, R3, c [0x0] [0x34];
/*00e0*/ /*0x00811ca584000000*/ LD.E.64 R4, [R8];
/*00e8*/ /*0x00219ca584000000*/ LD.E.64 R6, [R2];
/*00f0*/ /*0x28411c01200c0000*/ DFMA R4, R4, R10, R6;
/*00f8*/ /*0x00211ca594000000*/ ST.E.64 [R2], R4;
/*0100*/ /*0x00001de780000000*/ EXIT;
......................................................
compiling line & timing version O2
>>nvcc -Xcompiler -Wall -arch=sm_21 daxpy.c -lcublas -o axpy_02 -x cu -Xopencc -O2
>>./axpy_02
::::::::::Result CUBLAS::::::::::
Ejecucion: OK
Resultado: OK
Tiempo: [0:0:2-40] [hh:mm:sec-msec]
::::::::::::::::::::::::::::::::::::
:::::::::::Result CUDA:::::::::::
Ejecucion: OK
Resultado: OK
Tiempo: [0:0:8-166] [hh:mm:sec-msec]
::::::::::::::::::::::::::::::::::::
compiling line & timing version O1
>>nvcc -Xcompiler -Wall -arch=sm_21 daxpy.c -lcublas -o axpy_01 -x cu -Xopencc -O1
>>./axpy_02
::::::::::Resultado CUBLAS::::::::::
Ejecucion: OK
Resultado: OK
Time: [0:0:2-57] [hh:mm:sec-msec]
::::::::::::::::::::::::::::::::::::
:::::::::::Resultado CUDA:::::::::::
Ejecucion: OK
Resultado: OK
Time: [0:0:2-24] [hh:mm:sec-msec]
::::::::::::::::::::::::::::::::::::
How can I Know which flags are used in each optimization level??? becase I didn’t found any information about it in help message nvopencc