# Error optimization

I coded the math function called axpy and I have compared the timing of my version and the version of cublas. Doing and optimizing it I have seen that the function is slower (3 times) when I compiled with O3 or O2 than when I compiled with O1 or O0. This optimization it’s for -ptxa compiler. So I will show u the code, the timing with 2 version the compile lines and the ptxa code

I think there is an error, what do u think??

axpy code:

``````...

template <unsigned int oper, unsigned int res, unsigned int numblock>

__global__ void My_Daxpy (TIPO alpha, TIPO *vec1, TIPO *vec2){

__shared__ long int inc_bloc;

long int pos;

TIPO *ptr_vec;

if (numblock > MAX_BLOCK_AXE){

inc_bloc = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x;

}

else{

inc_bloc = blockIdx.x * blockDim.x;

}

if(res == VECT1){

ptr_vec = vec1 + pos;

*ptr_vec = *ptr_vec * alpha + vec2[pos];

//      vec1[pos] = vec1[pos] * alpha + vec2[pos];

}

else if(res == VECT2){

ptr_vec = vec2 + pos;

*ptr_vec = vec1[pos] * alpha + *ptr_vec;

//      vec2[pos] = vec1[pos] * alpha + vec2[pos];

}

else{

printf ("error: opcion de resultado no valida %d \n", res);

}

}

else if (oper == SUB){

if (res == VECT1){

ptr_vec = vec1 + pos;

*ptr_vec = *ptr_vec * alpha - vec2[pos];

//      vec1[pos] = vec1[pos] * alpha - vec2[pos];

}

else if(res == VECT2){

ptr_vec = vec2 + pos;

*ptr_vec = vec1[pos] * alpha - *ptr_vec;

//      vec2[pos] = vec1[pos] * alpha- vec2[pos];

}

else{

printf ("error: opcion de resultado no valida %d \n", res);

}

}

else{

printf ("error: opcion de operacion no valida %d \n", oper);

}

}

...
``````

ptxa vO2 code:

``````.entry _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_ (

.param .f64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__alpha,

.param .u64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec1,

.param .u64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec2)

{

.reg .u32 %r<19>;

.reg .u64 %rd<13>;

.reg .f64 %fd<6>;

.reg .pred %p<3>;

.shared .s64 __cuda_local_var_34816_36_non_const_inc_bloc;

// __cuda_local_var_34818_12_non_const_pos = 16

.loc    17      115     0

\$LDWbegin__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_:

mov.u32         %r1, %tid.x;

mov.u32         %r2, %tid.y;

mov.u32         %r3, 0;

set.eq.u32.u32  %r4, %r2, %r3;

neg.s32         %r5, %r4;

mov.u32         %r6, 0;

set.eq.u32.u32  %r7, %r1, %r6;

neg.s32         %r8, %r7;

and.b32         %r9, %r5, %r8;

mov.u32         %r10, 0;

setp.eq.s32     %p1, %r9, %r10;

@%p1 bra        \$Lt_0_5634;

.loc    17      123     0

mov.u32         %r11, %ntid.x;

mov.u32         %r12, %ctaid.x;

mov.u32         %r13, %nctaid.x;

mov.u32         %r14, %ctaid.y;

mul.lo.u32      %r15, %r13, %r14;

mul.lo.u32      %r17, %r11, %r16;

cvt.u64.u32     %rd1, %r17;

st.shared.s64   [__cuda_local_var_34816_36_non_const_inc_bloc], %rd1;

\$Lt_0_5634:

.loc    17      129     0

bar.sync        0;

.loc    17      131     0

ld.shared.s64   %rd2, [__cuda_local_var_34816_36_non_const_inc_bloc];

cvt.u64.u32     %rd3, %r1;

mov.s64         %rd5, %rd4;

.loc    17      141     0

mov.s64         %rd6, %rd5;

mul.lo.u64      %rd7, %rd6, 8;

ld.param.u64    %rd8, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec2];

ldu.global.f64  %fd1, [%rd9+0];

ld.param.u64    %rd10, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec1];

ldu.global.f64  %fd2, [%rd11+0];

ld.param.f64    %fd3, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__alpha];

st.global.f64   [%rd9+0], %fd4;

.loc    17      166     0

exit;

\$LDWend__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_:

} // _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_

...

...

...

code for sm_21

Function : _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_

/*0000*/     /*0x00005de428004404*/     MOV R1, c [0x1] [0x100];

/*0008*/     /*0x84001c042c000000*/     S2R R0, SR_Tid_X;

/*0010*/     /*0xfc01dc03190e0000*/     ISETP.EQ.U32.AND P0, pt, R0, RZ, pt;

/*0018*/     /*0x88009c042c000000*/     S2R R2, SR_Tid_Y;

/*0020*/     /*0xfc21dc0319000000*/     ISETP.EQ.U32.AND P0, pt, R2, RZ, P0;

/*0028*/     /*0xc000a1e740000000*/     @!P0 BRA.U 0x60;

/*0030*/     /*0x940080042c000000*/     @P0 S2R R2, SR_CTAid_X;

/*0038*/     /*0x9800c0042c000000*/     @P0 S2R R3, SR_CTAid_Y;

/*0040*/     /*0x5030800320044000*/     @P0 IMAD.U32.U32 R2, R3,c[0x0][0x14],R2;

/*0048*/     /*0xfc00c1e428000000*/     @P0 MOV R3, RZ;

/*0050*/     /*0x2020800350004000*/     @P0 IMUL.U32.U32 R2, R2, c [0x0] [0x8];

/*0058*/     /*0x03f080a5c9000000*/     @P0 STS.64 [0x0], R2;

/*0060*/     /*0xffffdc0450ee0000*/     BAR.RED.POPC RZ, RZ;

/*0068*/     /*0x20015de218000000*/     MOV32I R5, 0x8;

/*0070*/     /*0x03f09ca5c1000000*/     LDS.64 R2, [0x0];

/*0078*/     /*0x00201c0348010000*/     IADD R0.CC, R2, R0;

/*0080*/     /*0x74011c035800c000*/     SHR.U32.W R4, R0, 0x1d;

/*0088*/     /*0xfc309c4348000000*/     IADD.X R2, R3, RZ;

/*0090*/     /*0xa0021c03200b8000*/     IMAD.U32.U32 R8.CC, R0,R5,c[0x0] [0x28];

/*0098*/     /*0x1020dc6340000000*/     ISCADD R3, R2, R4, 0x3;

/*00a0*/     /*0xb0325c4348004000*/     IADD.X R9, R3, c [0x0] [0x2c];

/*00a8*/     /*0x83f29c8614000000*/     LDC R10, c [0x0] [0x20];

/*00b0*/     /*0xc0009c03200b8000*/     IMAD.U32.U32 R2.CC, R0, R5,c[0x0][0x30];

/*00b8*/     /*0xd030dc4348004000*/     IADD.X R3, R3, c [0x0] [0x34];

/*00c0*/     /*0x00811ca58c000000*/     LDU.E.64 R4, [R8];

/*00c8*/     /*0x00219ca58c000000*/     LDU.E.64 R6, [R2];

/*00d0*/     /*0x93f2dc8614000000*/     LDC R11, c [0x0] [0x24];

/*00d8*/     /*0x28411c01200c0000*/     DFMA R4, R4, R10, R6;

/*00e0*/     /*0x00211ca594000000*/     ST.E.64 [R2], R4;

/*00e8*/     /*0x00001de780000000*/     EXIT;

......................................................
``````

ptxa vO1 code:

``````.entry _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_ (

.param .f64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__alpha,

.param .u64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec1,

.param .u64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec2)

{

.reg .u32 %r<19>;

.reg .u64 %rd<18>;

.reg .f64 %fd<6>;

.reg .pred %p<3>;

.shared .s64 __cuda_local_var_34816_36_non_const_inc_bloc;

.local .align 8 .b8 __cuda___cuda__temp__valist_array_41_328[4];

.param .u64 __cudaparma1_vprintf;

.param .u64 __cudaparma2_vprintf;

// __cuda_local_var_34818_12_non_const_pos = 16

// __cuda_local_var_34819_11_non_const_ptr_vec = 24

// _temp__valist_args_42 = 40

.loc    17      115     0

\$LDWbegin__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_:

.loc    17      122     0

mov.u32         %r1, %tid.x;

mov.u32         %r2, 0;

set.eq.u32.u32  %r3, %r1, %r2;

neg.s32         %r4, %r3;

mov.u32         %r5, %tid.y;

mov.u32         %r6, 0;

set.eq.u32.u32  %r7, %r5, %r6;

neg.s32         %r8, %r7;

and.b32         %r9, %r4, %r8;

mov.u32         %r10, 0;

setp.eq.s32     %p1, %r9, %r10;

@%p1 bra        \$L_0_5890;

.loc    17      123     0

mov.u32         %r11, %ntid.x;

mov.u32         %r12, %ctaid.x;

mov.u32         %r13, %nctaid.x;

mov.u32         %r14, %ctaid.y;

mul.lo.u32      %r15, %r13, %r14;

mul.lo.u32      %r17, %r11, %r16;

cvt.u64.u32     %rd1, %r17;

st.shared.s64   [__cuda_local_var_34816_36_non_const_inc_bloc], %rd1;

\$L_0_5890:

\$L_0_5122:

.loc    17      129     0

bar.sync        0;

.loc    17      131     0

ld.shared.s64   %rd2, [__cuda_local_var_34816_36_non_const_inc_bloc];

cvt.u64.u32     %rd3, %tid.x;

mov.s64         %rd5, %rd4;

\$L_0_6914:

.loc    17      140     0

ld.param.u64    %rd6, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec2];

mov.s64         %rd7, %rd5;

mul.lo.u64      %rd8, %rd7, 8;

mov.s64         %rd10, %rd9;

.loc    17      141     0

mov.s64         %rd11, %rd10;

ld.global.f64   %fd1, [%rd11+0];

ld.param.u64    %rd12, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec1];

mov.s64         %rd13, %rd5;

mul.lo.u64      %rd14, %rd13, 8;

ld.global.f64   %fd2, [%rd15+0];

ld.param.f64    %fd3, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__alpha];

mov.s64         %rd16, %rd10;

st.global.f64   [%rd16+0], %fd4;

\$L_0_7682:

\$L_0_6146:

.loc    17      166     0

exit;

\$LDWend__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_:

} // _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_

...

...

...

code for sm_21

Function : _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_

/*0000*/     /*0x00005de428004404*/     MOV R1, c [0x1] [0x100];

/*0008*/     /*0x00001de440000000*/     NOP CC.T;

/*0010*/     /*0x20105d034800c000*/     IADD R1, R1, -0x8;

/*0018*/     /*0x88009c042c000000*/     S2R R2, SR_Tid_Y;

/*0020*/     /*0xfc21dc03190e0000*/     ISETP.EQ.U32.AND P0, pt, R2, RZ, pt;

/*0028*/     /*0x84001c042c000000*/     S2R R0, SR_Tid_X;

/*0030*/     /*0xfc01dc0319000000*/     ISETP.EQ.U32.AND P0, pt, R0, RZ, P0;

/*0038*/     /*0xc000a1e740000000*/     @!P0 BRA.U 0x70;

/*0040*/     /*0xfc00c1e428000000*/     @P0 MOV R3, RZ;

/*0048*/     /*0x940000042c000000*/     @P0 S2R R0, SR_CTAid_X;

/*0050*/     /*0x980080042c000000*/     @P0 S2R R2, SR_CTAid_Y;

/*0058*/     /*0x5020000320004000*/     @P0 IMAD.U32.U32 R0, R2, c[0x0][0x14],R0;

/*0060*/     /*0x2000800350004000*/     @P0 IMUL.U32.U32 R2, R0, c [0x0] [0x8];

/*0068*/     /*0x03f080a5c9000000*/     @P0 STS.64 [0x0], R2;

/*0070*/     /*0xffffdc0450ee0000*/     BAR.RED.POPC RZ, RZ;

/*0078*/     /*0x03f09ca5c1000000*/     LDS.64 R2, [0x0];

/*0080*/     /*0x20015de218000000*/     MOV32I R5, 0x8;

/*0088*/     /*0x84001c042c000000*/     S2R R0, SR_Tid_X;

/*0090*/     /*0x00201c0348010000*/     IADD R0.CC, R2, R0;

/*0098*/     /*0x83f29c8614000000*/     LDC R10, c [0x0] [0x20];

/*00a0*/     /*0x74011c035800c000*/     SHR.U32.W R4, R0, 0x1d;

/*00a8*/     /*0xfc309c4348000000*/     IADD.X R2, R3, RZ;

/*00b0*/     /*0xa0021c03200b8000*/     IMAD.U32.U32 R8.CC, R0, R5,c[0x0] [0x28];

/*00b8*/     /*0x1020dc6340000000*/     ISCADD R3, R2, R4, 0x3;

/*00c0*/     /*0xb0325c4348004000*/     IADD.X R9, R3, c [0x0] [0x2c];

/*00c8*/     /*0x93f2dc8614000000*/     LDC R11, c [0x0] [0x24];

/*00d0*/     /*0xc0009c03200b8000*/     IMAD.U32.U32 R2.CC, R0, R5, c[0x0][0x30];

/*00d8*/     /*0xd030dc4348004000*/     IADD.X R3, R3, c [0x0] [0x34];

/*00e0*/     /*0x00811ca584000000*/     LD.E.64 R4, [R8];

/*00e8*/     /*0x00219ca584000000*/     LD.E.64 R6, [R2];

/*00f0*/     /*0x28411c01200c0000*/     DFMA R4, R4, R10, R6;

/*00f8*/     /*0x00211ca594000000*/     ST.E.64 [R2], R4;

/*0100*/     /*0x00001de780000000*/     EXIT;

......................................................
``````

compiling line & timing version O2

``````>>nvcc -Xcompiler -Wall -arch=sm_21 daxpy.c -lcublas -o axpy_02 -x cu -Xopencc -O2

>>./axpy_02

::::::::::Result CUBLAS::::::::::

Ejecucion: OK

Tiempo: [0:0:2-40] [hh:mm:sec-msec]

::::::::::::::::::::::::::::::::::::

:::::::::::Result CUDA:::::::::::

Ejecucion: OK

Tiempo: [0:0:8-166] [hh:mm:sec-msec]

::::::::::::::::::::::::::::::::::::
``````

compiling line & timing version O1

``````>>nvcc -Xcompiler -Wall -arch=sm_21 daxpy.c -lcublas -o axpy_01 -x cu -Xopencc -O1

>>./axpy_02

Ejecucion: OK

Time: [0:0:2-57] [hh:mm:sec-msec]

::::::::::::::::::::::::::::::::::::