Error optimization

I coded the math function called axpy and I have compared the timing of my version and the version of cublas. Doing and optimizing it I have seen that the function is slower (3 times) when I compiled with O3 or O2 than when I compiled with O1 or O0. This optimization it’s for -ptxa compiler. So I will show u the code, the timing with 2 version the compile lines and the ptxa code

I think there is an error, what do u think??

axpy code:

...

template <unsigned int oper, unsigned int res, unsigned int numblock>

__global__ void My_Daxpy (TIPO alpha, TIPO *vec1, TIPO *vec2){

  __shared__ long int inc_bloc;

  long int pos;

  TIPO *ptr_vec;

if (numblock > MAX_BLOCK_AXE){

    if (threadIdx.x == 0 && threadIdx.y == 0)

      inc_bloc = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x;

  }

  else{

    if (threadIdx.x == 0)

      inc_bloc = blockIdx.x * blockDim.x;

  }

  __syncthreads();

pos = (inc_bloc + threadIdx.x);

if (oper == ADD){

    if(res == VECT1){

      ptr_vec = vec1 + pos;

      *ptr_vec = *ptr_vec * alpha + vec2[pos];

      //      vec1[pos] = vec1[pos] * alpha + vec2[pos];                                                                        

    }

    else if(res == VECT2){

      ptr_vec = vec2 + pos;

      *ptr_vec = vec1[pos] * alpha + *ptr_vec;

      //      vec2[pos] = vec1[pos] * alpha + vec2[pos];                                                                                                       

    }

    else{

      printf ("error: opcion de resultado no valida %d \n", res);

    }

  }

  else if (oper == SUB){

    if (res == VECT1){

      ptr_vec = vec1 + pos;

      *ptr_vec = *ptr_vec * alpha - vec2[pos];

      //      vec1[pos] = vec1[pos] * alpha - vec2[pos];                                                                                                   

    }

    else if(res == VECT2){

      ptr_vec = vec2 + pos;

      *ptr_vec = vec1[pos] * alpha - *ptr_vec;

      //      vec2[pos] = vec1[pos] * alpha- vec2[pos];                                                                                                           

    }

    else{

      printf ("error: opcion de resultado no valida %d \n", res);

    }

  }

  else{

    printf ("error: opcion de operacion no valida %d \n", oper);

    }

}

...

ptxa vO2 code:

.entry _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_ (

                .param .f64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__alpha,

                .param .u64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec1,

                .param .u64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec2)

        {

        .reg .u32 %r<19>;

        .reg .u64 %rd<13>;

        .reg .f64 %fd<6>;

        .reg .pred %p<3>;

        .shared .s64 __cuda_local_var_34816_36_non_const_inc_bloc;

        // __cuda_local_var_34818_12_non_const_pos = 16

        .loc    17      115     0

$LDWbegin__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_:

        mov.u32         %r1, %tid.x;

        mov.u32         %r2, %tid.y;

        mov.u32         %r3, 0;

        set.eq.u32.u32  %r4, %r2, %r3;

        neg.s32         %r5, %r4;

        mov.u32         %r6, 0;

        set.eq.u32.u32  %r7, %r1, %r6;

        neg.s32         %r8, %r7;

        and.b32         %r9, %r5, %r8;

        mov.u32         %r10, 0;

        setp.eq.s32     %p1, %r9, %r10;

        @%p1 bra        $Lt_0_5634;

        .loc    17      123     0

        mov.u32         %r11, %ntid.x;

        mov.u32         %r12, %ctaid.x;

        mov.u32         %r13, %nctaid.x;

        mov.u32         %r14, %ctaid.y;

        mul.lo.u32      %r15, %r13, %r14;

        add.u32         %r16, %r12, %r15;

        mul.lo.u32      %r17, %r11, %r16;

        cvt.u64.u32     %rd1, %r17;

        st.shared.s64   [__cuda_local_var_34816_36_non_const_inc_bloc], %rd1;

$Lt_0_5634:

        .loc    17      129     0

        bar.sync        0;

        .loc    17      131     0

        ld.shared.s64   %rd2, [__cuda_local_var_34816_36_non_const_inc_bloc];

        cvt.u64.u32     %rd3, %r1;

        add.s64         %rd4, %rd2, %rd3;

        mov.s64         %rd5, %rd4;

        .loc    17      141     0

        mov.s64         %rd6, %rd5;

        mul.lo.u64      %rd7, %rd6, 8;

        ld.param.u64    %rd8, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec2];

        add.u64         %rd9, %rd8, %rd7;

        ldu.global.f64  %fd1, [%rd9+0];

        ld.param.u64    %rd10, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec1];

        add.u64         %rd11, %rd10, %rd7;

        ldu.global.f64  %fd2, [%rd11+0];

        ld.param.f64    %fd3, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__alpha];

        mad.rn.f64      %fd4, %fd2, %fd3, %fd1;

        st.global.f64   [%rd9+0], %fd4;

        .loc    17      166     0

        exit;

$LDWend__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_:

        } // _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_

...

...

...

code for sm_21

                Function : _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_

        /*0000*/     /*0x00005de428004404*/     MOV R1, c [0x1] [0x100];

        /*0008*/     /*0x84001c042c000000*/     S2R R0, SR_Tid_X;

        /*0010*/     /*0xfc01dc03190e0000*/     ISETP.EQ.U32.AND P0, pt, R0, RZ, pt;

        /*0018*/     /*0x88009c042c000000*/     S2R R2, SR_Tid_Y;

        /*0020*/     /*0xfc21dc0319000000*/     ISETP.EQ.U32.AND P0, pt, R2, RZ, P0;

        /*0028*/     /*0xc000a1e740000000*/     @!P0 BRA.U 0x60;

        /*0030*/     /*0x940080042c000000*/     @P0 S2R R2, SR_CTAid_X;

        /*0038*/     /*0x9800c0042c000000*/     @P0 S2R R3, SR_CTAid_Y;

        /*0040*/     /*0x5030800320044000*/     @P0 IMAD.U32.U32 R2, R3,c[0x0][0x14],R2;

        /*0048*/     /*0xfc00c1e428000000*/     @P0 MOV R3, RZ;

        /*0050*/     /*0x2020800350004000*/     @P0 IMUL.U32.U32 R2, R2, c [0x0] [0x8];

        /*0058*/     /*0x03f080a5c9000000*/     @P0 STS.64 [0x0], R2;

        /*0060*/     /*0xffffdc0450ee0000*/     BAR.RED.POPC RZ, RZ;

        /*0068*/     /*0x20015de218000000*/     MOV32I R5, 0x8;

        /*0070*/     /*0x03f09ca5c1000000*/     LDS.64 R2, [0x0];

        /*0078*/     /*0x00201c0348010000*/     IADD R0.CC, R2, R0;

        /*0080*/     /*0x74011c035800c000*/     SHR.U32.W R4, R0, 0x1d;

        /*0088*/     /*0xfc309c4348000000*/     IADD.X R2, R3, RZ;

        /*0090*/     /*0xa0021c03200b8000*/     IMAD.U32.U32 R8.CC, R0,R5,c[0x0] [0x28];

        /*0098*/     /*0x1020dc6340000000*/     ISCADD R3, R2, R4, 0x3;

        /*00a0*/     /*0xb0325c4348004000*/     IADD.X R9, R3, c [0x0] [0x2c];

        /*00a8*/     /*0x83f29c8614000000*/     LDC R10, c [0x0] [0x20];

        /*00b0*/     /*0xc0009c03200b8000*/     IMAD.U32.U32 R2.CC, R0, R5,c[0x0][0x30];

        /*00b8*/     /*0xd030dc4348004000*/     IADD.X R3, R3, c [0x0] [0x34];

        /*00c0*/     /*0x00811ca58c000000*/     LDU.E.64 R4, [R8];

        /*00c8*/     /*0x00219ca58c000000*/     LDU.E.64 R6, [R2];

        /*00d0*/     /*0x93f2dc8614000000*/     LDC R11, c [0x0] [0x24];

        /*00d8*/     /*0x28411c01200c0000*/     DFMA R4, R4, R10, R6;

        /*00e0*/     /*0x00211ca594000000*/     ST.E.64 [R2], R4;

        /*00e8*/     /*0x00001de780000000*/     EXIT;

                ......................................................

ptxa vO1 code:

.entry _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_ (

                .param .f64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__alpha,

                .param .u64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec1,

                .param .u64 __cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec2)

        {

        .reg .u32 %r<19>;

        .reg .u64 %rd<18>;

        .reg .f64 %fd<6>;

        .reg .pred %p<3>;

        .shared .s64 __cuda_local_var_34816_36_non_const_inc_bloc;

        .local .align 8 .b8 __cuda___cuda__temp__valist_array_41_328[4];

        .param .u64 __cudaparma1_vprintf;

        .param .u64 __cudaparma2_vprintf;

        // __cuda_local_var_34818_12_non_const_pos = 16

        // __cuda_local_var_34819_11_non_const_ptr_vec = 24

        // _temp__valist_args_42 = 40

        .loc    17      115     0

$LDWbegin__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_:

        .loc    17      122     0

        mov.u32         %r1, %tid.x;

        mov.u32         %r2, 0;

        set.eq.u32.u32  %r3, %r1, %r2;

        neg.s32         %r4, %r3;

        mov.u32         %r5, %tid.y;

        mov.u32         %r6, 0;

        set.eq.u32.u32  %r7, %r5, %r6;

        neg.s32         %r8, %r7;

        and.b32         %r9, %r4, %r8;

        mov.u32         %r10, 0;

        setp.eq.s32     %p1, %r9, %r10;

        @%p1 bra        $L_0_5890;

        .loc    17      123     0

        mov.u32         %r11, %ntid.x;

        mov.u32         %r12, %ctaid.x;

        mov.u32         %r13, %nctaid.x;

        mov.u32         %r14, %ctaid.y;

        mul.lo.u32      %r15, %r13, %r14;

        add.u32         %r16, %r12, %r15;

        mul.lo.u32      %r17, %r11, %r16;

        cvt.u64.u32     %rd1, %r17;

        st.shared.s64   [__cuda_local_var_34816_36_non_const_inc_bloc], %rd1;

$L_0_5890:

$L_0_5122:

       .loc    17      129     0

        bar.sync        0;

        .loc    17      131     0

        ld.shared.s64   %rd2, [__cuda_local_var_34816_36_non_const_inc_bloc];

        cvt.u64.u32     %rd3, %tid.x;

        add.s64         %rd4, %rd2, %rd3;

        mov.s64         %rd5, %rd4;

$L_0_6914:

        .loc    17      140     0

        ld.param.u64    %rd6, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec2];

        mov.s64         %rd7, %rd5;

        mul.lo.u64      %rd8, %rd7, 8;

        add.u64         %rd9, %rd6, %rd8;

        mov.s64         %rd10, %rd9;

        .loc    17      141     0

        mov.s64         %rd11, %rd10;

        ld.global.f64   %fd1, [%rd11+0];

        ld.param.u64    %rd12, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__vec1];

        mov.s64         %rd13, %rd5;

        mul.lo.u64      %rd14, %rd13, 8;

        add.u64         %rd15, %rd12, %rd14;

        ld.global.f64   %fd2, [%rd15+0];

        ld.param.f64    %fd3, [__cudaparm__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0__alpha];

        mad.rn.f64      %fd4, %fd2, %fd3, %fd1;

        mov.s64         %rd16, %rd10;

        st.global.f64   [%rd16+0], %fd4;

$L_0_7682:

$L_0_6146:

.loc    17      166     0

        exit;

$LDWend__Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_:

        } // _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_

...

...

...

code for sm_21

                Function : _Z8My_DaxpyILj1ELj2ELj240000EEvdPdS0_

        /*0000*/     /*0x00005de428004404*/     MOV R1, c [0x1] [0x100];

        /*0008*/     /*0x00001de440000000*/     NOP CC.T;

        /*0010*/     /*0x20105d034800c000*/     IADD R1, R1, -0x8;

        /*0018*/     /*0x88009c042c000000*/     S2R R2, SR_Tid_Y;

        /*0020*/     /*0xfc21dc03190e0000*/     ISETP.EQ.U32.AND P0, pt, R2, RZ, pt;

        /*0028*/     /*0x84001c042c000000*/     S2R R0, SR_Tid_X;

        /*0030*/     /*0xfc01dc0319000000*/     ISETP.EQ.U32.AND P0, pt, R0, RZ, P0;

        /*0038*/     /*0xc000a1e740000000*/     @!P0 BRA.U 0x70;

        /*0040*/     /*0xfc00c1e428000000*/     @P0 MOV R3, RZ;

        /*0048*/     /*0x940000042c000000*/     @P0 S2R R0, SR_CTAid_X;

        /*0050*/     /*0x980080042c000000*/     @P0 S2R R2, SR_CTAid_Y;

        /*0058*/     /*0x5020000320004000*/     @P0 IMAD.U32.U32 R0, R2, c[0x0][0x14],R0;

        /*0060*/     /*0x2000800350004000*/     @P0 IMUL.U32.U32 R2, R0, c [0x0] [0x8];

        /*0068*/     /*0x03f080a5c9000000*/     @P0 STS.64 [0x0], R2;

        /*0070*/     /*0xffffdc0450ee0000*/     BAR.RED.POPC RZ, RZ;

        /*0078*/     /*0x03f09ca5c1000000*/     LDS.64 R2, [0x0];

        /*0080*/     /*0x20015de218000000*/     MOV32I R5, 0x8;

        /*0088*/     /*0x84001c042c000000*/     S2R R0, SR_Tid_X;

        /*0090*/     /*0x00201c0348010000*/     IADD R0.CC, R2, R0;

        /*0098*/     /*0x83f29c8614000000*/     LDC R10, c [0x0] [0x20];

        /*00a0*/     /*0x74011c035800c000*/     SHR.U32.W R4, R0, 0x1d;

        /*00a8*/     /*0xfc309c4348000000*/     IADD.X R2, R3, RZ;

        /*00b0*/     /*0xa0021c03200b8000*/     IMAD.U32.U32 R8.CC, R0, R5,c[0x0] [0x28];

        /*00b8*/     /*0x1020dc6340000000*/     ISCADD R3, R2, R4, 0x3;

        /*00c0*/     /*0xb0325c4348004000*/     IADD.X R9, R3, c [0x0] [0x2c];

        /*00c8*/     /*0x93f2dc8614000000*/     LDC R11, c [0x0] [0x24];

        /*00d0*/     /*0xc0009c03200b8000*/     IMAD.U32.U32 R2.CC, R0, R5, c[0x0][0x30];

        /*00d8*/     /*0xd030dc4348004000*/     IADD.X R3, R3, c [0x0] [0x34];

        /*00e0*/     /*0x00811ca584000000*/     LD.E.64 R4, [R8];

        /*00e8*/     /*0x00219ca584000000*/     LD.E.64 R6, [R2];

        /*00f0*/     /*0x28411c01200c0000*/     DFMA R4, R4, R10, R6;

        /*00f8*/     /*0x00211ca594000000*/     ST.E.64 [R2], R4;

        /*0100*/     /*0x00001de780000000*/     EXIT;

                ......................................................

compiling line & timing version O2

>>nvcc -Xcompiler -Wall -arch=sm_21 daxpy.c -lcublas -o axpy_02 -x cu -Xopencc -O2

>>./axpy_02

::::::::::Result CUBLAS::::::::::

Ejecucion: OK

Resultado: OK

Tiempo: [0:0:2-40] [hh:mm:sec-msec]

::::::::::::::::::::::::::::::::::::

:::::::::::Result CUDA:::::::::::

Ejecucion: OK

Resultado: OK

Tiempo: [0:0:8-166] [hh:mm:sec-msec]

::::::::::::::::::::::::::::::::::::

compiling line & timing version O1

>>nvcc -Xcompiler -Wall -arch=sm_21 daxpy.c -lcublas -o axpy_01 -x cu -Xopencc -O1

>>./axpy_02

::::::::::Resultado CUBLAS::::::::::

Ejecucion: OK

Resultado: OK

Time: [0:0:2-57] [hh:mm:sec-msec]

::::::::::::::::::::::::::::::::::::

:::::::::::Resultado CUDA:::::::::::

Ejecucion: OK

Resultado: OK

Time: [0:0:2-24] [hh:mm:sec-msec]

::::::::::::::::::::::::::::::::::::

How can I Know which flags are used in each optimization level??? becase I didn’t found any information about it in help message nvopencc

this bug seems to be fixed in SDK 4.1