Integer MAD instruction

Hello,
I am new to CUDA. After reading the PTX ISA doc I found there is a separate MAD instruction for integer values. But when I use something like this:
a = b * c + d in my C for CUDA kernel, it is translated to a sequence of multiply and add instructions in the PTX output. Is there a way to tell the compiler to translate this to a MAD instruction?

Hello,
I am new to CUDA. After reading the PTX ISA doc I found there is a separate MAD instruction for integer values. But when I use something like this:
a = b * c + d in my C for CUDA kernel, it is translated to a sequence of multiply and add instructions in the PTX output. Is there a way to tell the compiler to translate this to a MAD instruction?

Check the actual cubin for the real instructions the GPU executes. The ptxas intermediate output is further compiled to assembly, and the MADD optimization might be done there instead. You’ll need to use decuda or cuobjdump.

Check the actual cubin for the real instructions the GPU executes. The ptxas intermediate output is further compiled to assembly, and the MADD optimization might be done there instead. You’ll need to use decuda or cuobjdump.

You can use the inline asm directive, but it is not documented, not supported, and not recommended by the Nvidia folks.

[codebox]# cat hw.cu

#include <iostream>

__global__ void fun(int * mem)

{

    int a = 3;

    int b = 5;

    int d;

    asm("\

   mad.lo.s32 %0, %1, %2, %3;"

       : "=r"(d) : "r"(*mem), "r"(a), "r"(b) : );

    *mem = d;

}

int main()

{

    int h = 1;

    int * d;

    cudaMalloc(&d, sizeof(int));

    cudaMemcpy(d, &h, sizeof(int), cudaMemcpyHostToDevice);

    fun<<<1,1>>>(d);

    cudaThreadSynchronize();

    int rv = cudaGetLastError();

    cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Result = " << h << "\n";

    return 0;

}

# COMPILE AND BUILD WITH -KEEP OPTION …

cat hw.compute_20.ptx

    .version 2.2

    .target sm_20

    // compiled with C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\\bin/../open64/lib//be.exe

    // nvopencc 3.2 built on 2010-09-09

//-----------------------------------------------------------

    // Compiling hw.compute_20.cpp3.i (C:/Users/Ken/AppData/Local/Temp/ccBI#.a03380)

    //-----------------------------------------------------------

//-----------------------------------------------------------

    // Options:

    //-----------------------------------------------------------

    //  Target:ptx, ISA:sm_20, Endian:little, Pointer Size:32

    //  -O3 (Optimization level)

    //  -g0 (Debug level)

    //  -m2 (Report advisories)

    //-----------------------------------------------------------

/// (REMOVED .FILE LINES)

.entry _Z3funPi (

            .param .u32 __cudaparm__Z3funPi_mem)

    {

    .reg .u32 %r<11>;

    .loc    28      3       0

$LDWbegin__Z3funPi:

    .loc    28      8       0

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    ldu.global.s32  %r2, [%r1+0];

    mov.u32         %r3, %r2;

    mov.s32         %r4, 3;

    mov.u32         %r5, %r4;

    mov.s32         %r6, 5;

    mov.u32         %r7, %r6;

           mad.lo.s32 %r8, %r3, %r5, %r7;

    mov.s32         %r9, %r8;

    .loc    28      11      0

    st.global.s32   [%r1+0], %r9;

    .loc    28      12      0

    exit;

$LDWend__Z3funPi:

    } // _Z3funPi

ptxas -arch sm_20 hw.compute_20.ptx

nvdis elf.o

ELF File…

00000000: 2800440400005de4 mov b32 $r1 c1[0x100]

00000008: 2800400080009de4 mov b32 $r2 c0[0x20]

00000010: 180000001400dde2 mov b32 $r3 0x5

00000018: 8800000000201c85 ldu b32 $r0 g[$r2+0]

00000020: 2006c0000c001ca3 add $r0 mul s32 $r0 0x3 $r3 <<<<

00000028: 9000000000201c85 st b32 wb g[$r2+0] $r0

00000030: 8000000000001de7 exit

#[/codebox]

You can use the inline asm directive, but it is not documented, not supported, and not recommended by the Nvidia folks.

[codebox]# cat hw.cu

#include <iostream>

__global__ void fun(int * mem)

{

    int a = 3;

    int b = 5;

    int d;

    asm("\

   mad.lo.s32 %0, %1, %2, %3;"

       : "=r"(d) : "r"(*mem), "r"(a), "r"(b) : );

    *mem = d;

}

int main()

{

    int h = 1;

    int * d;

    cudaMalloc(&d, sizeof(int));

    cudaMemcpy(d, &h, sizeof(int), cudaMemcpyHostToDevice);

    fun<<<1,1>>>(d);

    cudaThreadSynchronize();

    int rv = cudaGetLastError();

    cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Result = " << h << "\n";

    return 0;

}

# COMPILE AND BUILD WITH -KEEP OPTION …

cat hw.compute_20.ptx

    .version 2.2

    .target sm_20

    // compiled with C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\\bin/../open64/lib//be.exe

    // nvopencc 3.2 built on 2010-09-09

//-----------------------------------------------------------

    // Compiling hw.compute_20.cpp3.i (C:/Users/Ken/AppData/Local/Temp/ccBI#.a03380)

    //-----------------------------------------------------------

//-----------------------------------------------------------

    // Options:

    //-----------------------------------------------------------

    //  Target:ptx, ISA:sm_20, Endian:little, Pointer Size:32

    //  -O3 (Optimization level)

    //  -g0 (Debug level)

    //  -m2 (Report advisories)

    //-----------------------------------------------------------

/// (REMOVED .FILE LINES)

.entry _Z3funPi (

            .param .u32 __cudaparm__Z3funPi_mem)

    {

    .reg .u32 %r<11>;

    .loc    28      3       0

$LDWbegin__Z3funPi:

    .loc    28      8       0

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    ldu.global.s32  %r2, [%r1+0];

    mov.u32         %r3, %r2;

    mov.s32         %r4, 3;

    mov.u32         %r5, %r4;

    mov.s32         %r6, 5;

    mov.u32         %r7, %r6;

           mad.lo.s32 %r8, %r3, %r5, %r7;

    mov.s32         %r9, %r8;

    .loc    28      11      0

    st.global.s32   [%r1+0], %r9;

    .loc    28      12      0

    exit;

$LDWend__Z3funPi:

    } // _Z3funPi

ptxas -arch sm_20 hw.compute_20.ptx

nvdis elf.o

ELF File…

00000000: 2800440400005de4 mov b32 $r1 c1[0x100]

00000008: 2800400080009de4 mov b32 $r2 c0[0x20]

00000010: 180000001400dde2 mov b32 $r3 0x5

00000018: 8800000000201c85 ldu b32 $r0 g[$r2+0]

00000020: 2006c0000c001ca3 add $r0 mul s32 $r0 0x3 $r3 <<<<

00000028: 9000000000201c85 st b32 wb g[$r2+0] $r0

00000030: 8000000000001de7 exit

#[/codebox]

There is no integer MAD in the hardware, so it looks like the compiler does not even bother to use imad in PTX, as it will be converted anyway into a sequence of multiplies (even 24-bit ones for pre-Fermi GPUs) and additions.

There is no integer MAD in the hardware, so it looks like the compiler does not even bother to use imad in PTX, as it will be converted anyway into a sequence of multiplies (even 24-bit ones for pre-Fermi GPUs) and additions.

thanks for the explanation. After reading more stuff about CUDA hardware and experimenting with simple kernels I came to the same conclusion.

to kaberdude: the asm directive is a nifty trick :)

thanks for the explanation. After reading more stuff about CUDA hardware and experimenting with simple kernels I came to the same conclusion.

to kaberdude: the asm directive is a nifty trick :)

Hi,

The translation really depends on your target architecture. PTX MAD.LO.S32 turns into multiple machine code instructions for sm_10 to sm_13. Kind of confusing, because NVIDIA’s CUOBJDUMP shows a couple IMAD machine code instructions. But, it isn’t used as how I initially thought. They are just used to perform a 32-bit integer multiplication. But, for sm_20, it translates to an integer MAD machine code instruction, or so NVDIS says so.

Ken

[codebox]# cat hw.cu

cat hw.cu

#include <iostream>

__global__ void fun(int * mem)

{

    int a = *mem;

    int b = *mem;

    int d = *mem;

    asm("\

   mad.lo.s32 %0, %1, %2, %3;"

       : "=r"(d) : "r"(d), "r"(a), "r"(b) : );

    *mem = d;

}

int main()

{

    int h = 1;

    int * d;

    cudaMalloc(&d, sizeof(int));

    cudaMemcpy(d, &h, sizeof(int), cudaMemcpyHostToDevice);

    fun<<<1,1>>>(d);

    cudaThreadSynchronize();

    int rv = cudaGetLastError();

    cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Result = " << h << "\n";

    return 0;

}

cat hw.compute_10.ptx

    .version 1.4

    .target sm_10, map_f64_to_f32

// LINES REMOVED

.entry _Z3funPi (

            .param .u32 __cudaparm__Z3funPi_mem)

    {

    .reg .u32 %r<9>;

    .loc    28      3       0

$LDWbegin__Z3funPi:

    .loc    28      8       0

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    ld.global.s32   %r2, [%r1+0];

    mov.u32         %r3, %r2;

    mov.u32         %r4, %r2;

    mov.u32         %r5, %r2;

           mad.lo.s32 %r6, %r3, %r4, %r5;

    mov.s32         %r7, %r6;

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    .loc    28      11      0

    st.global.s32   [%r1+0], %r7;

    .loc    28      12      0

    exit;

$LDWend__Z3funPi:

    } // _Z3funPi

cat hw.compute_20.ptx

    .version 2.2

    .target sm_20

// LINES REMOVED

.entry _Z3funPi (

            .param .u32 __cudaparm__Z3funPi_mem)

    {

    .reg .u32 %r<9>;

    .loc    28      3       0

$LDWbegin__Z3funPi:

    .loc    28      8       0

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    ldu.global.s32  %r2, [%r1+0];

    mov.u32         %r3, %r2;

    mov.u32         %r4, %r2;

    mov.u32         %r5, %r2;

           mad.lo.s32 %r6, %r3, %r4, %r5;

    mov.s32         %r7, %r6;

    .loc    28      11      0

    st.global.s32   [%r1+0], %r7;

    .loc    28      12      0

    exit;

$LDWend__Z3funPi:

    } // _Z3funPi

ptxas -arch sm_10 hw.compute_10.ptx

nvdis elf.o

ELF File…

00000000: 0423c7801000c801 mov b32 $r0 b32 s[0x10]

00000008: 80c00780d00e0001 ld b32 $r0 g14[$r0]

00000010: 0000078040010005 mul $r1 u16 $r0l u16 $r0h

00000018: 0000478060000205 add $r1 mul u16 $r0h $r0l $r1 <<<<<< IMAD

00000020: c410078030100205 shl b32 $r1 $r1 0x10

00000028: 0000478060000005 add $r1 mul u16 $r0l $r0l $r1 <<< IMAD

00000030: 20008204 add b32 $r1 $r1 $r0

00000034: 1100e800 mov b32 $r0 b32 s[0x10]

00000038: a0c00781d00e0005 st b32 g14[$r0] $r1

                       exit

ptxas -arch sm_20 hw.compute_20.ptx

nvdis elf.o

ELF File…

00000000: 2800440400005de4 mov b32 $r1 c1[0x100]

00000008: 2800400080009de4 mov b32 $r2 c0[0x20]

00000010: 8800000000201c85 ldu b32 $r0 g[$r2+0]

00000018: 2000000000001ca3 add $r0 mul s32 $r0 $r0 $r0 <<<<<<<<<<<<<<<<< INTEGER MAD

00000020: 9000000000201c85 st b32 wb g[$r2+0] $r0

00000028: 8000000000001de7 exit

# (RECOMPILE FOR ONLY SM_10. CUOBJDUMP DOES NOT WORK WITH MULTIPLE TARGETS.)

cuobjdump -sass Debug/hw.exe

c:/Personal/tem/cuda-waste/test/hw/hw.cu:

========================================

Version        = 0x00000004

	     gpuInfoVersion = 0xa14f518d

			      key            = 5585e27e8bf2d4b8

usageMode      = -maxrregcount=32

			       debuggable     = no

ptx            = compute_10

		 code for sm_10

		 --------------

Function : _Z3funPi

/*0000*/        MOV R0, g [0x4];

/*0008*/        GLD.U32 R0, global14 [R0];

/*0010*/        IMUL.U16.U16 R1, R0L, R0H;

/*0018*/        IMAD.U16 R1, R0H, R0L, R1;

/*0020*/        SHL R1, R1, 0x10;

/*0028*/        IMAD.U16 R1, R0L, R0L, R1;

/*0030*/        IADD32 R1, R1, R0;

/*0034*/        MOV32 R0, g [0x4];

/*0038*/        GST.U32 global14 [R0], R1;

...................

[/codebox]

Hi,

The translation really depends on your target architecture. PTX MAD.LO.S32 turns into multiple machine code instructions for sm_10 to sm_13. Kind of confusing, because NVIDIA’s CUOBJDUMP shows a couple IMAD machine code instructions. But, it isn’t used as how I initially thought. They are just used to perform a 32-bit integer multiplication. But, for sm_20, it translates to an integer MAD machine code instruction, or so NVDIS says so.

Ken

[codebox]# cat hw.cu

cat hw.cu

#include <iostream>

__global__ void fun(int * mem)

{

    int a = *mem;

    int b = *mem;

    int d = *mem;

    asm("\

   mad.lo.s32 %0, %1, %2, %3;"

       : "=r"(d) : "r"(d), "r"(a), "r"(b) : );

    *mem = d;

}

int main()

{

    int h = 1;

    int * d;

    cudaMalloc(&d, sizeof(int));

    cudaMemcpy(d, &h, sizeof(int), cudaMemcpyHostToDevice);

    fun<<<1,1>>>(d);

    cudaThreadSynchronize();

    int rv = cudaGetLastError();

    cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Result = " << h << "\n";

    return 0;

}

cat hw.compute_10.ptx

    .version 1.4

    .target sm_10, map_f64_to_f32

// LINES REMOVED

.entry _Z3funPi (

            .param .u32 __cudaparm__Z3funPi_mem)

    {

    .reg .u32 %r<9>;

    .loc    28      3       0

$LDWbegin__Z3funPi:

    .loc    28      8       0

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    ld.global.s32   %r2, [%r1+0];

    mov.u32         %r3, %r2;

    mov.u32         %r4, %r2;

    mov.u32         %r5, %r2;

           mad.lo.s32 %r6, %r3, %r4, %r5;

    mov.s32         %r7, %r6;

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    .loc    28      11      0

    st.global.s32   [%r1+0], %r7;

    .loc    28      12      0

    exit;

$LDWend__Z3funPi:

    } // _Z3funPi

cat hw.compute_20.ptx

    .version 2.2

    .target sm_20

// LINES REMOVED

.entry _Z3funPi (

            .param .u32 __cudaparm__Z3funPi_mem)

    {

    .reg .u32 %r<9>;

    .loc    28      3       0

$LDWbegin__Z3funPi:

    .loc    28      8       0

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    ldu.global.s32  %r2, [%r1+0];

    mov.u32         %r3, %r2;

    mov.u32         %r4, %r2;

    mov.u32         %r5, %r2;

           mad.lo.s32 %r6, %r3, %r4, %r5;

    mov.s32         %r7, %r6;

    .loc    28      11      0

    st.global.s32   [%r1+0], %r7;

    .loc    28      12      0

    exit;

$LDWend__Z3funPi:

    } // _Z3funPi

ptxas -arch sm_10 hw.compute_10.ptx

nvdis elf.o

ELF File…

00000000: 0423c7801000c801 mov b32 $r0 b32 s[0x10]

00000008: 80c00780d00e0001 ld b32 $r0 g14[$r0]

00000010: 0000078040010005 mul $r1 u16 $r0l u16 $r0h

00000018: 0000478060000205 add $r1 mul u16 $r0h $r0l $r1 <<<<<< IMAD

00000020: c410078030100205 shl b32 $r1 $r1 0x10

00000028: 0000478060000005 add $r1 mul u16 $r0l $r0l $r1 <<< IMAD

00000030: 20008204 add b32 $r1 $r1 $r0

00000034: 1100e800 mov b32 $r0 b32 s[0x10]

00000038: a0c00781d00e0005 st b32 g14[$r0] $r1

                       exit

ptxas -arch sm_20 hw.compute_20.ptx

nvdis elf.o

ELF File…

00000000: 2800440400005de4 mov b32 $r1 c1[0x100]

00000008: 2800400080009de4 mov b32 $r2 c0[0x20]

00000010: 8800000000201c85 ldu b32 $r0 g[$r2+0]

00000018: 2000000000001ca3 add $r0 mul s32 $r0 $r0 $r0 <<<<<<<<<<<<<<<<< INTEGER MAD

00000020: 9000000000201c85 st b32 wb g[$r2+0] $r0

00000028: 8000000000001de7 exit

# (RECOMPILE FOR ONLY SM_10. CUOBJDUMP DOES NOT WORK WITH MULTIPLE TARGETS.)

cuobjdump -sass Debug/hw.exe

c:/Personal/tem/cuda-waste/test/hw/hw.cu:

========================================

Version        = 0x00000004

	     gpuInfoVersion = 0xa14f518d

			      key            = 5585e27e8bf2d4b8

usageMode      = -maxrregcount=32

			       debuggable     = no

ptx            = compute_10

		 code for sm_10

		 --------------

Function : _Z3funPi

/*0000*/        MOV R0, g [0x4];

/*0008*/        GLD.U32 R0, global14 [R0];

/*0010*/        IMUL.U16.U16 R1, R0L, R0H;

/*0018*/        IMAD.U16 R1, R0H, R0L, R1;

/*0020*/        SHL R1, R1, 0x10;

/*0028*/        IMAD.U16 R1, R0L, R0L, R1;

/*0030*/        IADD32 R1, R1, R0;

/*0034*/        MOV32 R0, g [0x4];

/*0038*/        GST.U32 global14 [R0], R1;

...................

[/codebox]