Integer MAD instruction

Hello,
I am new to CUDA. After reading the PTX ISA doc I found there is a separate MAD instruction for integer values. But when I use something like this:
a = b * c + d in my C for CUDA kernel, it is translated to a sequence of multiply and add instructions in the PTX output. Is there a way to tell the compiler to translate this to a MAD instruction?

Hello,
I am new to CUDA. After reading the PTX ISA doc I found there is a separate MAD instruction for integer values. But when I use something like this:
a = b * c + d in my C for CUDA kernel, it is translated to a sequence of multiply and add instructions in the PTX output. Is there a way to tell the compiler to translate this to a MAD instruction?

Check the actual cubin for the real instructions the GPU executes. The ptxas intermediate output is further compiled to assembly, and the MADD optimization might be done there instead. You’ll need to use decuda or cuobjdump.

Check the actual cubin for the real instructions the GPU executes. The ptxas intermediate output is further compiled to assembly, and the MADD optimization might be done there instead. You’ll need to use decuda or cuobjdump.

You can use the inline asm directive, but it is not documented, not supported, and not recommended by the Nvidia folks.

[codebox]# cat hw.cu

#include <iostream>

__global__ void fun(int * mem)

{

    int a = 3;

    int b = 5;

    int d;

    asm("\

   mad.lo.s32 %0, %1, %2, %3;"

       : "=r"(d) : "r"(*mem), "r"(a), "r"(b) : );

    *mem = d;

}

int main()

{

    int h = 1;

    int * d;

    cudaMalloc(&d, sizeof(int));

    cudaMemcpy(d, &h, sizeof(int), cudaMemcpyHostToDevice);

    fun<<<1,1>>>(d);

    cudaThreadSynchronize();

    int rv = cudaGetLastError();

    cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Result = " << h << "\n";

    return 0;

}

# COMPILE AND BUILD WITH -KEEP OPTION …

cat hw.compute_20.ptx

    .version 2.2

    .target sm_20

    // compiled with C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\\bin/../open64/lib//be.exe

    // nvopencc 3.2 built on 2010-09-09

//-----------------------------------------------------------

    // Compiling hw.compute_20.cpp3.i (C:/Users/Ken/AppData/Local/Temp/ccBI#.a03380)

    //-----------------------------------------------------------

//-----------------------------------------------------------

    // Options:

    //-----------------------------------------------------------

    //  Target:ptx, ISA:sm_20, Endian:little, Pointer Size:32

    //  -O3 (Optimization level)

    //  -g0 (Debug level)

    //  -m2 (Report advisories)

    //-----------------------------------------------------------

/// (REMOVED .FILE LINES)

.entry _Z3funPi (

            .param .u32 __cudaparm__Z3funPi_mem)

    {

    .reg .u32 %r<11>;

    .loc    28      3       0

$LDWbegin__Z3funPi:

    .loc    28      8       0

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    ldu.global.s32  %r2, [%r1+0];

    mov.u32         %r3, %r2;

    mov.s32         %r4, 3;

    mov.u32         %r5, %r4;

    mov.s32         %r6, 5;

    mov.u32         %r7, %r6;

           mad.lo.s32 %r8, %r3, %r5, %r7;

    mov.s32         %r9, %r8;

    .loc    28      11      0

    st.global.s32   [%r1+0], %r9;

    .loc    28      12      0

    exit;

$LDWend__Z3funPi:

    } // _Z3funPi

ptxas -arch sm_20 hw.compute_20.ptx

nvdis elf.o

ELF File…

00000000: 2800440400005de4 mov b32 $r1 c1[0x100]

00000008: 2800400080009de4 mov b32 $r2 c0[0x20]

00000010: 180000001400dde2 mov b32 $r3 0x5

00000018: 8800000000201c85 ldu b32 $r0 g[$r2+0]

00000020: 2006c0000c001ca3 add $r0 mul s32 $r0 0x3 $r3 <<<<

00000028: 9000000000201c85 st b32 wb g[$r2+0] $r0

00000030: 8000000000001de7 exit

#[/codebox]

You can use the inline asm directive, but it is not documented, not supported, and not recommended by the Nvidia folks.

[codebox]# cat hw.cu

#include <iostream>

__global__ void fun(int * mem)

{

    int a = 3;

    int b = 5;

    int d;

    asm("\

   mad.lo.s32 %0, %1, %2, %3;"

       : "=r"(d) : "r"(*mem), "r"(a), "r"(b) : );

    *mem = d;

}

int main()

{

    int h = 1;

    int * d;

    cudaMalloc(&d, sizeof(int));

    cudaMemcpy(d, &h, sizeof(int), cudaMemcpyHostToDevice);

    fun<<<1,1>>>(d);

    cudaThreadSynchronize();

    int rv = cudaGetLastError();

    cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Result = " << h << "\n";

    return 0;

}

# COMPILE AND BUILD WITH -KEEP OPTION …

cat hw.compute_20.ptx

    .version 2.2

    .target sm_20

    // compiled with C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\\bin/../open64/lib//be.exe

    // nvopencc 3.2 built on 2010-09-09

//-----------------------------------------------------------

    // Compiling hw.compute_20.cpp3.i (C:/Users/Ken/AppData/Local/Temp/ccBI#.a03380)

    //-----------------------------------------------------------

//-----------------------------------------------------------

    // Options:

    //-----------------------------------------------------------

    //  Target:ptx, ISA:sm_20, Endian:little, Pointer Size:32

    //  -O3 (Optimization level)

    //  -g0 (Debug level)

    //  -m2 (Report advisories)

    //-----------------------------------------------------------

/// (REMOVED .FILE LINES)

.entry _Z3funPi (

            .param .u32 __cudaparm__Z3funPi_mem)

    {

    .reg .u32 %r<11>;

    .loc    28      3       0

$LDWbegin__Z3funPi:

    .loc    28      8       0

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    ldu.global.s32  %r2, [%r1+0];

    mov.u32         %r3, %r2;

    mov.s32         %r4, 3;

    mov.u32         %r5, %r4;

    mov.s32         %r6, 5;

    mov.u32         %r7, %r6;

           mad.lo.s32 %r8, %r3, %r5, %r7;

    mov.s32         %r9, %r8;

    .loc    28      11      0

    st.global.s32   [%r1+0], %r9;

    .loc    28      12      0

    exit;

$LDWend__Z3funPi:

    } // _Z3funPi

ptxas -arch sm_20 hw.compute_20.ptx

nvdis elf.o

ELF File…

00000000: 2800440400005de4 mov b32 $r1 c1[0x100]

00000008: 2800400080009de4 mov b32 $r2 c0[0x20]

00000010: 180000001400dde2 mov b32 $r3 0x5

00000018: 8800000000201c85 ldu b32 $r0 g[$r2+0]

00000020: 2006c0000c001ca3 add $r0 mul s32 $r0 0x3 $r3 <<<<

00000028: 9000000000201c85 st b32 wb g[$r2+0] $r0

00000030: 8000000000001de7 exit

#[/codebox]

There is no integer MAD in the hardware, so it looks like the compiler does not even bother to use imad in PTX, as it will be converted anyway into a sequence of multiplies (even 24-bit ones for pre-Fermi GPUs) and additions.

There is no integer MAD in the hardware, so it looks like the compiler does not even bother to use imad in PTX, as it will be converted anyway into a sequence of multiplies (even 24-bit ones for pre-Fermi GPUs) and additions.

thanks for the explanation. After reading more stuff about CUDA hardware and experimenting with simple kernels I came to the same conclusion.

to kaberdude: the asm directive is a nifty trick :)

thanks for the explanation. After reading more stuff about CUDA hardware and experimenting with simple kernels I came to the same conclusion.

to kaberdude: the asm directive is a nifty trick :)

Hi,

The translation really depends on your target architecture. PTX MAD.LO.S32 turns into multiple machine code instructions for sm_10 to sm_13. Kind of confusing, because NVIDIA’s CUOBJDUMP shows a couple IMAD machine code instructions. But, it isn’t used as how I initially thought. They are just used to perform a 32-bit integer multiplication. But, for sm_20, it translates to an integer MAD machine code instruction, or so NVDIS says so.

Ken

[codebox]# cat hw.cu

cat hw.cu

#include <iostream>

__global__ void fun(int * mem)

{

    int a = *mem;

    int b = *mem;

    int d = *mem;

    asm("\

   mad.lo.s32 %0, %1, %2, %3;"

       : "=r"(d) : "r"(d), "r"(a), "r"(b) : );

    *mem = d;

}

int main()

{

    int h = 1;

    int * d;

    cudaMalloc(&d, sizeof(int));

    cudaMemcpy(d, &h, sizeof(int), cudaMemcpyHostToDevice);

    fun<<<1,1>>>(d);

    cudaThreadSynchronize();

    int rv = cudaGetLastError();

    cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Result = " << h << "\n";

    return 0;

}

cat hw.compute_10.ptx

    .version 1.4

    .target sm_10, map_f64_to_f32

// LINES REMOVED

.entry _Z3funPi (

            .param .u32 __cudaparm__Z3funPi_mem)

    {

    .reg .u32 %r<9>;

    .loc    28      3       0

$LDWbegin__Z3funPi:

    .loc    28      8       0

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    ld.global.s32   %r2, [%r1+0];

    mov.u32         %r3, %r2;

    mov.u32         %r4, %r2;

    mov.u32         %r5, %r2;

           mad.lo.s32 %r6, %r3, %r4, %r5;

    mov.s32         %r7, %r6;

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    .loc    28      11      0

    st.global.s32   [%r1+0], %r7;

    .loc    28      12      0

    exit;

$LDWend__Z3funPi:

    } // _Z3funPi

cat hw.compute_20.ptx

    .version 2.2

    .target sm_20

// LINES REMOVED

.entry _Z3funPi (

            .param .u32 __cudaparm__Z3funPi_mem)

    {

    .reg .u32 %r<9>;

    .loc    28      3       0

$LDWbegin__Z3funPi:

    .loc    28      8       0

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    ldu.global.s32  %r2, [%r1+0];

    mov.u32         %r3, %r2;

    mov.u32         %r4, %r2;

    mov.u32         %r5, %r2;

           mad.lo.s32 %r6, %r3, %r4, %r5;

    mov.s32         %r7, %r6;

    .loc    28      11      0

    st.global.s32   [%r1+0], %r7;

    .loc    28      12      0

    exit;

$LDWend__Z3funPi:

    } // _Z3funPi

ptxas -arch sm_10 hw.compute_10.ptx

nvdis elf.o

ELF File…

00000000: 0423c7801000c801 mov b32 $r0 b32 s[0x10]

00000008: 80c00780d00e0001 ld b32 $r0 g14[$r0]

00000010: 0000078040010005 mul $r1 u16 $r0l u16 $r0h

00000018: 0000478060000205 add $r1 mul u16 $r0h $r0l $r1 <<<<<< IMAD

00000020: c410078030100205 shl b32 $r1 $r1 0x10

00000028: 0000478060000005 add $r1 mul u16 $r0l $r0l $r1 <<< IMAD

00000030: 20008204 add b32 $r1 $r1 $r0

00000034: 1100e800 mov b32 $r0 b32 s[0x10]

00000038: a0c00781d00e0005 st b32 g14[$r0] $r1

                       exit

ptxas -arch sm_20 hw.compute_20.ptx

nvdis elf.o

ELF File…

00000000: 2800440400005de4 mov b32 $r1 c1[0x100]

00000008: 2800400080009de4 mov b32 $r2 c0[0x20]

00000010: 8800000000201c85 ldu b32 $r0 g[$r2+0]

00000018: 2000000000001ca3 add $r0 mul s32 $r0 $r0 $r0 <<<<<<<<<<<<<<<<< INTEGER MAD

00000020: 9000000000201c85 st b32 wb g[$r2+0] $r0

00000028: 8000000000001de7 exit

# (RECOMPILE FOR ONLY SM_10. CUOBJDUMP DOES NOT WORK WITH MULTIPLE TARGETS.)

cuobjdump -sass Debug/hw.exe

c:/Personal/tem/cuda-waste/test/hw/hw.cu:

========================================

Version        = 0x00000004

	     gpuInfoVersion = 0xa14f518d

			      key            = 5585e27e8bf2d4b8

usageMode      = -maxrregcount=32

			       debuggable     = no

ptx            = compute_10

		 code for sm_10

		 --------------

Function : _Z3funPi

/*0000*/        MOV R0, g [0x4];

/*0008*/        GLD.U32 R0, global14 [R0];

/*0010*/        IMUL.U16.U16 R1, R0L, R0H;

/*0018*/        IMAD.U16 R1, R0H, R0L, R1;

/*0020*/        SHL R1, R1, 0x10;

/*0028*/        IMAD.U16 R1, R0L, R0L, R1;

/*0030*/        IADD32 R1, R1, R0;

/*0034*/        MOV32 R0, g [0x4];

/*0038*/        GST.U32 global14 [R0], R1;

...................

[/codebox]

Hi,

The translation really depends on your target architecture. PTX MAD.LO.S32 turns into multiple machine code instructions for sm_10 to sm_13. Kind of confusing, because NVIDIA’s CUOBJDUMP shows a couple IMAD machine code instructions. But, it isn’t used as how I initially thought. They are just used to perform a 32-bit integer multiplication. But, for sm_20, it translates to an integer MAD machine code instruction, or so NVDIS says so.

Ken

[codebox]# cat hw.cu

cat hw.cu

#include <iostream>

__global__ void fun(int * mem)

{

    int a = *mem;

    int b = *mem;

    int d = *mem;

    asm("\

   mad.lo.s32 %0, %1, %2, %3;"

       : "=r"(d) : "r"(d), "r"(a), "r"(b) : );

    *mem = d;

}

int main()

{

    int h = 1;

    int * d;

    cudaMalloc(&d, sizeof(int));

    cudaMemcpy(d, &h, sizeof(int), cudaMemcpyHostToDevice);

    fun<<<1,1>>>(d);

    cudaThreadSynchronize();

    int rv = cudaGetLastError();

    cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Result = " << h << "\n";

    return 0;

}

cat hw.compute_10.ptx

    .version 1.4

    .target sm_10, map_f64_to_f32

// LINES REMOVED

.entry _Z3funPi (

            .param .u32 __cudaparm__Z3funPi_mem)

    {

    .reg .u32 %r<9>;

    .loc    28      3       0

$LDWbegin__Z3funPi:

    .loc    28      8       0

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    ld.global.s32   %r2, [%r1+0];

    mov.u32         %r3, %r2;

    mov.u32         %r4, %r2;

    mov.u32         %r5, %r2;

           mad.lo.s32 %r6, %r3, %r4, %r5;

    mov.s32         %r7, %r6;

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    .loc    28      11      0

    st.global.s32   [%r1+0], %r7;

    .loc    28      12      0

    exit;

$LDWend__Z3funPi:

    } // _Z3funPi

cat hw.compute_20.ptx

    .version 2.2

    .target sm_20

// LINES REMOVED

.entry _Z3funPi (

            .param .u32 __cudaparm__Z3funPi_mem)

    {

    .reg .u32 %r<9>;

    .loc    28      3       0

$LDWbegin__Z3funPi:

    .loc    28      8       0

    ld.param.u32    %r1, [__cudaparm__Z3funPi_mem];

    ldu.global.s32  %r2, [%r1+0];

    mov.u32         %r3, %r2;

    mov.u32         %r4, %r2;

    mov.u32         %r5, %r2;

           mad.lo.s32 %r6, %r3, %r4, %r5;

    mov.s32         %r7, %r6;

    .loc    28      11      0

    st.global.s32   [%r1+0], %r7;

    .loc    28      12      0

    exit;

$LDWend__Z3funPi:

    } // _Z3funPi

ptxas -arch sm_10 hw.compute_10.ptx

nvdis elf.o

ELF File…

00000000: 0423c7801000c801 mov b32 $r0 b32 s[0x10]

00000008: 80c00780d00e0001 ld b32 $r0 g14[$r0]

00000010: 0000078040010005 mul $r1 u16 $r0l u16 $r0h

00000018: 0000478060000205 add $r1 mul u16 $r0h $r0l $r1 <<<<<< IMAD

00000020: c410078030100205 shl b32 $r1 $r1 0x10

00000028: 0000478060000005 add $r1 mul u16 $r0l $r0l $r1 <<< IMAD

00000030: 20008204 add b32 $r1 $r1 $r0

00000034: 1100e800 mov b32 $r0 b32 s[0x10]

00000038: a0c00781d00e0005 st b32 g14[$r0] $r1

                       exit

ptxas -arch sm_20 hw.compute_20.ptx

nvdis elf.o

ELF File…

00000000: 2800440400005de4 mov b32 $r1 c1[0x100]

00000008: 2800400080009de4 mov b32 $r2 c0[0x20]

00000010: 8800000000201c85 ldu b32 $r0 g[$r2+0]

00000018: 2000000000001ca3 add $r0 mul s32 $r0 $r0 $r0 <<<<<<<<<<<<<<<<< INTEGER MAD

00000020: 9000000000201c85 st b32 wb g[$r2+0] $r0

00000028: 8000000000001de7 exit

# (RECOMPILE FOR ONLY SM_10. CUOBJDUMP DOES NOT WORK WITH MULTIPLE TARGETS.)

cuobjdump -sass Debug/hw.exe

c:/Personal/tem/cuda-waste/test/hw/hw.cu:

========================================

Version        = 0x00000004

	     gpuInfoVersion = 0xa14f518d

			      key            = 5585e27e8bf2d4b8

usageMode      = -maxrregcount=32

			       debuggable     = no

ptx            = compute_10

		 code for sm_10

		 --------------

Function : _Z3funPi

/*0000*/        MOV R0, g [0x4];

/*0008*/        GLD.U32 R0, global14 [R0];

/*0010*/        IMUL.U16.U16 R1, R0L, R0H;

/*0018*/        IMAD.U16 R1, R0H, R0L, R1;

/*0020*/        SHL R1, R1, 0x10;

/*0028*/        IMAD.U16 R1, R0L, R0L, R1;

/*0030*/        IADD32 R1, R1, R0;

/*0034*/        MOV32 R0, g [0x4];

/*0038*/        GST.U32 global14 [R0], R1;

...................

[/codebox]