How can mul.wide.u32 be implemented in C code?
Is there any C code that can be compiled to mul.wide.u32?
Input code fed into PTXAS: // http://forums.nvidia.com/index.php?showtopic=83482
[codebox].version 1.4
.target sm_13
.global .u32 ga;
.global .u32 gb;
.global .u64 gu;
// MULT16X16WIDE: 16 x 16 -> 32-bit unsigned multiply.
// Loads a 16-bit value from each of ga and gb, forms the full 32-bit product
// with mul.wide.u16, and stores that product at gu.
// NOTE(review): ga/gb are declared .u32 and gu is declared .u64 in the
// listing's globals, so this kernel touches only part of each variable.
.entry MULT16X16WIDE
{
.reg .u16 a;
.reg .u16 b;
.reg .u32 u;
ld.global.u16 a, [ga];
ld.global.u16 b, [gb];
// Destination is twice the width of the sources (u16 * u16 -> u32).
mul.wide.u16 u,a,b;
st.global.u32 [gu], u;
}
// MULT32X32WIDE: 32 x 32 -> 64-bit unsigned multiply.
// Loads 32-bit operands from ga and gb, forms the full 64-bit product with a
// single mul.wide.u32, and stores all 64 bits at gu.
.entry MULT32X32WIDE
{
.reg .u32 a;
.reg .u32 b;
.reg .u64 u;
ld.global.u32 a, [ga];
ld.global.u32 b, [gb];
// Destination is twice the width of the sources (u32 * u32 -> u64).
mul.wide.u32 u,a,b;
st.global.u64 [gu], u;
}
// MULT32X32HILO: 32 x 32 -> 64-bit unsigned multiply built from two 32-bit
// multiplies. mul.lo.u32 produces the low word and mul.hi.u32 the high word;
// the two words are stored separately at gu and gu+4.
.entry MULT32X32HILO
{
.reg .u32 a;
.reg .u32 b;
.reg .u32 u;
.reg .u32 v;
ld.global.u32 a, [ga];
ld.global.u32 b, [gb];
// u = low 32 bits of a*b, v = high 32 bits of a*b.
mul.lo.u32 u,a,b;
mul.hi.u32 v,a,b;
// Low word at offset 0 of gu, high word at offset 4.
st.global.u32 [gu], u;
st.global.u32 [gu+4],v;
}
// MULT64X64LO: 64 x 64 -> low 64 bits unsigned multiply.
// Loads 64-bit operands from ga and gb, keeps the low 64 bits of the product
// (mul.lo.u64), and stores them at gu.
// NOTE(review): ga and gb are declared ".global .u32" in this listing, yet
// they are read here with ld.global.u64 (8 bytes each). The loads reach past
// the declared 4-byte variables; confirm whether ga/gb were meant to be .u64.
.entry MULT64X64LO
{
.reg .u64 a;
.reg .u64 b;
.reg .u64 u;
ld.global.u64 a, [ga];
ld.global.u64 b, [gb];
// Only the low 64 bits of the 128-bit product are kept.
mul.lo.u64 u,a,b;
st.global.u64 [gu], u;
}[/codebox]
Here is what DECUDA produced from the cubin file that PTXAS generated from that:
[codebox]// Disassembling MULT32X32HILO (0)
// DECUDA disassembly of the code PTXAS generated for the MULT32X32HILO entry.
// The listing contains 19 instructions and no 32x32 multiply: the product is
// assembled from multiplies of the 16-bit halves ($rN.lo / $rN.hi) of the two
// operands.
// NOTE(review): the annotations below are a reading of the listing, not
// documented DECUDA semantics; confirm against the DECUDA documentation.
.entry MULT32X32HILO
{
.lmem 0
.smem 0
.reg 6
.bar 0
// Presumably c14[0x0000]/c14[0x0004] hold the addresses of ga and gb; the two
// operands are then loaded from global memory (g[...]) into $r0 and $r1.
mov.b32 $r0, c14[0x0000]
mov.b32 $r1, c14[0x0004]
mov.u32 $r0, g[$r0]
mov.u32 $r1, g[$r1]
// Cross terms: $r0.lo*$r1.hi + $r0.hi*$r1.lo. The sum is computed twice, once
// into $r3 and once into $r2 with $p0 also written (presumably carry flags).
mul24.lo.u32.u16.u16 $r2, $r0.lo, $r1.hi
mad24.lo.u32.u16.u16.u32 $r3, $r0.hi, $r1.lo, $r2
mad24.lo.u32.u16.u16.u32 $p0|$r2, $r0.hi, $r1.lo, $r2
// Split the cross-term sum: shifted left by 16 for the low word, shifted right
// by 16 for the high word.
shl.u32 $r3, $r3, 0x00000010
shl.u32 $r5, $r2, 0x00000010
shr.u32 $r2, $r2, 0x00000010
// Low word = $r0.lo*$r1.lo + (cross << 16). The second mad24 writes $o127 and
// $p1; presumably only the flag result is wanted - TODO confirm.
mad24.lo.u32.u16.u16.u32 $r4, $r0.lo, $r1.lo, $r3
mad24.lo.u32.u16.u16.u32 $p1|$o127, $r0.lo, $r1.lo, $r5
mov.b32 $r3, c14[0x0008]
// Conditionally adds c1[0x0000] (0x00010000 per the constseg dump below),
// predicated on $p0.
@$p0.cf add.u32 $r2, $r2, c1[0x0000]
mov.b32 $r5, 0x00000004
// First store; presumably the low word to gu.
mov.u32 g[$r3], $r4
// High word from $r0.hi*$r1.hi and $r2. The '-' operand markers are reproduced
// as DECUDA printed them; their exact meaning is not established here.
mad24.lo.u32.u16.u16.u32 $r1, -$r0.hi, $r1.hi, -$r2
// Presumably forms the address gu+4 (constant 4 in $r5 plus c2[0x0008]).
add.u32 $r0, $r5, c2[0x0008]// (unk1 03000000)
mov.end.u32 g[$r0], $r1
#.constseg 1:0x0000 const
#{
#d.32 0x00010000 // 0000
#}
}
// Disassembling MULT64X64LO (1)
// DECUDA disassembly of the code PTXAS generated for the MULT64X64LO entry.
// The listing contains 26 instructions (the MULT32X32WIDE listing has 16) and
// builds the result from multiplies of 16-bit register halves.
// NOTE(review): the annotations below are a reading of the listing, not
// documented DECUDA semantics; confirm against the DECUDA documentation.
.entry MULT64X64LO
{
.lmem 0
.smem 0
.reg 10
.bar 0
// Presumably c14[0x0000]/c14[0x0004] hold the addresses of ga and gb.
mov.b32 $r0, c14[0x0000]
mov.b32 $r4, c14[0x0004]
// 64-bit loads; presumably each fills a register pair ($r2/$r3 and $r0/$r1),
// since the code below uses $r3 and $r1 without any other write to them.
mov.b64 $r2, g[$r0]
mov.b64 $r0, g[$r4]
// 16x16 partial products of the operand halves, accumulated with mad24.
mul24.half.lo.u32.u16.u16 $r5, $r2.lo, $r0.hi
mul24.half.lo.u32.u16.u16 $r4, $r0.hi, $r3.lo
mad24.lo.u32.u16.u16.u32 $p0|$r9, $r2.hi, $r0.lo, $r5
mad24.lo.u32.u16.u16.u32 $r8, $r0.lo, $r3.hi, $r4
mul24.lo.u32.u16.u16 $r7, $r2.lo, $r1.hi
shl.u32 $r6, $r9, 0x00000010
shr.u32 $r4, $r9, 0x00000010
shl.u32 $r8, $r8, 0x00000010
mad24.lo.u32.u16.u16.u32 $r7, $r2.hi, $r1.lo, $r7
// Writes $o127 and $p1; presumably only the flag result is wanted - TODO confirm.
mad24.lo.u32.u16.u16.u32 $p1|$o127, $r2.lo, $r0.lo, $r6
// Conditionally adds c1[0x0000] (0x00010000 per the constseg dump below),
// predicated on $p0.
@$p0.cf add.u32 $r4, $r4, c1[0x0000]
mad24.lo.u32.u16.u16.u32 $r3, $r0.lo, $r3.lo, $r8
shl.u32 $r6, $r7, 0x00000010
mad24.lo.u32.u16.u16.u32 $r5, $r2.hi, $r0.lo, $r5
// The '-' operand markers are reproduced as DECUDA printed them; their exact
// meaning is not established here.
mad24.lo.u32.u16.u16.u32 $r7, -$r2.hi, $r0.hi, -$r4
mad24.lo.u32.u16.u16.u32 $r4, $r2.lo, $r1.lo, $r6
shl.u32 $r1, $r5, 0x00000010
add.u32 $r3, $r3, $r7
// $r0 and $r1 receive the final values that the b64 store below writes out.
mad24.lo.u32.u16.u16.u32 $r0, $r2.lo, $r0.lo, $r1
add.u32 $r1, $r4, $r3
mov.b32 $r2, c14[0x0008]
// 64-bit store of the result; presumably to gu.
mov.end.b64 g[$r2], $r0
#.constseg 1:0x0000 const
#{
#d.32 0x00010000 // 0000
#}
}
// Disassembling MULT32X32WIDE (2)
// DECUDA disassembly of the code PTXAS generated for the MULT32X32WIDE entry.
// The single PTX mul.wide.u32 appears here as 16 instructions built from
// multiplies of the 16-bit halves ($rN.lo / $rN.hi) of the two operands.
// NOTE(review): the annotations below are a reading of the listing, not
// documented DECUDA semantics; confirm against the DECUDA documentation.
.entry MULT32X32WIDE
{
.lmem 0
.smem 0
.reg 5
.bar 0
// Presumably c14[0x0000]/c14[0x0004] hold the addresses of ga and gb; the two
// operands are then loaded from global memory (g[...]) into $r0 and $r1.
mov.b32 $r0, c14[0x0000]
mov.b32 $r1, c14[0x0004]
mov.u32 $r0, g[$r0]
mov.u32 $r1, g[$r1]
// Cross terms: $r0.lo*$r1.hi + $r0.hi*$r1.lo, computed into $r3 (with $p0 also
// written, presumably carry flags) and again into $r2 further down.
mul24.lo.u32.u16.u16 $r2, $r0.lo, $r1.hi
mad24.lo.u32.u16.u16.u32 $p0|$r3, $r0.hi, $r1.lo, $r2
shl.u32 $r4, $r3, 0x00000010
mad24.lo.u32.u16.u16.u32 $r2, $r0.hi, $r1.lo, $r2
shr.u32 $r3, $r3, 0x00000010
// Writes $o127 and $p1; presumably only the flag result is wanted - TODO confirm.
mad24.lo.u32.u16.u16.u32 $p1|$o127, $r0.lo, $r1.lo, $r4
shl.u32 $r2, $r2, 0x00000010
// Conditionally adds c1[0x0000] (0x00010000 per the constseg dump below),
// predicated on $p0.
@$p0.cf add.u32 $r3, $r3, c1[0x0000]
// $r2 = $r0.lo*$r1.lo + (cross << 16): the low word of the product.
mad24.lo.u32.u16.u16.u32 $r2, $r0.lo, $r1.lo, $r2
// $r3 from $r0.hi*$r1.hi and the shifted cross term: presumably the high word.
// The '-' operand markers are reproduced as DECUDA printed them; their exact
// meaning is not established here.
mad24.lo.u32.u16.u16.u32 $r3, -$r0.hi, $r1.hi, -$r3
mov.b32 $r0, c14[0x0008]
// 64-bit store starting at $r2; presumably writes the $r2/$r3 pair to gu.
mov.end.b64 g[$r0], $r2
#.constseg 1:0x0000 const
#{
#d.32 0x00010000 // 0000
#}
}
// Disassembling MULT16X16WIDE (3)
// DECUDA disassembly of the code PTXAS generated for the MULT16X16WIDE entry.
// The PTX mul.wide.u16 appears here as a single mul24 on the low 16-bit halves
// of two registers; the whole listing is 7 instructions.
.entry MULT16X16WIDE
{
.lmem 0
.smem 0
.reg 3
.bar 0
// Presumably c14[0x0000]/c14[0x0004]/c14[0x0008] hold the addresses of ga, gb
// and gu respectively - TODO confirm.
mov.b32 $r0, c14[0x0000]
mov.b32 $r2, c14[0x0004]
// 16-bit loads of the two operands from global memory.
mov.u16 $r1, g[$r0]
mov.u16 $r0, g[$r2]
mov.b32 $r2, c14[0x0008]
// One 16x16 multiply producing a 32-bit result in $r0.
mul24.lo.u32.u16.u16 $r0, $r1.lo, $r0.lo
mov.end.u32 g[$r2], $r0
}
[/codebox]
In C, if I want a 32*32 -> 64-bit product, it is computed as 32*32 -> 64*64 -> lo = 64 bits:
[codebox]unsigned __int64 x1;
x1 = int32_data1;
x1 = x1*int32_data2;
------>
mul.lo.u64
[/codebox]
As the DECUDA output shows, the 64*64 multiply (mul.lo.u64) needs more instructions than mul.wide.u32.
How can mul.wide.u32 be implemented in C code?
Is there any C code that can be compiled to mul.wide.u32?