how to implement mul.wide.u32 in C code? 32-bit multiplication and 64-bit registers

How can I implement mul.wide.u32 in C code?

Is there any C code that can be compiled to mul.wide.u32?

Input code fed into PTXAS: // http://forums.nvidia.com/index.php?showtopic=83482

[codebox].version 1.4

.target sm_13

// Module-scope operands shared by all four test kernels below.
// ga and gb are declared 64 bits wide because MULT64X64LO reads each of them
// with ld.global.u64; the narrower kernels only read their leading 2 or 4
// bytes, so widening the declarations does not change what those kernels do.
.global .u64 ga;

.global .u64 gb;

// Result slot, 64 bits wide: written with one 64-bit store, one 32-bit store,
// or two 32-bit stores (gu and gu+4), depending on the kernel.
.global .u64 gu;

// MULT16X16WIDE: 16-bit x 16-bit widening multiply test.
//   in : one 16-bit load each from ga and gb
//   out: 32-bit result stored to gu
.entry MULT16X16WIDE

{

.reg	.u16	a;

.reg	.u16	b;

.reg	.u32	u;

ld.global.u16 a, [ga];

ld.global.u16 b, [gb];	

// Widening multiply: two .u16 sources, .u32 destination.
mul.wide.u16 u,a,b;

st.global.u32 [gu], u;

}

// MULT32X32WIDE: 32-bit x 32-bit widening multiply test (the instruction the
// post is asking about).
//   in : one 32-bit load each from ga and gb
//   out: 64-bit result stored to gu
.entry MULT32X32WIDE

{

.reg	.u32	a;

.reg	.u32	b;

.reg	.u64	u;

ld.global.u32 a, [ga];

ld.global.u32 b, [gb];	

// Widening multiply: two .u32 sources, .u64 destination.
mul.wide.u32 u,a,b;

st.global.u64 [gu], u;

}

// MULT32X32HILO: 32-bit x 32-bit multiply done as two separate instructions,
// mul.lo.u32 and mul.hi.u32, instead of a single mul.wide.u32.
//   in : one 32-bit load each from ga and gb
//   out: mul.lo result stored at gu, mul.hi result stored at gu+4
.entry MULT32X32HILO

{

.reg	.u32	a;

.reg	.u32	b;

.reg	.u32	u;

.reg	.u32	v;

ld.global.u32 a, [ga];

ld.global.u32 b, [gb];	

// u receives the mul.lo result, v the mul.hi result of the same a*b.
mul.lo.u32 u,a,b;

mul.hi.u32 v,a,b;

// The two 32-bit halves are written to adjacent words of gu.
st.global.u32 [gu], u;

st.global.u32 [gu+4],v;

}

// MULT64X64LO: 64-bit x 64-bit multiply via mul.lo.u64, the form the post
// says C code produces for a 32x32 -> 64-bit product.
//   in : one 64-bit load each from ga and gb
//   out: 64-bit result stored to gu
// NOTE(review): these are 8-byte loads; confirm that ga and gb are declared
// at least 8 bytes wide and suitably aligned for a .u64 access.
.entry MULT64X64LO

{

.reg	.u64	a;

.reg	.u64	b;

.reg	.u64	u;

ld.global.u64 a, [ga];

ld.global.u64 b, [gb];	

// 64-bit destination from two 64-bit sources (mul.lo form).
mul.lo.u64 u,a,b;

st.global.u64 [gu], u;

}[/codebox]

Here is what DECUDA produced from the cubin file that PTXAS generated from that:

[codebox]// Disassembling MULT32X32HILO (0)

// decuda listing for MULT32X32HILO.
// Every multiply below is a mul24/mad24 on 16-bit register halves
// ($rN.lo / $rN.hi); no single 32-bit multiply instruction appears.
// NOTE(review): c14[0x0000], c14[0x0004] and c14[0x0008] are presumably the
// addresses of ga, gb and gu — confirm against the cubin.
.entry MULT32X32HILO

{

.lmem 0

.smem 0

.reg 6

.bar 0

mov.b32 $r0, c14[0x0000]

mov.b32 $r1, c14[0x0004]

// Two 32-bit loads from global memory into $r0 and $r1.
mov.u32 $r0, g[$r0]

mov.u32 $r1, g[$r1]

// Cross terms: $r0.lo*$r1.hi, then $r0.hi*$r1.lo accumulated on top.
mul24.lo.u32.u16.u16 $r2, $r0.lo, $r1.hi

mad24.lo.u32.u16.u16.u32 $r3, $r0.hi, $r1.lo, $r2

// Same mad24 again, this time also writing $p0 (tested via .cf further down).
mad24.lo.u32.u16.u16.u32 $p0|$r2, $r0.hi, $r1.lo, $r2

shl.u32 $r3, $r3, 0x00000010

shl.u32 $r5, $r2, 0x00000010

shr.u32 $r2, $r2, 0x00000010

// $r4 = $r0.lo*$r1.lo + ($r3 shifted left by 16); stored to g[$r3] below.
mad24.lo.u32.u16.u16.u32 $r4, $r0.lo, $r1.lo, $r3

mad24.lo.u32.u16.u16.u32 $p1|$o127, $r0.lo, $r1.lo, $r5

mov.b32 $r3, c14[0x0008]

// Predicated add of c1[0x0000]; the constseg at the end of this listing shows
// that word as 0x00010000. NOTE(review): presumably carry propagation — confirm.
@$p0.cf add.u32 $r2, $r2, c1[0x0000]

mov.b32 $r5, 0x00000004

mov.u32 g[$r3], $r4

// NOTE(review): the '-' operand markers are reproduced as decuda printed them;
// their exact meaning is not established by this listing.
mad24.lo.u32.u16.u16.u32 $r1, -$r0.hi, $r1.hi, -$r2

// Address for the second store: $r5 (set to 4 above) plus c2[0x0008].
add.u32 $r0, $r5, c2[0x0008]// (unk1 03000000)

mov.end.u32 g[$r0], $r1

#.constseg 1:0x0000 const

#{

#d.32 0x00010000 // 0000

#}

}

// Disassembling MULT64X64LO (1)

// decuda listing for MULT64X64LO.
// Every multiply below is a mul24/mad24 on 16-bit register halves; this
// listing uses 10 registers (.reg 10) and is the longest of the four.
// NOTE(review): c14[0x0000], c14[0x0004] and c14[0x0008] are presumably the
// addresses of ga, gb and gu — confirm against the cubin.
.entry MULT64X64LO

{

.lmem 0

.smem 0

.reg 10

.bar 0

mov.b32 $r0, c14[0x0000]

mov.b32 $r4, c14[0x0004]

// Two 64-bit loads from global memory.
// NOTE(review): each presumably fills a register pair ($r2/$r3 and $r0/$r1),
// since $r3 and $r1 are read below without another visible write — confirm.
mov.b64 $r2, g[$r0]

mov.b64 $r0, g[$r4]

mul24.half.lo.u32.u16.u16 $r5, $r2.lo, $r0.hi

mul24.half.lo.u32.u16.u16 $r4, $r0.hi, $r3.lo

// This mad24 also writes $p0, which is tested via .cf further down.
mad24.lo.u32.u16.u16.u32 $p0|$r9, $r2.hi, $r0.lo, $r5

mad24.lo.u32.u16.u16.u32 $r8, $r0.lo, $r3.hi, $r4

mul24.lo.u32.u16.u16 $r7, $r2.lo, $r1.hi

shl.u32 $r6, $r9, 0x00000010

shr.u32 $r4, $r9, 0x00000010

shl.u32 $r8, $r8, 0x00000010

mad24.lo.u32.u16.u16.u32 $r7, $r2.hi, $r1.lo, $r7

mad24.lo.u32.u16.u16.u32 $p1|$o127, $r2.lo, $r0.lo, $r6

// Predicated add of c1[0x0000]; the constseg at the end of this listing shows
// that word as 0x00010000. NOTE(review): presumably carry propagation — confirm.
@$p0.cf add.u32 $r4, $r4, c1[0x0000]

mad24.lo.u32.u16.u16.u32 $r3, $r0.lo, $r3.lo, $r8

shl.u32 $r6, $r7, 0x00000010

mad24.lo.u32.u16.u16.u32 $r5, $r2.hi, $r0.lo, $r5

// NOTE(review): the '-' operand markers are reproduced as decuda printed them;
// their exact meaning is not established by this listing.
mad24.lo.u32.u16.u16.u32 $r7, -$r2.hi, $r0.hi, -$r4

mad24.lo.u32.u16.u16.u32 $r4, $r2.lo, $r1.lo, $r6

shl.u32 $r1, $r5, 0x00000010

add.u32 $r3, $r3, $r7

mad24.lo.u32.u16.u16.u32 $r0, $r2.lo, $r0.lo, $r1

add.u32 $r1, $r4, $r3

mov.b32 $r2, c14[0x0008]

// Single 64-bit store of the result, starting at $r0.
mov.end.b64 g[$r2], $r0

#.constseg 1:0x0000 const

#{

#d.32 0x00010000 // 0000

#}

}

// Disassembling MULT32X32WIDE (2)

// decuda listing for MULT32X32WIDE (the mul.wide.u32 kernel).
// Every multiply below is a mul24/mad24 on 16-bit register halves; this
// listing uses 5 registers (.reg 5), against 10 for MULT64X64LO.
// NOTE(review): c14[0x0000], c14[0x0004] and c14[0x0008] are presumably the
// addresses of ga, gb and gu — confirm against the cubin.
.entry MULT32X32WIDE

{

.lmem 0

.smem 0

.reg 5

.bar 0

mov.b32 $r0, c14[0x0000]

mov.b32 $r1, c14[0x0004]

// Two 32-bit loads from global memory into $r0 and $r1.
mov.u32 $r0, g[$r0]

mov.u32 $r1, g[$r1]

// Cross terms: $r0.lo*$r1.hi, then $r0.hi*$r1.lo accumulated on top.
mul24.lo.u32.u16.u16 $r2, $r0.lo, $r1.hi

// This mad24 also writes $p0, which is tested via .cf further down.
mad24.lo.u32.u16.u16.u32 $p0|$r3, $r0.hi, $r1.lo, $r2

shl.u32 $r4, $r3, 0x00000010

mad24.lo.u32.u16.u16.u32 $r2, $r0.hi, $r1.lo, $r2

shr.u32 $r3, $r3, 0x00000010

mad24.lo.u32.u16.u16.u32 $p1|$o127, $r0.lo, $r1.lo, $r4

shl.u32 $r2, $r2, 0x00000010

// Predicated add of c1[0x0000]; the constseg at the end of this listing shows
// that word as 0x00010000. NOTE(review): presumably carry propagation — confirm.
@$p0.cf add.u32 $r3, $r3, c1[0x0000]

mad24.lo.u32.u16.u16.u32 $r2, $r0.lo, $r1.lo, $r2

// NOTE(review): the '-' operand markers are reproduced as decuda printed them;
// their exact meaning is not established by this listing.
mad24.lo.u32.u16.u16.u32 $r3, -$r0.hi, $r1.hi, -$r3

mov.b32 $r0, c14[0x0008]

// Single 64-bit store of the result, starting at $r2.
mov.end.b64 g[$r0], $r2

#.constseg 1:0x0000 const

#{

#d.32 0x00010000 // 0000

#}

}

// Disassembling MULT16X16WIDE (3)

// decuda listing for MULT16X16WIDE.
// The multiply here is a single mul24 on two 16-bit halves, using 3 registers.
// NOTE(review): c14[0x0000], c14[0x0004] and c14[0x0008] are presumably the
// addresses of ga, gb and gu — confirm against the cubin.
.entry MULT16X16WIDE

{

.lmem 0

.smem 0

.reg 3

.bar 0

mov.b32 $r0, c14[0x0000]

mov.b32 $r2, c14[0x0004]

// Two 16-bit loads from global memory.
mov.u16 $r1, g[$r0]

mov.u16 $r0, g[$r2]

mov.b32 $r2, c14[0x0008]

// One 16x16 multiply with a 32-bit destination, then a 32-bit store.
mul24.lo.u32.u16.u16 $r0, $r1.lo, $r0.lo

mov.end.u32 g[$r2], $r0

}

[/codebox]

In C, if I want a 32*32 -> 64-bit product, it is done as 32*32 -> 64*64 -> lo = 64 bits:

[codebox]unsigned __int64 x1;

x1 = int32_data1;

x1 = x1*int32_data2;

------>

mul.lo.u64

[/codebox]

As the DECUDA output shows, the 64*64 multiply (mul.lo.u64) needs more instructions than mul.wide.u32.

How can I implement mul.wide.u32 in C code?

Is there any C code that can be compiled to mul.wide.u32?