Here’s what I found. As you said, there doesn’t seem to be any hardware support for 32×32-bit integer multiplication, or if there is, PTXAS doesn’t know about it.
Input code fed into PTXAS:
[codebox].version 1.2
.target sm_13
// Global variables shared by the three test entries below.
.global .u32 ga;   // source of the first multiplication operand
.global .u32 gb;   // source of the second multiplication operand
.global .u64 gu;   // destination for the product (written as 32 or 64 bits, depending on the entry)
// MULT16X16WIDE: multiply two unsigned 16-bit values into a 32-bit product
// with a single mul.wide.u16, reading the operands from ga/gb and writing
// the result to gu.
.entry MULT16X16WIDE
{
.reg .u16 a;
.reg .u16 b;
.reg .u32 u;
// Only 16 bits are read from each of ga and gb.
ld.global.u16 a, [ga];
ld.global.u16 b, [gb];
// Widening multiply: 16-bit operands, 32-bit destination.
mul.wide.u16 u,a,b;
// Only 32 bits of gu are written here.
st.global.u32 [gu], u;
}
// MULT32X32WIDE: multiply two unsigned 32-bit values into a 64-bit product
// with a single mul.wide.u32, reading the operands from ga/gb and writing
// the full 64-bit result to gu.
.entry MULT32X32WIDE
{
.reg .u32 a;
.reg .u32 b;
.reg .u64 u;
ld.global.u32 a, [ga];
ld.global.u32 b, [gb];
// Widening multiply: 32-bit operands, 64-bit destination.
mul.wide.u32 u,a,b;
// One 64-bit store of the whole product.
st.global.u64 [gu], u;
}
// MULT32X32HILO: multiply two unsigned 32-bit values using separate
// mul.lo.u32 / mul.hi.u32 instructions instead of one mul.wide.u32, and
// store the two 32-bit halves of the product to gu and gu+4.
.entry MULT32X32HILO
{
.reg .u32 a;
.reg .u32 b;
.reg .u32 u;
.reg .u32 v;
ld.global.u32 a, [ga];
ld.global.u32 b, [gb];
// u = low 32 bits of a*b, v = high 32 bits of a*b.
mul.lo.u32 u,a,b;
mul.hi.u32 v,a,b;
// Low half goes to the first four bytes of gu, high half to the next four.
st.global.u32 [gu], u;
st.global.u32 [gu+4],v;
}
[/codebox]
Here is what DECUDA produced from the cubin file that PTXAS generated from that:
[codebox]// Disassembling MULT16X16WIDE (0)
000000: 10000001 2780c780 mov.b32 $r0, c14[0x0000]
000008: 10000209 2780c780 mov.b32 $r2, c14[0x0004]
000010: d00e0005 80400780 mov.u16 $r1, g[$r0]
000018: d00e0401 80400780 mov.u16 $r0, g[$r2]
000020: 10000409 2780c780 mov.b32 $r2, c14[0x0008]
000028: 40000401 00000780 mul24.lo.u32.u16.u16 $r0, $r1.lo, $r0.lo
000030: d00e0401 a0c00781 mov.end.u32 g[$r2], $r0
// Disassembling MULT32X32WIDE (1)
000000: 10000001 2780c780 mov.b32 $r0, c14[0x0000]
000008: 10000205 2780c780 mov.b32 $r1, c14[0x0004]
000010: d00e0001 80c00780 mov.u32 $r0, g[$r0]
000018: d00e0205 80c00780 mov.u32 $r1, g[$r1]
000020: 40030009 00000780 mul24.lo.u32.u16.u16 $r2, $r0.lo, $r1.hi
000028: 6002020d 000087c0 mad24.lo.u32.u16.u16.u32 $p0|$r3, $r0.hi, $r1.lo, $r2
000030: 30100611 c4100780 shl.u32 $r4, $r3, 0x00000010
000038: 60020209 00008780 mad24.lo.u32.u16.u16.u32 $r2, $r0.hi, $r1.lo, $r2
000040: 3010060d e4100780 shr.u32 $r3, $r3, 0x00000010
000048: 600201fd 000107d8 mad24.lo.u32.u16.u16.u32 $p1|$o127, $r0.lo, $r1.lo, $r4
000050: 30100409 c4100780 shl.u32 $r2, $r2, 0x00000010
000058: 2100060d 04400880 @$p0.cf add.u32 $r3, $r3, c1[0x0000]
000060: 60020009 00008780 mad24.lo.u32.u16.u16.u32 $r2, $r0.lo, $r1.lo, $r2
000068: 6003020d 0c00d780 mad24.lo.u32.u16.u16.u32 $r3, -$r0.hi, $r1.hi, -$r3
000070: 10000401 2780c780 mov.b32 $r0, c14[0x0008]
000078: d00e0009 a0800781 mov.end.b64 g[$r0], $r2
// segment: const (1:0000)
0000: 00010000
// Disassembling MULT32X32HILO (2)
000000: 10000001 2780c780 mov.b32 $r0, c14[0x0000]
000008: 10000205 2780c780 mov.b32 $r1, c14[0x0004]
000010: d00e0001 80c00780 mov.u32 $r0, g[$r0]
000018: d00e0205 80c00780 mov.u32 $r1, g[$r1]
000020: 40030009 00000780 mul24.lo.u32.u16.u16 $r2, $r0.lo, $r1.hi
000028: 6002020d 00008780 mad24.lo.u32.u16.u16.u32 $r3, $r0.hi, $r1.lo, $r2
000030: 60020209 000087c0 mad24.lo.u32.u16.u16.u32 $p0|$r2, $r0.hi, $r1.lo, $r2
000038: 3010060d c4100780 shl.u32 $r3, $r3, 0x00000010
000040: 30100415 c4100780 shl.u32 $r5, $r2, 0x00000010
000048: 30100409 e4100780 shr.u32 $r2, $r2, 0x00000010
000050: 60020011 0000c780 mad24.lo.u32.u16.u16.u32 $r4, $r0.lo, $r1.lo, $r3
000058: 600201fd 000147d8 mad24.lo.u32.u16.u16.u32 $p1|$o127, $r0.lo, $r1.lo, $r5
000060: 1000040d 2780c780 mov.b32 $r3, c14[0x0008]
000068: 21000409 04400880 @$p0.cf add.u32 $r2, $r2, c1[0x0000]
000070: 10048015 00000003 mov.b32 $r5, 0x00000004
000078: d00e0611 a0c00780 mov.u32 g[$r3], $r4
000080: 60030205 0c009780 mad24.lo.u32.u16.u16.u32 $r1, -$r0.hi, $r1.hi, -$r2
000088: 21000a01 07808780 add.u32 $r0, $r5, c2[0x0008]// (unk1 03000000)
000090: d00e0005 a0c00781 mov.end.u32 g[$r0], $r1
// segment: const (1:0000)
0000: 00010000
[/codebox]
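So it seems clear that any attempt to multiply two 32-bit numbers actually expands into several smaller multiplications: a sequence of mul24/mad24 instructions operating on the 16-bit halves of each operand, plus shifts and a carry fix-up. Maybe I can hand-code something a little more efficient than this…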