how to implement mul.wide.u32 in C code 32-bit multiplication and 64-bit registers

how to implement mul.wide.u32 in C code?

Is there any C code which can be compiled to mul.wide.u32?

Input code fed into PTXAS: // http://forums.nvidia.com/index.php?showtopic=83482

[codebox].version 1.4

.target sm_13

.global .u32 ga;

.global .u32 gb;

.global .u64 gu;

.entry MULT16X16WIDE

{

.reg	.u16	a;

.reg	.u16	b;

.reg	.u32	u;

ld.global.u16 a, [ga];

ld.global.u16 b, [gb];	

mul.wide.u16 u,a,b;

st.global.u32 [gu], u;

}

.entry MULT32X32WIDE

{

.reg	.u32	a;

.reg	.u32	b;

.reg	.u64	u;

ld.global.u32 a, [ga];

ld.global.u32 b, [gb];	

mul.wide.u32 u,a,b;

st.global.u64 [gu], u;

}

.entry MULT32X32HILO

{

.reg	.u32	a;

.reg	.u32	b;

.reg	.u32	u;

.reg	.u32	v;

ld.global.u32 a, [ga];

ld.global.u32 b, [gb];	

mul.lo.u32 u,a,b;

mul.hi.u32 v,a,b;

st.global.u32 [gu], u;

st.global.u32 [gu+4],v;

}

.entry MULT64X64LO

{

.reg	.u64	a;

.reg	.u64	b;

.reg	.u64	u;

ld.global.u64 a, [ga];

ld.global.u64 b, [gb];	

mul.lo.u64 u,a,b;

st.global.u64 [gu], u;

}[/codebox]

Here is what DECUDA produced from the cubin file that PTXAS generated from that:

[codebox]// Disassembling MULT32X32HILO (0)

.entry MULT32X32HILO

{

.lmem 0

.smem 0

.reg 6

.bar 0

mov.b32 $r0, c14[0x0000]

mov.b32 $r1, c14[0x0004]

mov.u32 $r0, g[$r0]

mov.u32 $r1, g[$r1]

mul24.lo.u32.u16.u16 $r2, $r0.lo, $r1.hi

mad24.lo.u32.u16.u16.u32 $r3, $r0.hi, $r1.lo, $r2

mad24.lo.u32.u16.u16.u32 $p0|$r2, $r0.hi, $r1.lo, $r2

shl.u32 $r3, $r3, 0x00000010

shl.u32 $r5, $r2, 0x00000010

shr.u32 $r2, $r2, 0x00000010

mad24.lo.u32.u16.u16.u32 $r4, $r0.lo, $r1.lo, $r3

mad24.lo.u32.u16.u16.u32 $p1|$o127, $r0.lo, $r1.lo, $r5

mov.b32 $r3, c14[0x0008]

@$p0.cf add.u32 $r2, $r2, c1[0x0000]

mov.b32 $r5, 0x00000004

mov.u32 g[$r3], $r4

mad24.lo.u32.u16.u16.u32 $r1, -$r0.hi, $r1.hi, -$r2

add.u32 $r0, $r5, c2[0x0008]// (unk1 03000000)

mov.end.u32 g[$r0], $r1

#.constseg 1:0x0000 const

#{

#d.32 0x00010000 // 0000

#}

}

// Disassembling MULT64X64LO (1)

.entry MULT64X64LO

{

.lmem 0

.smem 0

.reg 10

.bar 0

mov.b32 $r0, c14[0x0000]

mov.b32 $r4, c14[0x0004]

mov.b64 $r2, g[$r0]

mov.b64 $r0, g[$r4]

mul24.half.lo.u32.u16.u16 $r5, $r2.lo, $r0.hi

mul24.half.lo.u32.u16.u16 $r4, $r0.hi, $r3.lo

mad24.lo.u32.u16.u16.u32 $p0|$r9, $r2.hi, $r0.lo, $r5

mad24.lo.u32.u16.u16.u32 $r8, $r0.lo, $r3.hi, $r4

mul24.lo.u32.u16.u16 $r7, $r2.lo, $r1.hi

shl.u32 $r6, $r9, 0x00000010

shr.u32 $r4, $r9, 0x00000010

shl.u32 $r8, $r8, 0x00000010

mad24.lo.u32.u16.u16.u32 $r7, $r2.hi, $r1.lo, $r7

mad24.lo.u32.u16.u16.u32 $p1|$o127, $r2.lo, $r0.lo, $r6

@$p0.cf add.u32 $r4, $r4, c1[0x0000]

mad24.lo.u32.u16.u16.u32 $r3, $r0.lo, $r3.lo, $r8

shl.u32 $r6, $r7, 0x00000010

mad24.lo.u32.u16.u16.u32 $r5, $r2.hi, $r0.lo, $r5

mad24.lo.u32.u16.u16.u32 $r7, -$r2.hi, $r0.hi, -$r4

mad24.lo.u32.u16.u16.u32 $r4, $r2.lo, $r1.lo, $r6

shl.u32 $r1, $r5, 0x00000010

add.u32 $r3, $r3, $r7

mad24.lo.u32.u16.u16.u32 $r0, $r2.lo, $r0.lo, $r1

add.u32 $r1, $r4, $r3

mov.b32 $r2, c14[0x0008]

mov.end.b64 g[$r2], $r0

#.constseg 1:0x0000 const

#{

#d.32 0x00010000 // 0000

#}

}

// Disassembling MULT32X32WIDE (2)

.entry MULT32X32WIDE

{

.lmem 0

.smem 0

.reg 5

.bar 0

mov.b32 $r0, c14[0x0000]

mov.b32 $r1, c14[0x0004]

mov.u32 $r0, g[$r0]

mov.u32 $r1, g[$r1]

mul24.lo.u32.u16.u16 $r2, $r0.lo, $r1.hi

mad24.lo.u32.u16.u16.u32 $p0|$r3, $r0.hi, $r1.lo, $r2

shl.u32 $r4, $r3, 0x00000010

mad24.lo.u32.u16.u16.u32 $r2, $r0.hi, $r1.lo, $r2

shr.u32 $r3, $r3, 0x00000010

mad24.lo.u32.u16.u16.u32 $p1|$o127, $r0.lo, $r1.lo, $r4

shl.u32 $r2, $r2, 0x00000010

@$p0.cf add.u32 $r3, $r3, c1[0x0000]

mad24.lo.u32.u16.u16.u32 $r2, $r0.lo, $r1.lo, $r2

mad24.lo.u32.u16.u16.u32 $r3, -$r0.hi, $r1.hi, -$r3

mov.b32 $r0, c14[0x0008]

mov.end.b64 g[$r0], $r2

#.constseg 1:0x0000 const

#{

#d.32 0x00010000 // 0000

#}

}

// Disassembling MULT16X16WIDE (3)

.entry MULT16X16WIDE

{

.lmem 0

.smem 0

.reg 3

.bar 0

mov.b32 $r0, c14[0x0000]

mov.b32 $r2, c14[0x0004]

mov.u16 $r1, g[$r0]

mov.u16 $r0, g[$r2]

mov.b32 $r2, c14[0x0008]

mul24.lo.u32.u16.u16 $r0, $r1.lo, $r0.lo

mov.end.u32 g[$r2], $r0

}

[/codebox]

in C, if I want to get a 32*32->64-bit multiply, it will be expressed as (32->64)*(32->64), i.e. a 64*64 multiply whose low 64 bits are kept:

[codebox]unsigned __int64 x1;

x1 = int32_data1;

x1 = x1*int32_data2;

------>

mul.lo.u64

[/codebox]

As the DECUDA output shows us, the 64*64 multiply (mul.lo.u64) needs more instructions than mul.wide.u32.

how to implement mul.wide.u32 in C code?

Is there any C code which can be compiled to mul.wide.u32?

This is a great question. And related to another question, how can we use C to specify adds with carry code, like the PTX instruction addc.cc.b32?
It’s a simple PTX instruction but there’s no easy way to tell the compiler to use it.

Both the wide mults and carry code adds are pretty fundamental to doing extended multiprecision integer math like for number theory or cryptography.

It’s certainly possible without these opcodes but that’s giving up at least half of the efficiency that the hardware (and even PTX) is able to provide.

An interesting question is that if we won’t have inline PTX calls, how would we expect such PTX opcode access to work?

It may be that if there’s just a handful of operations, they could be given __opcode() C stubs and it’d all just work.

This was done for __any() and even for __mul24() for that matter.

If this were done for all PTX opcodes, it’d make a crazy explosion of opcode functions (think of SSE coding!!) but it’s not an unreasonable solution for these primitive integer multiply and add routines.

Probably there would be several types of each with argument overloads to take care of signed versus unsigned and such.

So you might have some intrinsics with prototypes like

int __addc(int a, int b);

int __subc(int a, int b)

long long __mulWide(int a, int b);

Simon, I think you’ve worked on some of the new intrinsics like __popc and such. Without any obligation or official promises, could you say whether such intrinsic wrappers would be straightforward?

One issue which would come up would be instruction ordering, which matters for the (hidden) carry flag. __addc(a+b, c+d); would have undefined behavior. The solution would likely be to tell the user "don't do that, you fool."

Or you could make everything explicit, by defining __addc as something like:

int __addc(int a, int b, bool cin, bool & cout);

So you can use it like:

bool carry;

rlo = __addc(alo, blo, 0, carry);

rhi = __addc(ahi, bhi, carry, carry);

From the compiler view of the hardware, the carry flag is just another register…

And the Tesla instruction set also lets you read and write the carry flag directly, so this feature becomes exposed as a side effect too.

Optimizing away the zero carry-in and unused carry-out should not be a problem either.

(But if you’re going to post in the Feature Requests thread, just ask “I want to be able to do high-precision integer arithmetic efficiently”, otherwise Tim will yell at you ;) )

Actually, it’s not another register. It can’t be set, cleared, or tested… the PTX reference guide explains this in the addc opcode description.

You’re right that this should be framed as “more efficient multiprecision math” in the wishlist thread. :-)