Is there any information about the multiplication algorithm used by functions such as __umul64hi()?
Thank you
__umul64hi() is emulated; you can look at the generated code (which may differ based on GPU platform) by disassembling the machine code with cuobjdump --dump-sass. All variants use the grade-school algorithm.
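To illustrate what the grade-school scheme does: each 64-bit operand is split into 32-bit halves, the four 32x32-bit partial products are formed, and the columns are summed with carry propagation. Below is a minimal sketch in plain CUDA C. The function name is my own, and the sketch is purely illustrative (it uses 64-bit arithmetic to show the decomposition, which would be circular in a real emulation); the actual emulation sequence does the same thing with 32-bit carry instructions, as shown later in this thread.

__device__ unsigned long long int grade_school_umul64hi (unsigned long long int a,
                                                         unsigned long long int b)
{
    unsigned int alo = (unsigned int)a;          // low  32 bits of a
    unsigned int ahi = (unsigned int)(a >> 32);  // high 32 bits of a
    unsigned int blo = (unsigned int)b;
    unsigned int bhi = (unsigned int)(b >> 32);
    // the four partial products of the grade-school scheme
    unsigned long long int p0 = (unsigned long long int)alo * blo;
    unsigned long long int p1 = (unsigned long long int)alo * bhi;
    unsigned long long int p2 = (unsigned long long int)ahi * blo;
    unsigned long long int p3 = (unsigned long long int)ahi * bhi;
    // sum the middle columns; neither addition can wrap around 64 bits
    unsigned long long int mid  = p1 + (p0 >> 32);
    unsigned long long int mid2 = (mid & 0xffffffffULL) + p2;
    // high 64 bits of the 128-bit product a * b
    return p3 + (mid >> 32) + (mid2 >> 32);
}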
Thanks. What do you mean by "which may differ based on GPU platform"? If I disassemble the machine code, it should not change when I move it to another machine (the way managed code could). Or do you mean it could depend on the toolkit or build parameters?
Also, in which NVIDIA document could I find a description of the machine-code operations (like IMAD) and how to write code using them? I tried to search, but found only examples and presentations.
The major GPU architectures are not binary compatible, that is, they use different machine code. Therefore the machine code you see generated for __umul64hi() can differ between architectures. In general, emulation sequences for emulated instructions can differ between CUDA releases on the same architecture. I am not aware of any public description of the machine code for each architecture, but most of the operations are fairly self-explanatory.
What use case requires a detailed knowledge of the implementation details of __umul64hi()?
If you want to write your own 64-bit multiplication routines, you would use PTX, which is well defined in the documentation that ships with CUDA. The code below shows one way of performing 64-bit integer multiplies using 32-bit operations at the PTX level, for sm_2x and sm_3x.
__device__ __forceinline__
ulonglong2 umul64wide (unsigned long long int a,
                       unsigned long long int b)
{
    ulonglong2 res;
#if __CUDA_ARCH__ >= 200
    asm ("{\n\t"
         ".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;\n\t"
         "mov.b64         {alo,ahi}, %2;   \n\t"   // split a into 32-bit halves
         "mov.b64         {blo,bhi}, %3;   \n\t"   // split b into 32-bit halves
         "mul.lo.u32      r0, alo, blo;    \n\t"   // accumulate the four partial
         "mul.hi.u32      r1, alo, blo;    \n\t"   // products column by column,
         "mad.lo.cc.u32   r1, alo, bhi, r1;\n\t"   // propagating carries
         "madc.hi.u32     r2, alo, bhi, 0; \n\t"
         "mad.lo.cc.u32   r1, ahi, blo, r1;\n\t"
         "madc.hi.cc.u32  r2, ahi, blo, r2;\n\t"
         "madc.hi.u32     r3, ahi, bhi, 0; \n\t"
         "mad.lo.cc.u32   r2, ahi, bhi, r2;\n\t"
         "addc.u32        r3, r3, 0;       \n\t"
         "mov.b64         %0, {r0,r1};     \n\t"   // low  64 bits of product
         "mov.b64         %1, {r2,r3};     \n\t"   // high 64 bits of product
         "}"
         : "=l"(res.x), "=l"(res.y)
         : "l"(a), "l"(b));
#else /* __CUDA_ARCH__ >= 200 */
    res.y = __umul64hi (a, b);
    res.x = a * b;
#endif /* __CUDA_ARCH__ >= 200 */
    return res;
}
__device__ __forceinline__
unsigned long long int my_umul64hi (unsigned long long int a,
                                    unsigned long long int b)
{
    ulonglong2 t = umul64wide (a, b);
    return t.y;   // high 64 bits of the full 128-bit product
}

__device__ __forceinline__
unsigned long long int my_umul64lo (unsigned long long int a,
                                    unsigned long long int b)
{
    ulonglong2 t = umul64wide (a, b);
    return t.x;   // low 64 bits of the full 128-bit product
}
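Not part of the original post, but a quick hypothetical smoke test for the wrappers above could compare them against the built-in intrinsic and the native low 64-bit product (the kernel name and launch shape are my own choices):

__global__ void check_umul64 (const unsigned long long int *a,
                              const unsigned long long int *b,
                              int *mismatches, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // both halves must agree with the built-in paths
        if ((my_umul64hi (a[i], b[i]) != __umul64hi (a[i], b[i])) ||
            (my_umul64lo (a[i], b[i]) != a[i] * b[i])) {
            atomicAdd (mismatches, 1);
        }
    }
}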
Thank you for the information. That is a sufficient answer for me.
Here is an improved version of umul64wide() that adds optimal (I think/hope) code for older sm_1x GPUs.
__device__ __forceinline__
ulonglong2 umul64wide (unsigned long long int a,
                       unsigned long long int b)
{
    ulonglong2 res;
    asm ("{\n\t"
#if __CUDA_ARCH__ >= 200
         ".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;\n\t"
         "mov.b64         {alo,ahi}, %2;   \n\t"
         "mov.b64         {blo,bhi}, %3;   \n\t"
         "mul.lo.u32      r0, alo, blo;    \n\t"
         "mul.hi.u32      r1, alo, blo;    \n\t"
         "mad.lo.cc.u32   r1, alo, bhi, r1;\n\t"
         "madc.hi.u32     r2, alo, bhi, 0; \n\t"
         "mad.lo.cc.u32   r1, ahi, blo, r1;\n\t"
         "madc.hi.cc.u32  r2, ahi, blo, r2;\n\t"
         "madc.hi.u32     r3, ahi, bhi, 0; \n\t"
         "mad.lo.cc.u32   r2, ahi, bhi, r2;\n\t"
         "addc.u32        r3, r3, 0;       \n\t"
         "mov.b64         %0, {r0,r1};     \n\t"
         "mov.b64         %1, {r2,r3};     \n\t"
#else /* __CUDA_ARCH__ >= 200 */
         ".reg .u32 r0, r1, r2, r3, r4, alo, ahi, blo, bhi;\n\t"
         "mov.b64         {alo,ahi}, %2;   \n\t"
         "mov.b64         {blo,bhi}, %3;   \n\t"
         "mul.lo.u32      r0, alo, blo;    \n\t"   // no madc on sm_1x, so use
         "mul.hi.u32      r1, alo, blo;    \n\t"   // separate mul and add.cc,
         "mul.lo.u32      r2, ahi, bhi;    \n\t"   // with r4 as scratch for the
         "mul.hi.u32      r3, ahi, bhi;    \n\t"   // middle partial products
         "mul.lo.u32      r4, alo, bhi;    \n\t"
         "add.cc.u32      r1, r1, r4;      \n\t"
         "mul.hi.u32      r4, alo, bhi;    \n\t"
         "addc.cc.u32     r2, r2, r4;      \n\t"
         "addc.u32        r3, r3, 0;       \n\t"
         "mul.lo.u32      r4, ahi, blo;    \n\t"
         "add.cc.u32      r1, r1, r4;      \n\t"
         "mul.hi.u32      r4, ahi, blo;    \n\t"
         "addc.cc.u32     r2, r2, r4;      \n\t"
         "addc.u32        r3, r3, 0;       \n\t"
         "mov.b64         %0, {r0,r1};     \n\t"
         "mov.b64         %1, {r2,r3};     \n\t"
#endif /* __CUDA_ARCH__ >= 200 */
         "}"
         : "=l"(res.x), "=l"(res.y)
         : "l"(a), "l"(b));
    return res;
}
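For what it's worth, the reason for the separate sm_1x path appears to be that the extended-precision multiply-add forms (mad.cc, madc) require sm_20 or higher according to the PTX ISA documentation, while plain add.cc/addc are also available on older targets; the sm_1x branch therefore builds the same column sums from separate mul.lo.u32/mul.hi.u32 instructions and carry-propagating adds, with r4 serving as a scratch register for the middle partial products.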