Multiplication methods

Is there any information about the multiplication algorithm used for functions such as __umul64hi?
Thank you

__umul64hi() is emulated; you can look at the generated code (which may differ based on GPU platform) by disassembling the machine code with cuobjdump --dump-sass. All variants use the grade-school algorithm.
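To make the grade-school scheme concrete, here is a minimal sketch in plain CUDA C of how the upper 64 bits can be computed from four 32x32-bit partial products. This is just an illustration, not the actual emulation sequence the compiler emits, and the function name my_umul64hi_ref is made up for this example:

__device__ unsigned long long int my_umul64hi_ref (unsigned long long int a,
                                                   unsigned long long int b)
{
    unsigned int alo = (unsigned int)a;
    unsigned int ahi = (unsigned int)(a >> 32);
    unsigned int blo = (unsigned int)b;
    unsigned int bhi = (unsigned int)(b >> 32);
    /* four 32x32->64 bit partial products of the grade-school scheme */
    unsigned long long int p0 = (unsigned long long int)alo * blo;  /* weight 2^0  */
    unsigned long long int p1 = (unsigned long long int)alo * bhi;  /* weight 2^32 */
    unsigned long long int p2 = (unsigned long long int)ahi * blo;  /* weight 2^32 */
    unsigned long long int p3 = (unsigned long long int)ahi * bhi;  /* weight 2^64 */
    /* sum the middle column to find the carry into the upper 64 bits */
    unsigned long long int mid = (p0 >> 32) + (unsigned int)p1 + (unsigned int)p2;
    return p3 + (p1 >> 32) + (p2 >> 32) + (mid >> 32);
}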

Thanks,

What do you mean by

which may differ based on GPU platform
Because if I disassemble the machine code, it should not change when I move it onto another machine (the way managed code could)? Or do you mean that it could depend on the toolkit or build parameters?

In which NVIDIA document could I find a description of the microcode operations (like IMAD)? I tried to search, but I only found examples and presentations. And how would I write code using them?

The major GPU architectures are not binary compatible, that is, they use different machine code. Therefore the machine code you see generated for __umul64hi() can differ between architectures. In general, emulation sequences for emulated instructions can differ between CUDA releases on the same architecture. I am not aware of any public description of the machine code for each architecture, but most of the operations are fairly self-explanatory.

What use case requires a detailed knowledge of the implementation details of __umul64hi()?

If you want to write your own 64-bit multiplication routines, you would use PTX, which is well defined in the documentation that ships with CUDA. The code below shows one way of performing 64-bit integer multiplies using 32-bit operations at the PTX level, for sm_2x and sm_3x.

/* multiply two unsigned 64-bit integers and return the full 128-bit product;
   res.x holds the low 64 bits, res.y holds the high 64 bits */
__device__ __forceinline__ 
ulonglong2 umul64wide (unsigned long long int a, 
                       unsigned long long int b)
{
    ulonglong2 res;
#if __CUDA_ARCH__ >= 200
    asm ("{\n\t"
         ".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;\n\t"
         "mov.b64         {alo,ahi}, %2;   \n\t"
         "mov.b64         {blo,bhi}, %3;   \n\t"
         "mul.lo.u32      r0, alo, blo;    \n\t"
         "mul.hi.u32      r1, alo, blo;    \n\t"
         "mad.lo.cc.u32   r1, alo, bhi, r1;\n\t"
         "madc.hi.u32     r2, alo, bhi,  0;\n\t"
         "mad.lo.cc.u32   r1, ahi, blo, r1;\n\t"
         "madc.hi.cc.u32  r2, ahi, blo, r2;\n\t"
         "madc.hi.u32     r3, ahi, bhi,  0;\n\t"
         "mad.lo.cc.u32   r2, ahi, bhi, r2;\n\t"
         "addc.u32        r3, r3,  0;      \n\t"
         "mov.b64         %0, {r0,r1};     \n\t"  
         "mov.b64         %1, {r2,r3};     \n\t"
         "}"
         : "=l"(res.x), "=l"(res.y)
         : "l"(a), "l"(b));
#else  /* __CUDA_ARCH__ >= 200 */
    res.y = __umul64hi (a, b);
    res.x = a * b;
#endif /* __CUDA_ARCH__ >= 200 */
    return res;
}

__device__ __forceinline__  
unsigned long long int my_umul64hi (unsigned long long int a, 
                                    unsigned long long int b)
{
    ulonglong2 t;
    t = umul64wide (a, b);
    return t.y;
}

__device__ __forceinline__  
unsigned long long int my_umul64lo (unsigned long long int a, 
                                    unsigned long long int b)
{
    ulonglong2 t;
    t = umul64wide (a, b);
    return t.x;
}
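For completeness, here is a minimal usage sketch; the kernel name and launch configuration are illustrative only:

__global__ void mulhi_kernel (const unsigned long long int *a,
                              const unsigned long long int *b,
                              unsigned long long int *hi,
                              int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        hi[i] = my_umul64hi (a[i], b[i]);  /* upper 64 bits of a[i] * b[i] */
    }
}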

Thank you for the information. For now, this is a sufficient answer for me.

Here is an improved version of umul64wide() that adds optimal (I think/hope) code for older sm_1x GPUs.

/* as above, but with a separate code path for sm_1x that uses plain mul plus
   add-with-carry instead of the mad-with-carry forms */
__device__ __forceinline__ 
ulonglong2 umul64wide (unsigned long long int a, 
                       unsigned long long int b)
{
    ulonglong2 res;
    asm ("{\n\t"
#if __CUDA_ARCH__ >= 200
         ".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;\n\t"
         "mov.b64         {alo,ahi}, %2;   \n\t"
         "mov.b64         {blo,bhi}, %3;   \n\t"
         "mul.lo.u32      r0, alo, blo;    \n\t"
         "mul.hi.u32      r1, alo, blo;    \n\t"
         "mad.lo.cc.u32   r1, alo, bhi, r1;\n\t"
         "madc.hi.u32     r2, alo, bhi,  0;\n\t"
         "mad.lo.cc.u32   r1, ahi, blo, r1;\n\t"
         "madc.hi.cc.u32  r2, ahi, blo, r2;\n\t"
         "madc.hi.u32     r3, ahi, bhi,  0;\n\t"
         "mad.lo.cc.u32   r2, ahi, bhi, r2;\n\t"
         "addc.u32        r3, r3,  0;      \n\t"
         "mov.b64         %0, {r0,r1};     \n\t"  
         "mov.b64         %1, {r2,r3};     \n\t"
#else  /* __CUDA_ARCH__ >= 200 */
         ".reg .u32 r0, r1, r2, r3, r4, alo, ahi, blo, bhi;\n\t"
         "mov.b64         {alo,ahi}, %2;\n\t"
         "mov.b64         {blo,bhi}, %3;\n\t"
         "mul.lo.u32      r0, alo, blo; \n\t"
         "mul.hi.u32      r1, alo, blo; \n\t"
         "mul.lo.u32      r2, ahi, bhi; \n\t"
         "mul.hi.u32      r3, ahi, bhi; \n\t"
         "mul.lo.u32      r4, alo, bhi; \n\t"
         "add.cc.u32      r1,  r1,  r4; \n\t"
         "mul.hi.u32      r4, alo, bhi; \n\t"
         "addc.cc.u32     r2,  r2,  r4; \n\t"
         "addc.u32        r3,  r3,   0; \n\t"
         "mul.lo.u32      r4, ahi, blo; \n\t"
         "add.cc.u32      r1,  r1,  r4; \n\t"
         "mul.hi.u32      r4, ahi, blo; \n\t"
         "addc.cc.u32     r2,  r2,  r4; \n\t"
         "addc.u32        r3,  r3,   0; \n\t"
         "mov.b64         %0, {r0,r1};  \n\t"  
         "mov.b64         %1, {r2,r3};  \n\t"
#endif /* __CUDA_ARCH__ >= 200 */
         "}"
         : "=l"(res.x), "=l"(res.y)
         : "l"(a), "l"(b));
    return res;
}
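If you want to convince yourself that the inline assembly is correct, a simple self-test sketch (the kernel name and test harness are made up here) could compare both halves of the wide product against ordinary 64-bit multiplication and the __umul64hi() intrinsic:

__global__ void umul64wide_selftest (const unsigned long long int *a,
                                     const unsigned long long int *b,
                                     int *mismatches,
                                     int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        ulonglong2 r = umul64wide (a[i], b[i]);
        /* low half must match plain 64-bit multiplication,
           high half must match the __umul64hi() intrinsic */
        if ((r.x != a[i] * b[i]) || (r.y != __umul64hi (a[i], b[i]))) {
            atomicAdd (mismatches, 1);
        }
    }
}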