Multiplication methods

Is there any information about the multiplication algorithm used for functions such as __umul64hi?
Thank you

__umul64hi() is emulated; you can look at the generated code (which may differ based on GPU platform) by disassembling the machine code with cuobjdump --dump-sass. All variants use the grade-school algorithm.
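To make the grade-school scheme concrete, here is a minimal sketch in plain CUDA C of how the upper 64 bits can be computed from four 32x32-bit partial products. This is just an illustration, not the actual emulation sequence the compiler emits, and the function name my_umul64hi_ref is made up for this example:

__device__ unsigned long long int my_umul64hi_ref (unsigned long long int a,
                                                   unsigned long long int b)
{
    unsigned int alo = (unsigned int)a;
    unsigned int ahi = (unsigned int)(a >> 32);
    unsigned int blo = (unsigned int)b;
    unsigned int bhi = (unsigned int)(b >> 32);
    /* four 32x32->64 bit partial products of the grade-school scheme */
    unsigned long long int p0 = (unsigned long long int)alo * blo;  /* weight 2^0  */
    unsigned long long int p1 = (unsigned long long int)alo * bhi;  /* weight 2^32 */
    unsigned long long int p2 = (unsigned long long int)ahi * blo;  /* weight 2^32 */
    unsigned long long int p3 = (unsigned long long int)ahi * bhi;  /* weight 2^64 */
    /* sum the middle column to find the carry into the upper 64 bits */
    unsigned long long int mid = (p0 >> 32) + (unsigned int)p1 + (unsigned int)p2;
    return p3 + (p1 >> 32) + (p2 >> 32) + (mid >> 32);
}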

Thanks,

What do you mean by

which may differ based on GPU platform
Because if I disassemble the machine code, it should not change when I move it onto another machine (the way managed code could)? Or do you mean that it could depend on the toolkit or build parameters?

In which NVIDIA document could I find a description of the microcode operations (like IMAD)? I tried to search, but I only found examples and presentations. And how would I write code using them?

The major GPU architectures are not binary compatible, that is, they use different machine code. Therefore the machine code you see generated for __umul64hi() can differ between architectures. In general, emulation sequences for emulated instructions can differ between CUDA releases on the same architecture. I am not aware of any public description of the machine code for each architecture, but most of the operations are fairly self-explanatory.

What use case requires a detailed knowledge of the implementation details of __umul64hi()?

If you want to write your own 64-bit multiplication routines, you would use PTX, which is well defined in the documentation that ships with CUDA. The code below shows one way of performing 64-bit integer multiplies using 32-bit operations at the PTX level, for sm_2x and sm_3x.

/* multiply two unsigned 64-bit integers and return the full 128-bit product;
   res.x holds the low 64 bits, res.y holds the high 64 bits */
__device__ __forceinline__ 
ulonglong2 umul64wide (unsigned long long int a, 
                       unsigned long long int b)
{
    ulonglong2 res;
#if __CUDA_ARCH__ >= 200
    asm ("{\n\t"
         ".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;\n\t"
         "mov.b64         {alo,ahi}, %2;   \n\t"
         "mov.b64         {blo,bhi}, %3;   \n\t"
         "mul.lo.u32      r0, alo, blo;    \n\t"
         "mul.hi.u32      r1, alo, blo;    \n\t"
         "mad.lo.cc.u32   r1, alo, bhi, r1;\n\t"
         "madc.hi.u32     r2, alo, bhi,  0;\n\t"
         "mad.lo.cc.u32   r1, ahi, blo, r1;\n\t"
         "madc.hi.cc.u32  r2, ahi, blo, r2;\n\t"
         "madc.hi.u32     r3, ahi, bhi,  0;\n\t"
         "mad.lo.cc.u32   r2, ahi, bhi, r2;\n\t"
         "addc.u32        r3, r3,  0;      \n\t"
         "mov.b64         %0, {r0,r1};     \n\t"  
         "mov.b64         %1, {r2,r3};     \n\t"
         "}"
         : "=l"(res.x), "=l"(res.y)
         : "l"(a), "l"(b));
#else  /* __CUDA_ARCH__ >= 200 */
    res.y = __umul64hi (a, b);
    res.x = a * b;
#endif /* __CUDA_ARCH__ >= 200 */
    return res;
}

__device__ __forceinline__  
unsigned long long int my_umul64hi (unsigned long long int a, 
                                    unsigned long long int b)
{
    ulonglong2 t;
    t = umul64wide (a, b);
    return t.y;
}

__device__ __forceinline__  
unsigned long long int my_umul64lo (unsigned long long int a, 
                                    unsigned long long int b)
{
    ulonglong2 t;
    t = umul64wide (a, b);
    return t.x;
}
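For completeness, here is a minimal usage sketch; the kernel name and launch configuration are illustrative only:

__global__ void mulhi_kernel (const unsigned long long int *a,
                              const unsigned long long int *b,
                              unsigned long long int *hi,
                              int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        hi[i] = my_umul64hi (a[i], b[i]);  /* upper 64 bits of a[i] * b[i] */
    }
}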

Thank you for the information. For now, this is a sufficient answer for me.

Here is an improved version of umul64wide() that adds optimal (I think/hope) code for older sm_1x GPUs.

/* as above, but with a separate code path for sm_1x that uses plain mul plus
   add-with-carry instead of the mad-with-carry forms */
__device__ __forceinline__ 
ulonglong2 umul64wide (unsigned long long int a, 
                       unsigned long long int b)
{
    ulonglong2 res;
    asm ("{\n\t"
#if __CUDA_ARCH__ >= 200
         ".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;\n\t"
         "mov.b64         {alo,ahi}, %2;   \n\t"
         "mov.b64         {blo,bhi}, %3;   \n\t"
         "mul.lo.u32      r0, alo, blo;    \n\t"
         "mul.hi.u32      r1, alo, blo;    \n\t"
         "mad.lo.cc.u32   r1, alo, bhi, r1;\n\t"
         "madc.hi.u32     r2, alo, bhi,  0;\n\t"
         "mad.lo.cc.u32   r1, ahi, blo, r1;\n\t"
         "madc.hi.cc.u32  r2, ahi, blo, r2;\n\t"
         "madc.hi.u32     r3, ahi, bhi,  0;\n\t"
         "mad.lo.cc.u32   r2, ahi, bhi, r2;\n\t"
         "addc.u32        r3, r3,  0;      \n\t"
         "mov.b64         %0, {r0,r1};     \n\t"  
         "mov.b64         %1, {r2,r3};     \n\t"
#else  /* __CUDA_ARCH__ >= 200 */
         ".reg .u32 r0, r1, r2, r3, r4, alo, ahi, blo, bhi;\n\t"
         "mov.b64         {alo,ahi}, %2;\n\t"
         "mov.b64         {blo,bhi}, %3;\n\t"
         "mul.lo.u32      r0, alo, blo; \n\t"
         "mul.hi.u32      r1, alo, blo; \n\t"
         "mul.lo.u32      r2, ahi, bhi; \n\t"
         "mul.hi.u32      r3, ahi, bhi; \n\t"
         "mul.lo.u32      r4, alo, bhi; \n\t"
         "add.cc.u32      r1,  r1,  r4; \n\t"
         "mul.hi.u32      r4, alo, bhi; \n\t"
         "addc.cc.u32     r2,  r2,  r4; \n\t"
         "addc.u32        r3,  r3,   0; \n\t"
         "mul.lo.u32      r4, ahi, blo; \n\t"
         "add.cc.u32      r1,  r1,  r4; \n\t"
         "mul.hi.u32      r4, ahi, blo; \n\t"
         "addc.cc.u32     r2,  r2,  r4; \n\t"
         "addc.u32        r3,  r3,   0; \n\t"
         "mov.b64         %0, {r0,r1};  \n\t"  
         "mov.b64         %1, {r2,r3};  \n\t"
#endif /* __CUDA_ARCH__ >= 200 */
         "}"
         : "=l"(res.x), "=l"(res.y)
         : "l"(a), "l"(b));
    return res;
}
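If you want to convince yourself that the inline assembly is correct, a simple self-test sketch (the kernel name and test harness are made up here) could compare both halves of the wide product against ordinary 64-bit multiplication and the __umul64hi() intrinsic:

__global__ void umul64wide_selftest (const unsigned long long int *a,
                                     const unsigned long long int *b,
                                     int *mismatches,
                                     int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        ulonglong2 r = umul64wide (a[i], b[i]);
        /* low half must match plain 64-bit multiplication,
           high half must match the __umul64hi() intrinsic */
        if ((r.x != a[i] * b[i]) || (r.y != __umul64hi (a[i], b[i]))) {
            atomicAdd (mismatches, 1);
        }
    }
}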