Modular exponentiation & BigInteger

I need to program the Modular exponentiation with Big Integers on CUDA. Will someone please help me? Is there a BigInteger library for CUDA?

Have a look at

[url]128 bit integer on cuda? - Stack Overflow

for the 128-bit case and to

[url]c - large integer addition with CUDA - Stack Overflow

There is also a big integer package of thrust, but it seems still under development, see

[url]Google Code Archive - Long-term storage for Google Code Project Hosting.

I have problem - read BigInt numbers from file to Cuda Kernel.

256-bit add:

__device__ __forceinline__ ulonglong4 add256 (ulonglong4 a, ulonglong4 b)
    ulonglong4 res;
    asm ("{\n\t"
         ".reg .u32 o0, o1, o2, o3, o4; \n\t"
         ".reg .u32 o5, o6, o7, i8, i9; \n\t"
         ".reg .u32 i10, i11, i12, i13; \n\t"
         ".reg .u32 i14, i15, i16, i17; \n\t"
         ".reg .u32 i18, i19, i20, i21; \n\t"
         ".reg .u32 i22, i23;           \n\t"
         "mov.b64         { i8, i9}, %4;\n\t"
         "mov.b64         {i10,i11}, %5;\n\t"
         "mov.b64         {i12,i13}, %6;\n\t"
         "mov.b64         {i14,i15}, %7;\n\t"
         "mov.b64         {i16,i17}, %8;\n\t"
         "mov.b64         {i18,i19}, %9;\n\t"
         "mov.b64         {i20,i21},%10;\n\t"
         "mov.b64         {i22,i23},%11;\n\t"
         "      o0,  i8, i16; \n\t"
         "     o1,  i9, i17; \n\t"
         "     o2, i10, i18; \n\t"
         "     o3, i11, i19; \n\t"
         "     o4, i12, i20; \n\t"
         "     o5, i13, i21; \n\t"
         "     o6, i14, i22; \n\t"
         "addc.u32        o7, i15, i23; \n\t"
         "mov.b64         %0, {o0,o1};  \n\t"
         "mov.b64         %1, {o2,o3};  \n\t"
         "mov.b64         %2, {o4,o5};  \n\t"
         "mov.b64         %3, {o6,o7};  \n\t"
         : "=l"(res.x), "=l"(res.y), "=l"(res.z), "=l"(res.w)
         : "l"(a.x), "l"(a.y), "l"(a.z), "l"(a.w),
           "l"(b.x), "l"(b.y), "l"(b.z), "l"(b.w));
    return res;

Forum won’t allow me to post the multiply in one piece, needed to split:

256-bit multiply (part 1):

__device__ __forceinline__ ulonglong4 umul256 (ulonglong4 a, ulonglong4 b)
    ulonglong4 res;
    asm ("{\n\t"
         ".reg .u32 o0, o1, o2, o3, o4;    \n\t"
         ".reg .u32 o5, o6, o7, i8, i9;    \n\t"
         ".reg .u32 i10, i11, i12, i13;    \n\t"
         ".reg .u32 i14, i15, i16, i17;    \n\t"
         ".reg .u32 i18, i19, i20, i21;    \n\t"
         ".reg .u32 i22, i23;              \n\t"
         "mov.b64         { i8, i9}, %4;   \n\t"
         "mov.b64         {i10,i11}, %5;   \n\t"
         "mov.b64         {i12,i13}, %6;   \n\t"
         "mov.b64         {i14,i15}, %7;   \n\t"
         "mov.b64         {i16,i17}, %8;   \n\t"
         "mov.b64         {i18,i19}, %9;   \n\t"
         "mov.b64         {i20,i21},%10;   \n\t"
         "mov.b64         {i22,i23},%11;   \n\t"
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
         "mul.lo.u32      o0,  i8, i16;    \n\t"
         "mul.hi.u32      o1,  i8, i16;    \n\t"
         "   o1,  i8, i17, o1;\n\t"
         "madc.hi.u32     o2,  i8, i17,  0;\n\t"
         "   o1,  i9, i16, o1;\n\t"
         "  o2,  i9, i16, o2;\n\t"
         "madc.hi.u32     o3,  i8, i18,  0;\n\t"
         "   o2,  i8, i18, o2;\n\t"
         "  o3,  i9, i17, o3;\n\t"
         "madc.hi.u32     o4,  i8, i19,  0;\n\t"
         "   o2,  i9, i17, o2;\n\t"
         "  o3, i10, i16, o3;\n\t"
         "  o4,  i9, i18, o4;\n\t"
         "madc.hi.u32     o5,  i8, i20,  0;\n\t"
         "   o2, i10, i16, o2;\n\t"

Forum didn’t accept part 2 of the code. Let me try again.

256-bit multiply (part 2)

"  o3,  i8, i19, o3;\n\t"
         "  o4, i10, i17, o4;\n\t"
         "  o5,  i9, i19, o5;\n\t"
         "madc.hi.u32     o6,  i8, i21,  0;\n\t"
         "   o3,  i9, i18, o3;\n\t"
         "  o4, i11, i16, o4;\n\t"
         "  o5, i10, i18, o5;\n\t"
         "  o6,  i9, i20, o6;\n\t"
         "madc.hi.u32     o7,  i8, i22,  0;\n\t"
         "   o3, i10, i17, o3;\n\t"
         "  o4,  i8, i20, o4;\n\t"
         "  o5, i11, i17, o5;\n\t"
         "  o6, i10, i19, o6;\n\t"
         "madc.hi.u32     o7,  i9, i21, o7;\n\t"
         "   o3, i11, i16, o3;\n\t"
         "  o4,  i9, i19, o4;\n\t"
         "  o5, i12, i16, o5;\n\t"
         "  o6, i11, i18, o6;\n\t"
         "madc.hi.u32     o7, i10, i20, o7;\n\t"
         "   o4, i10, i18, o4;\n\t"
         "  o5,  i8, i21, o5;\n\t"
         "  o6, i12, i17, o6;\n\t"
         "madc.hi.u32     o7, i11, i19, o7;\n\t"
         "   o4, i11, i17, o4;\n\t"
         "  o5,  i9, i20, o5;\n\t"
         "  o6, i13, i16, o6;\n\t"
         "madc.hi.u32     o7, i12, i18, o7;\n\t"
         "   o4, i12, i16, o4;\n\t"
         "  o5, i10, i19, o5;\n\t"
         "  o6,  i8, i22, o6;\n\t"
         "madc.hi.u32     o7, i13, i17, o7;\n\t"
         "   o5, i11, i18, o5;\n\t"
         "  o6,  i9, i21, o6;\n\t"
         "madc.hi.u32     o7, i14, i16, o7;\n\t"
         "   o5, i12, i17, o5;\n\t"
         "  o6, i10, i20, o6;\n\t"
         "madc.lo.u32     o7,  i8, i23, o7;\n\t"
         "   o5, i13, i16, o5;\n\t"
         "  o6, i11, i19, o6;\n\t"
         "madc.lo.u32     o7,  i9, i22, o7;\n\t"
         "   o6, i12, i18, o6;\n\t"
         "madc.lo.u32     o7, i10, i21, o7;\n\t"
         "   o6, i13, i17, o6;\n\t"
         "madc.lo.u32     o7, i11, i20, o7;\n\t"
         "   o6, i14, i16, o6;\n\t"
         "madc.lo.u32     o7, i12, i19, o7;\n\t"
         "mad.lo.u32      o7, i13, i18, o7;\n\t"
         "mad.lo.u32      o7, i14, i17, o7;\n\t"
         "mad.lo.u32      o7, i15, i16, o7;\n\t"
#elif defined (__CUDA_ARCH__) 
#error __CUDA_ARCH__ < 200 not supported
#endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) */
         "mov.b64         %0, {o0,o1};     \n\t"
         "mov.b64         %1, {o2,o3};     \n\t"
         "mov.b64         %2, {o4,o5};     \n\t"
         "mov.b64         %3, {o6,o7};     \n\t"
         : "=l"(res.x), "=l"(res.y), "=l"(res.z), "=l"(res.w)
         : "l"(a.x), "l"(a.y), "l"(a.z), "l"(a.w),
           "l"(b.x), "l"(b.y), "l"(b.z), "l"(b.w));
    return res;


Thanks for posting that!

So essentially you are using the bit-pattern of the concatenated ulonglong4 type to represent the 256 bit number?

I wanted a built-in CUDA type with maximum (16-byte) alignment for fast loads, one that can hold 256 bits. ulonglong4 seemed to fit the bill perfectly. The only drawback this this approach is that each function requires a prologue and epilogue that packs and unpacks the 32-bit chunks which are used by the hardware, but that is just bit re-interpretation using mov.b64 that does not cost anything in terms of execution time.

Thanks! :)

So how would you implement the call for this? mul256 <<1,1>> (a,b) ?

It looks like cuda-thrust-extensions has moved to GitHub 1.

From what I have read, developers who work on CPU implementations of these libraries think that the overhead of GPU memory management will make elementary bigint operations only useful for very large numbers. I’m not sure where 1024 bit numbers fall on their scale though.

Research has been done that explores the feasibility of modular exponentiation on a GPU:

“Executing Modular Exponentiation on a Graphics Accelerator” 1

These researchers were working with 1024 bit numbers (for RSA) and found that running the calculations on a GPU was not faster until nearly 1000 operations were performed. This is not a normal use case for a bigint library, so it is no wonder it has never been implemented.

I am currently researching Miller-Rabin, so performing thousands of modular exponentiation operations in parallel fits my use case. I plan to stay on the CPU for now and focus on using all of my cores.

Simultaneously computing a lot of modexp expressions on the GPU may have a couple of good uses:

  • crypto key breaking
  • sieving for primes and prime clusters (or rather the primality testing of sieve candidates using e.g the Fermat test)