Good to hear sm_2x is working so well for your application. Looking at the 96x96->192 bit multiplication problem, it seems like it could be doable in 21 instructions, but I need to test the code thorougly first before I post. It is all too easy to introduce bugs (as demonstrated by the correction I had to make to the 64x64->128 bit multiplication code above; a bug in the multiplication code was masked by a bug in my test app). Since the IMADs are cheap on sm_2x, it may not be worthwhile to provide a special squaring routine once the multiply is working.
[Later:]
OK, the 96x96->192 bit multiplication code seems to be working (unless I have a bug in my test app). The 192/96->96 bit division for the reduction is a somewhat harder problem. CUDA uses Newton-Raphson for efficient 32-bit and 64-bit integer division and modulo, but this case is a bit different since there is a double-wide dividend. Long-hand division with radix 2**32 using the Pope-Stein algorithm could be more appropriate.
__device__ __forceinline__ void mul_96_192 (const unsigned int *a,
const unsigned int *b,
unsigned int *p)
{
unsigned int a0 = a[0];
unsigned int a1 = a[1];
unsigned int a2 = a[2];
unsigned int b0 = b[0];
unsigned int b1 = b[1];
unsigned int b2 = b[2];
unsigned int p0, p1, p2, p3, p4, p5;
asm ("{\n\t"
"mul.lo.u32 %0, %6, %9 ;\n\t" /* (a0 * b0)_lo */
"mul.hi.u32 %1, %6, %9 ;\n\t" /* (a0 * b0)_hi */
"mad.lo.cc.u32 %1, %6, %10, %1;\n\t" /* (a0 * b1)_lo */
"madc.hi.u32 %2, %6, %10, 0;\n\t" /* (a0 * b1)_hi */
"mad.lo.cc.u32 %1, %7, %9, %1;\n\t" /* (a1 * b0)_lo */
"madc.hi.cc.u32 %2, %7, %9, %2;\n\t" /* (a1 * b0)_hi */
"madc.hi.u32 %3, %6, %11, 0;\n\t" /* (a0 * b2)_hi */
"mad.lo.cc.u32 %2, %6, %11, %2;\n\t" /* (a0 * b2)_lo */
"madc.hi.cc.u32 %3, %7, %10, %3;\n\t" /* (a1 * b1)_hi */
"madc.hi.u32 %4, %7, %11, 0;\n\t" /* (a1 * b2)_hi */
"mad.lo.cc.u32 %2, %7, %10, %2;\n\t" /* (a1 * b1)_lo */
"madc.hi.cc.u32 %3, %8, %9, %3;\n\t" /* (a2 * b0)_hi */
"madc.hi.cc.u32 %4, %8, %10, %4;\n\t" /* (a2 * b1)_hi */
"madc.hi.u32 %5, %8, %11, 0;\n\t" /* (a2 * b2)_hi */
"mad.lo.cc.u32 %2, %8, %9, %2;\n\t" /* (a2 * b0)_lo */
"madc.lo.cc.u32 %3, %7, %11, %3;\n\t" /* (a1 * b2)_lo */
"madc.lo.cc.u32 %4, %8, %11, %4;\n\t" /* (a2 * b2)_lo */
"addc.u32 %5, %5, 0; \n\t" /* propagate carry */
"mad.lo.cc.u32 %3, %8, %10, %3;\n\t" /* (a2 * b1)_lo */
"addc.cc.u32 %4, %4, 0 ;\n\t" /* propagate carry */
"addc.u32 %5, %5, 0 ;\n\t" /* propagate carry */
"}"
: "=r"(p0), "=r"(p1), "=r"(p2), "=r"(p3), "=r"(p4), "=r"(p5)
: "r"(a0), "r"(a1), "r"(a2), "r"(b0), "r"(b1), "r"(b2));
p[0] = p0;
p[1] = p1;
p[2] = p2;
p[3] = p3;
p[4] = p4;
p[5] = p5;
}