I managed to complete the code for the baseline implementation of the 256 x 256 → 512 bit multiplication using IMAD.WIDE, and it passes my smoke test. Now I am too tired to continue. I therefore decided to post what I have now, in case this is of interest to anyone. I have not done any performance evaluation, but disassembly shows that nj_umul256wide() compiles to about 200 instructions out of which 64 are IMAD.WIDE, which is the expected number for this O(n2) implementation. Obviously, the simple recursive decomposition that I used to get to the required width of the multiplication creates quite a bit of overhead.
[ code below updated 4/27/2026 ]
/*
Copyright (c) 2026, Norbert Juffa
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
__forceinline__ __device__ ulonglong2 nj_umul64wide (unsigned long long int a,
unsigned long long int b)
{
ulonglong2 res;
asm ("{\n\t"
".reg .u32 r0, r1, r2, r3, r4, r5, r8, r9, r10;\n\t"
".reg .u64 l0, l1, l2, l3, lh;\n\t"
"mov.b64 {r0,r1}, %2;\n\t"
"mov.b64 {r2,r3}, %3;\n\t"
"mul.wide.u32 l0, r0, r2;\n\t"
"mul.wide.u32 l1, r0, r3;\n\t"
"mul.wide.u32 l2, r1, r2;\n\t"
"add.u64.cc l1, l2, l1;\n\t"
"addc.u32 r10, 0, 0;\n\t"
"mov.b64 {r8,r9}, l1;\n\t"
"mov.b64 lh, {r9,r10};\n\t"
"mul.wide.u32 l3, r1, r3;\n\t"
"mov.b64 {r4,r5}, l0;\n\t"
"add.u32.cc r5, r5, r8;\n\t"
"mov.b64 l0, {r4,r5};\n\t"
"addc.u64 l3, l3, lh;\n\t"
"mov.b64 %0, l0;\n\t"
"mov.b64 %1, l3;\n\t"
"}"
: "=l"(res.x), "=l"(res.y) : "l"(a), "l"(b)
);
return res;
}
// add two half-length addends 'a' and 'b' plus a full-length addend 'c'
__forceinline__ __device__ ulonglong2 add128_ssl (unsigned long long int a,
unsigned long long int b,
ulonglong2 c)
{
ulonglong2 res;
asm ("{\n\t"
".reg .u32 a0, a1, b0, b1, cy, c0, c1, c2, c3;\n\t"
"mov.b64 {a0,a1}, %2;\n\t"
"mov.b64 {b0,b1}, %3;\n\t"
"mov.b64 {c0,c1}, %4;\n\t"
"mov.b64 {c2,c3}, %5;\n\t"
"add.u32.cc a0, a0, b0;\n\t"
"addc.u32.cc a1, a1, b1;\n\t"
"addc.u32 cy, 0, 0;\n\t"
"add.u32.cc c0, c0, a0;\n\t"
"addc.u32.cc c1, c1, a1;\n\t"
"addc.u32.cc c2, c2, cy;\n\t"
"addc.u32 c3, c3, 0;\n\t"
"mov.b64 %0, {c0,c1};\n\t"
"mov.b64 %1, {c2,c3};\n\t"
"}"
: "=l"(res.x), "=l"(res.y)
: "l"(a), "l"(b), "l"(c.x), "l"(c.y));
return res;
}
__device__ ulonglong4 nj_umul128wide (ulonglong2 a, ulonglong2 b)
{
ulonglong2 p0 = nj_umul64wide (a.x, b.x);
ulonglong2 p1 = nj_umul64wide (a.x, b.y);
ulonglong2 p2 = nj_umul64wide (a.y, b.x);
ulonglong2 p3 = nj_umul64wide (a.y, b.y);
ulonglong2 tt = add128_ssl (p0.y, p2.x, p1);
ulonglong2 hi = add128_ssl (p2.y, tt.y, p3);
ulonglong2 lo = make_ulonglong2 (p0.x, tt.x);
return make_ulonglong4 (lo.x, lo.y, hi.x, hi.y);
}
// add two half-length addends 'a' and 'b' plus a full-length addend 'c'
__forceinline__ __device__ ulonglong4 add256_ssl (ulonglong2 a, ulonglong2 b, ulonglong4 c)
{
ulonglong4 res;
asm ("{\n\t"
".reg .u32 a0, a1, a2, a3, b0, b1, b2, b3, cy;\n\t"
".reg .u32 c0, c1, c2, c3, c4, c5, c6, c7;\n\t"
"mov.b64 {a0,a1}, %4;\n\t"
"mov.b64 {a2,a3}, %5;\n\t"
"mov.b64 {b0,b1}, %6;\n\t"
"mov.b64 {b2,b3}, %7;\n\t"
"mov.b64 {c0,c1}, %8;\n\t"
"mov.b64 {c2,c3}, %9;\n\t"
"mov.b64 {c4,c5}, %10;\n\t"
"mov.b64 {c6,c7}, %11;\n\t"
"add.u32.cc a0, a0, b0;\n\t"
"addc.u32.cc a1, a1, b1;\n\t"
"addc.u32.cc a2, a2, b2;\n\t"
"addc.u32.cc a3, a3, b3;\n\t"
"addc.u32 cy, 0, 0;\n\t"
"add.u32.cc c0, c0, a0;\n\t"
"addc.u32.cc c1, c1, a1;\n\t"
"addc.u32.cc c2, c2, a2;\n\t"
"addc.u32.cc c3, c3, a3;\n\t"
"addc.u32.cc c4, c4, cy;\n\t"
"addc.u32.cc c5, c5, 0;\n\t"
"addc.u32.cc c6, c6, 0;\n\t"
"addc.u32 c7, c7, 0;\n\t"
"mov.b64 %0, {c0,c1};\n\t"
"mov.b64 %1, {c2,c3};\n\t"
"mov.b64 %2, {c4,c5};\n\t"
"mov.b64 %3, {c6,c7};\n\t"
"}"
: "=l"(res.x), "=l"(res.y), "=l"(res.z), "=l"(res.w)
: "l"(a.x), "l"(a.y), "l"(b.x), "l"(b.y),
"l"(c.x), "l"(c.y), "l"(c.z), "l"(c.w));
return res;
}
__device__ void nj_umul256wide (ulonglong4 a, ulonglong4 b, unsigned long long int *res)
{
ulonglong4 p0 = nj_umul128wide (make_ulonglong2 (a.x, a.y), make_ulonglong2 (b.x, b.y));
ulonglong4 p1 = nj_umul128wide (make_ulonglong2 (a.x, a.y), make_ulonglong2 (b.z, b.w));
ulonglong4 p2 = nj_umul128wide (make_ulonglong2 (a.z, a.w), make_ulonglong2 (b.x, b.y));
ulonglong4 p3 = nj_umul128wide (make_ulonglong2 (a.z, a.w), make_ulonglong2 (b.z, b.w));
ulonglong4 tt = add256_ssl (make_ulonglong2 (p0.z, p0.w), make_ulonglong2 (p2.x, p2.y), p1);
ulonglong4 hi = add256_ssl (make_ulonglong2 (p2.z, p2.w), make_ulonglong2 (tt.z, tt.w), p3);
ulonglong4 lo = make_ulonglong4 (p0.x, p0.y, tt.x, tt.y);
res[0] = lo.x; res[1] = lo.y; res[2] = lo.z; res[3] = lo.w;
res[4] = hi.x; res[5] = hi.y; res[6] = hi.z; res[7] = hi.w;
}