Squeezing the last 17.5% out of a compute-bound 256-bit modular arithmetic kernel (sm_89, 82.5% SM throughput)

I managed to complete the code for the baseline implementation of the 256 x 256 → 512 bit multiplication using IMAD.WIDE, and it passes my smoke test. Now I am too tired to continue. I therefore decided to post what I have now, in case this is of interest to anyone. I have not done any performance evaluation, but disassembly shows that nj_umul256wide() compiles to about 200 instructions out of which 64 are IMAD.WIDE, which is the expected number for this O(n2) implementation. Obviously, the simple recursive decomposition that I used to get to the required width of the multiplication creates quite a bit of overhead.

[ code below updated 4/27/2026 ]

/*
  Copyright (c) 2026, Norbert Juffa

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

__forceinline__ __device__ ulonglong2 nj_umul64wide (unsigned long long int a, 
                                                     unsigned long long int b)
{
    ulonglong2 res;
    asm ("{\n\t"
         ".reg .u32 r0, r1, r2, r3, r4, r5, r8, r9, r10;\n\t"
         ".reg .u64 l0, l1, l2, l3, lh;\n\t"
         "mov.b64      {r0,r1}, %2;\n\t"
         "mov.b64      {r2,r3}, %3;\n\t"
         "mul.wide.u32 l0, r0, r2;\n\t"
         "mul.wide.u32 l1, r0, r3;\n\t"
         "mul.wide.u32 l2, r1, r2;\n\t"
         "add.u64.cc   l1, l2, l1;\n\t"
         "addc.u32     r10, 0, 0;\n\t"
         "mov.b64      {r8,r9}, l1;\n\t"
         "mov.b64      lh, {r9,r10};\n\t"
         "mul.wide.u32 l3, r1, r3;\n\t"
         "mov.b64      {r4,r5}, l0;\n\t"
         "add.u32.cc   r5, r5, r8;\n\t"
         "mov.b64      l0, {r4,r5};\n\t"
         "addc.u64     l3, l3, lh;\n\t"
         "mov.b64      %0, l0;\n\t"
         "mov.b64      %1, l3;\n\t"
         "}"
         : "=l"(res.x), "=l"(res.y) : "l"(a), "l"(b)
    );
    return res;
}

// add two half-length addends 'a' and 'b' plus a full-length addend 'c'
__forceinline__ __device__ ulonglong2 add128_ssl (unsigned long long int a, 
                                                  unsigned long long int b, 
                                                  ulonglong2 c)
{
    ulonglong2 res;
    asm ("{\n\t"
         ".reg .u32 a0, a1, b0, b1, cy, c0, c1, c2, c3;\n\t"
         "mov.b64     {a0,a1}, %2;\n\t"
         "mov.b64     {b0,b1}, %3;\n\t"
         "mov.b64     {c0,c1}, %4;\n\t"
         "mov.b64     {c2,c3}, %5;\n\t"
         "add.u32.cc  a0, a0, b0;\n\t"
         "addc.u32.cc a1, a1, b1;\n\t"
         "addc.u32    cy, 0, 0;\n\t"
         "add.u32.cc  c0, c0, a0;\n\t"
         "addc.u32.cc c1, c1, a1;\n\t"
         "addc.u32.cc c2, c2, cy;\n\t"
         "addc.u32    c3, c3, 0;\n\t"
         "mov.b64     %0, {c0,c1};\n\t"
         "mov.b64     %1, {c2,c3};\n\t"
         "}"
         : "=l"(res.x), "=l"(res.y)
         : "l"(a), "l"(b), "l"(c.x), "l"(c.y));
    return res;
}

__device__ ulonglong4 nj_umul128wide (ulonglong2 a, ulonglong2 b)
{
    ulonglong2 p0 = nj_umul64wide (a.x, b.x);
    ulonglong2 p1 = nj_umul64wide (a.x, b.y);
    ulonglong2 p2 = nj_umul64wide (a.y, b.x);
    ulonglong2 p3 = nj_umul64wide (a.y, b.y);
    ulonglong2 tt = add128_ssl (p0.y, p2.x, p1);
    ulonglong2 hi = add128_ssl (p2.y, tt.y, p3);
    ulonglong2 lo = make_ulonglong2 (p0.x, tt.x);
    return make_ulonglong4 (lo.x, lo.y, hi.x, hi.y);
}

// add two half-length addends 'a' and 'b' plus a full-length addend 'c'
__forceinline__ __device__ ulonglong4 add256_ssl (ulonglong2 a, ulonglong2 b, ulonglong4 c)
{
    ulonglong4 res;
    asm ("{\n\t"
         ".reg .u32 a0, a1, a2, a3, b0, b1, b2, b3, cy;\n\t"
         ".reg .u32 c0, c1, c2, c3, c4, c5, c6, c7;\n\t"
         "mov.b64     {a0,a1}, %4;\n\t"
         "mov.b64     {a2,a3}, %5;\n\t"
         "mov.b64     {b0,b1}, %6;\n\t"
         "mov.b64     {b2,b3}, %7;\n\t"
         "mov.b64     {c0,c1}, %8;\n\t"
         "mov.b64     {c2,c3}, %9;\n\t"
         "mov.b64     {c4,c5}, %10;\n\t"
         "mov.b64     {c6,c7}, %11;\n\t"
         "add.u32.cc  a0, a0, b0;\n\t"
         "addc.u32.cc a1, a1, b1;\n\t"
         "addc.u32.cc a2, a2, b2;\n\t"
         "addc.u32.cc a3, a3, b3;\n\t"
         "addc.u32    cy, 0, 0;\n\t"
         "add.u32.cc  c0, c0, a0;\n\t"
         "addc.u32.cc c1, c1, a1;\n\t"
         "addc.u32.cc c2, c2, a2;\n\t"
         "addc.u32.cc c3, c3, a3;\n\t"
         "addc.u32.cc c4, c4, cy;\n\t"
         "addc.u32.cc c5, c5, 0;\n\t"
         "addc.u32.cc c6, c6, 0;\n\t"
         "addc.u32    c7, c7, 0;\n\t"
         "mov.b64     %0, {c0,c1};\n\t"
         "mov.b64     %1, {c2,c3};\n\t"
         "mov.b64     %2, {c4,c5};\n\t"
         "mov.b64     %3, {c6,c7};\n\t"
         "}"
         : "=l"(res.x), "=l"(res.y), "=l"(res.z), "=l"(res.w) 
         : "l"(a.x), "l"(a.y), "l"(b.x), "l"(b.y),
           "l"(c.x), "l"(c.y), "l"(c.z), "l"(c.w));
    return res;
}

__device__ void nj_umul256wide (ulonglong4 a, ulonglong4 b, unsigned long long int *res)
{
    ulonglong4 p0 = nj_umul128wide (make_ulonglong2 (a.x, a.y), make_ulonglong2 (b.x, b.y));
    ulonglong4 p1 = nj_umul128wide (make_ulonglong2 (a.x, a.y), make_ulonglong2 (b.z, b.w));
    ulonglong4 p2 = nj_umul128wide (make_ulonglong2 (a.z, a.w), make_ulonglong2 (b.x, b.y));
    ulonglong4 p3 = nj_umul128wide (make_ulonglong2 (a.z, a.w), make_ulonglong2 (b.z, b.w));
    ulonglong4 tt = add256_ssl (make_ulonglong2 (p0.z, p0.w), make_ulonglong2 (p2.x, p2.y), p1);
    ulonglong4 hi = add256_ssl (make_ulonglong2 (p2.z, p2.w), make_ulonglong2 (tt.z, tt.w), p3);
    ulonglong4 lo = make_ulonglong4 (p0.x, p0.y, tt.x, tt.y);
    res[0] = lo.x; res[1] = lo.y; res[2] = lo.z; res[3] = lo.w;
    res[4] = hi.x; res[5] = hi.y; res[6] = hi.z; res[7] = hi.w;
}

I tested your nj_umul256wide against my original _ModMult in the real kernel — both produce correct results, both yield identical performance (within measurement noise). I also did a SASS comparison of three patterns for a 256×64->320 multiply:

  1. Original (mul.lo.u64 / mul.hi.u64 / add.cc.u64): 12 IMAD.WIDE.U32 + ~10 IADD3/IADD3.X helpers

  2. Your nj_umul64wide style (manual mul.wide.u32 + add.u64.cc): same instruction count and structure

  3. Pure C++ ((uint64_t)a*b + c chain): identical SASS to pattern 1

The conclusion is clear: ptxas is already aggressively coalescing 64-bit PTX patterns into IMAD.WIDE.U32. The mental model “write 32-bit PTX to force IMAD.WIDE” doesn’t apply on sm_89 with current toolkit (CUDA 12.4) — the compiler does it automatically from the higher-level idioms.


Update from real-world integration: I integrated your nj_umul256wide into my kernel and ran Nsight Compute. SM throughput jumped from 82.5% (with my original mul.lo.u64/mul.hi.u64/add.cc.u64 macros) to 85.4% with your IMAD.WIDE-based code. DRAM and L1 metrics are unchanged.

Interestingly, end-to-end kernel throughput (in my application’s own units) remained identical — the higher SM utilization comes with a slightly higher dynamic instruction count from the recursive helper functions (add128, add256), so the two effects cancel out. So we’ve moved from “82.5% efficient at X instructions” to “85.4% efficient at Y instructions” with X0.825 ≈ Y0.854. The compute ceiling is the same.

A few observations that might be useful for your continued work on optimal idioms:

  1. The add128/add256 helpers add overhead. The recursive decomposition umul256 → 4× umul128 → 16× umul64 plus all the add128/add256 calls between them adds up. In your disassembly you mentioned ~200 instructions with 64 IMAD.WIDE — meaning ~136 instructions are non-multiply overhead. That’s a lot of carry-propagation work.

  2. My kernel calls _ModMult ~2,100 times per launch, plus there are also _ModSub256, _ModAdd256, _ModNeg256, and a Bernstein-Yang _ModInv (DivStep62). The remaining 14.6% pipeline gap could come from any of these — or from interactions between them and the other parts of the kernel (hash functions).

  3. My ModMult does a final reduction step that’s specific to my prime form (p = 2^256 - C with C small). After your umul256wide produces a 512-bit result, I multiply the high 256 bits by C using the same mad.lo.cc.u32 / madc.hi.u32 pattern, then do a 256-bit add-with-carry chain. The reduction adds maybe 30-40 SASS instructions. Could there be a more efficient idiom for “multiply-by-constant + add to accumulator” when the constant fits in 33 bits?

  4. One thing I noticed in the SASS: with my original 64-bit PTX, the compiler produces IMAD.WIDE.U32 directly. With your 32-bit explicit mul.wide.u32 + add.u64.cc pattern, the compiler also produces IMAD.WIDE.U32. Both end up at the same place. But the explicit 32-bit version exposes more parallelism opportunities — would you expect the warp scheduler to find more ILP in your version due to the visible instruction-level structure?

If you find time to think about variants — particularly anything that could reduce the carry-propagation overhead between the recursive levels, or fuse the multiply-by-constant reduction — I’d be very interested. The 14.6% remaining gap might be reducible if the helper-function overhead can be folded into the IMAD.WIDE chain itself somehow.

No rush — your contribution has already been substantial. Just thought these data points might inform your continued experimentation.

Thanks for the performance data. My use of the addition functions is inefficient, as they even add words we clearly see comprise only zero bits. The first rule of carry chains is to stop propagation at the earliest possible time, which clearly is not the case here. I simply left things where they were to wrap up as quickly as possible, as I was not able to focus anymore.

It is important to keep in mind that the purpose of the code I posted is simply to establish realistic baseline performance. The goal going forward is to eliminate as many separate IADD3 instances as possible by having the carries propagate through the IMAD.WIDE wherever possible. With the old sm_3x approach based on 32-bit IMAD that was possible for about 75% of the carry propagation, if I recall correctly. The unfortunate quirk here is that we have no direct control over IMAD.WIDE usage, as the compiler is generating this instruction.

For various reasons (including “friction losses” in the schedulers and register bank conflicts) it is typically not possible to achieve more than about 85% efficiency for the compute pipes. The goal (especially if pipes are at the efficiency limit) therefore has to be to minimize dynamic instruction count, which in this case equates to minimizing static instruction count.

I cannot quantify it, because I have not had exposure to the most modern architecture, but to the best of my knowledge ILP still plays a minor role in integer-only code even on the latest GPUs (sm_100 and later). Before that efforts to expose more ILP were pretty much meaningless. Reducing dynamic instruction count is traditionally the top priority for integer-only codes. This is in stark contrast to modern CPUs. The reason for this huge discrepancy is that GPUs started out with a simple single in-order pipe per thread execution model, deriving parallelization benefits pretty much exclusively from thread-level parallelism. I forget when ILP even came back into peripheral view. It may have been around the Ampere generation, and then only for floating-point codes.

Performance of general integer-only code was improved meaningfully (mostly at the Volta architecture, if memory serves) by shifting work to three-input operations such as IADD3, LOP3, and IMAD, but a look at generated SASS shows that in many instances the third input goes to waste, being fed an RZ. I have not looked closely enough to discern whether that is a shortcoming in the compiler infrastructure (which traditionally did not have to deal with 3-input operations). or whether it is due to natural limits to operation fusion.

Thanks for clarifying
That completely changes my mental model. The fact that ILP plays no meaningful role for integer code on sm_89 was something I assumed but hadn’t seen explicitly stated by someone with insider knowledge. So my earlier experiment (writing two independent carry chains in interleaved PTX expecting ptxas to schedule them in parallel) was indeed misguided. Pure throughput reduction is the only lever.

Your observation about the wasted RZ third input in IADD3 is interesting. Looking at my SASS dump, I see this pattern repeatedly:

IADD3 R7, P0, R7, R14, RZ   ;  // RZ = wasted slot
IADD3.X R17, RZ, RZ, RZ, P0, !PT  ;  // Even worse, three RZs!

The second one (three RZ inputs) appears to just be transporting a carry/predicate. That seems like pure overhead. If the compiler could fuse those into adjacent IMAD.WIDE operations the way you described for sm_3x, that would be exactly the kind of dynamic instruction count reduction we’re after.

In your baseline nj_umul256wide, the recursive structure makes carries flow through:

  • 16× add128 calls (each becomes an add.cc.u64 + addc.u64 = 2 SASS adds → 2 IADD3)

  • add256 calls (each becomes 4 chained adds → 4 IADD3)

That’s roughly 32 + 16 = 48 IADD3 instructions just for the carry plumbing between the 64 IMAD.WIDE multiplies. If most of those could be absorbed into the IMAD.WIDE accumulator slot, we’d potentially halve the static instruction count of the multiply.

I’ll experiment with hand-optimizing your add128/add256 to skip the obviously-zero word additions, but I suspect the real gains require restructuring the column-wise accumulation so that partial products feed directly into IMAD.WIDE chains without intermediate carry-save steps. Looking forward to seeing what you come up with.

@njuffa
I tested an optimized version of your helpers where I eliminated the obvious zero-word additions. For example, instead of:

ulonglong2 tt = add128(add128(make_ulonglong2(p0.y, 0), make_ulonglong2(p2.x, 0)), p1);

I wrote a dedicated add64_to_128(uint64_t, uint64_t) that returns {sum_lo, carry} and used it for the cases where both upper words are guaranteed zero.

Result: Nsight SM throughput went from 85.42% (your original baseline) to 85.25% — essentially identical, well within measurement noise. End-to-end kernel throughput unchanged.

This is actually a useful data point: ptxas appears to already eliminate the zero-word additions automatically when it can prove the upper word is zero. The constant 0 in make_ulonglong2(p0.y, 0) is statically known, so the compiler dead-codes the redundant adds.

Now I revert to your unmodified baseline. Whatever the next optimization step is, it has to be at the IMAD.WIDE coalescing level (i.e., reducing how many separate IADD3 carry helpers ptxas emits between the multiplies), because manual elimination of obviously-zero work doesn’t move the needle.

Looking forward to the next iteration when you find the time. The 14.6% pipeline gap remains the target.

I updated the code I posted previously to clean up the addition of two-half length and one full-length operand. As you already noted, this makes no difference outside noise level, but it looks cleaner now.

More importantly, I succeeded in crafting a monolithic (rather than composited) version of nj_umul128wide(). I accumulate the odd and even columns of partial products separately, then add the resulting sums, offset by 32 bits relative to each other. This reduces the number of instructions for the 256 x 256 → 512 bit multiplication to about 165. I had hoped for a slightly larger reduction, down to about 150+ instructions. I have not looked at the generated code in detail yet, that might provide some clues as to what is going on.

The next step would be to employ the same approach to build a monolithic implementation of nj_umul256wide().

[ code below updated 4/27/2026 ]

/*
  Copyright (c) 2026, Norbert Juffa

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

__device__ ulonglong4 nj_umul128wide (ulonglong2 a, ulonglong2 b)
{
    ulonglong4 res;
    asm ("{\n\t"
         ".reg .u64 p1, p2, p3, p4, p5, p6, p7, p8;\n\t"
         ".reg .u64 p9, p10, p11, p12, p13, p14, p15, p16, lcy;\n\t"
         ".reg .u32 a0, a1, a2, a3, b0, b1, b2, b3;\n\t"
         ".reg .u32 r0, r1, r2, r3, r4, r5, r6, r7;\n\t"
         ".reg .u32 t0, t1, t2, t3, t4, t5, cy;\n\t"
         "mov.b64       {a0,a1}, %4;\n\t"
         "mov.b64       {a2,a3}, %5;\n\t"
         "mov.b64       {b0,b1}, %6;\n\t"
         "mov.b64       {b2,b3}, %7;\n\t"
         "mul.wide.u32  p1, a2, b0;\n\t"
         "mul.wide.u32  p2, a3, b1;\n\t"
         "mul.wide.u32  p3, a1, b1;\n\t"
         "mul.wide.u32  p4, a2, b2;\n\t"
         "mul.wide.u32  p5, a0, b0;\n\t"
         "mul.wide.u32  p6, a0, b2;\n\t"
         "mul.wide.u32  p7, a1, b3;\n\t"
         "mul.wide.u32  p8, a3, b3;\n\t"
         "add.u64.cc    p3, p3, p1;\n\t"
         "addc.u64.cc   p4, p4, p2;\n\t"
         "addc.u32      cy, 0, 0;\n\t"
         "add.u64.cc    p6, p6, p3;\n\t"
         "addc.u64.cc   p7, p7, p4;\n\t"
         "mov.b64       lcy, {cy,0};\n\t"
         "addc.u64      p8, p8, lcy;\n\t"
         "mul.wide.u32  p9, a3, b0;\n\t"
         "mul.wide.u32  p10, a2, b1;\n\t"
         "mul.wide.u32  p11, a1, b0;\n\t"
         "mul.wide.u32  p12, a1, b2;\n\t"
         "mul.wide.u32  p13, a3, b2;\n\t"
         "mul.wide.u32  p14, a0, b1;\n\t"
         "mul.wide.u32  p15, a0, b3;\n\t"
         "mul.wide.u32  p16, a2, b3;\n\t"
         "add.u64.cc    p10, p10, p9;\n\t"
         "addc.u32      cy, 0, 0;\n\t"
         "add.u64.cc    p12, p12, p10;\n\t"
         "mov.b64       lcy, {cy,0};\n\t"
         "addc.u64.cc   p13, p13, lcy;\n\t"
         "addc.u32      cy, 0, 0;\n\t"
         "add.u64.cc    p14, p14, p11;\n\t"
         "addc.u64.cc   p15, p15, p12;\n\t"
         "addc.u64.cc   p16, p16, p13;\n\t"
         "addc.u32      cy, cy, 0;\n\t"
         "mov.b64       {r0,r1}, p5;\n\t" 
         "mov.b64       {r2,r3}, p6;\n\t"  
         "mov.b64       {r4,r5}, p7;\n\t"  
         "mov.b64       {r6,r7}, p8;\n\t"  
         "mov.b64       {t0,t1}, p14;\n\t" 
         "mov.b64       {t2,t3}, p15;\n\t"  
         "mov.b64       {t4,t5}, p16;\n\t"  
         "add.u32.cc    r1, r1, t0;\n\t"
         "addc.u32.cc   r2, r2, t1;\n\t"
         "addc.u32.cc   r3, r3, t2;\n\t"
         "addc.u32.cc   r4, r4, t3;\n\t"
         "addc.u32.cc   r5, r5, t4;\n\t"
         "addc.u32.cc   r6, r6, t5;\n\t"
         "addc.u32      r7, r7, cy;\n\t"
         "mov.b64       %0, {r0,r1};\n\t"
         "mov.b64       %1, {r2,r3};\n\t"
         "mov.b64       %2, {r4,r5};\n\t"
         "mov.b64       %3, {r6,r7};\n\t"
         "}"
         : "=l"(res.x), "=l"(res.y), "=l"(res.z), "=l"(res.w)
         : "l"(a.x), "l"(a.y), "l"(b.x), "l"(b.y)
        );
    return res;
}

Update with Nsight Compute data on the V2 monolithic nj_umul128wide:

Nsight metrics comparison:

Metric V1 Composited V2 Monolithic Delta
SM throughput 85.42% 85.02% -0.4%
DRAM throughput 19.46% 20.03% +0.57%
L1 throughput 4.76% 4.89% +0.13%

End-to-end kernel speedup: ~+4%

The SM throughput actually went down slightly from 85.4% to 85.0%, but the actual work-per-second went up by ~4%. That’s exactly the dynamic instruction count reduction you described as the goal: same compute capacity, less work per result.

  1. DRAM and L1 throughput rose slightly. That makes sense when ALU work decreases, the memory subsystem’s relative share goes up. Memory was never the bottleneck (still under 20% DRAM), but it’s now a larger fraction of total kernel time.

  2. SM throughput dropping while wall-clock time decreases is the cleanest possible signal of an efficiency gain. It means the compute pipes are doing fewer total operations to produce the same output. Exactly what you predicted.

  3. The composited V1 wasted SM cycles on helper-function plumbing (add128/add256 chains between the recursive levels). The monolithic V2 fuses the column-wise accumulation more efficiently — fewer separate IADD3 instances between the IMAD.WIDE multiplies.

    I’m now genuinely looking forward to the monolithic nj_umul256wide. If the V1->V2 progression for 128×128 reduced ~200 SASS instructions to ~165 (~17% reduction) and produced ~+4% wall-clock speedup, then the same approach applied to the full 256×256 should compound on top of this.

The 14.6% Nsight gap is now ~15.0% with V2, but the headline number is the +4% throughput — which is the only measurement that actually matters in production.

[Side remark: I am using CUDA 12.8 for my work. When switching to the CUDA 13 toolchain at Compiler Explorer, a deprecation warning is shown that states that the ulonglong4 type should be replaced with one of two substitute types that appear to differ in their alignment guarantees.]

I have updated the monolithic implementation of nj_umul128wide() that I posted earlier to improve the efficiency of the summing. For an sm_89 target, this drops this function down to 29 instructions (not counting the RET that disappears when this is function is inlined into other code). See disassembly below. With this change, the instruction count for the full 256 x 256 → 512 bit multiply drops to 152, which is what I was projecting initially.

When one draws a diagram of the partial products to be accumulated, a triangle shape results. The best accumulation strategy seems to be to start summing at the tip of the triangle, working in the direction of the triangle’s base.

diagram of even-numbered partial product columns:

p(3,3)  p(1,3)  p(0,2)  p(0,0)
        p(2,2)  p(1,1)
        p(3,1)  p(2.0)  <---- start summing here

diagram of odd-numbered partial product columns:

p(2,3)  p(0,3)  p(0,1)
p(3,2)  p(1,2)  p(1,0)
        p(2,1)
        p(3,0)  <---- start summing here

Disassembly of nj_umul128wide() compiled for an sm_89 target using CUDA 12.8:

nj_umul128wide(ulonglong2, ulonglong2):
 IMAD.WIDE.U32 R14, R7, R8, RZ 
 MOV R41, RZ 
 IMAD.WIDE.U32 R12, R6, R8, RZ 
 IMAD.WIDE.U32 R32, R5, R8, RZ 
 IMAD.WIDE.U32 R34, P3, R6, R9, R14 
 IMAD.WIDE.U32 R14, P1, R5, R9, R12 
 SEL R40, RZ, 0x1, !P3 
 IMAD.WIDE.U32 R12, R7, R9, RZ 
 IMAD.WIDE.U32 R42, P2, R4, R9, R32 
 MOV R9, RZ 
 IMAD.WIDE.U32 R34, P0, R5, R10, R34 
 IMAD.WIDE.U32.X R32, P1, R6, R10, R12, P1 
 IMAD.WIDE.U32 R14, P3, R4, R10, R14 
 IMAD.WIDE.U32 R12, R4, R8, RZ 
 SEL R8, RZ, 0x1, !P1 
 IMAD.WIDE.U32.X R40, P0, R7, R10, R40, P0 
 IMAD.WIDE.U32.X R34, P2, R4, R11.reuse, R34, P2 
 SEL R3, RZ, 0x1, !P0 
 MOV R4, R12 
 IMAD.WIDE.U32.X R32, P3, R5, R11, R32, P3 
 IADD3 R5, P4, R13, R42, RZ 
 IMAD.WIDE.U32.X R40, P2, R6, R11, R40, P2 
 IADD3.X R6, P4, R14, R43, RZ, P4, !PT 
 IMAD.WIDE.U32.X R10, R7, R11, R8, P3 
 IADD3.X R7, P4, R15, R34, RZ, P4, !PT 
 IADD3.X R8, P4, R32, R35, RZ, P4, !PT 
 IADD3.X R9, P4, R33, R40, RZ, P4, !PT 
 IADD3.X R10, P4, R10, R41, RZ, P4, !PT 
 IADD3.X R11, R11, RZ, R3, P4, P2 
 RET.ABS.NODEC R20 0x0 

A comprehensive update with cumulative test results from your nj_umul128wide evolution:

I’ve now integrated three iterations of your code into my kernel and tracked the impact at each step:

Version SM Throughput DRAM L1 Cumulative speedup
Original mul.lo.u64/mul.hi.u64 82.50% 19.50% 4.77% baseline
V1 composited (umul64wide × 4) 85.42% 19.46% 4.76% ~0% (SM up, work same)
V2 monolithic 85.02% 20.03% 4.89% ~+4%
V3 monolithic with triangle accumulation 84.59% 20.40% 4.98% ~+5.3%

The pattern is remarkably consistent across iterations: each refinement reduces dynamic instruction count, SM throughput drops slightly because the compute pipes have less work to do, while wall-clock throughput continues to climb. This is exactly the optimization signature you predicted from the start.

I upgraded my toolchain from CUDA 12.4.1 to 12.8.2 specifically to test whether your reported 29-instruction SASS for nj_umul128wide would reproduce on the newer compiler:

  • Correctness: Test cases all pass on both toolchains

  • Performance on CUDA 12.8: Wall-clock throughput identical to CUDA 12.4 within measurement noise

  • Nsight metrics on 12.8: SM 84.59%, DRAM 20.40%, L1 4.98% — bit-for-bit identical to the 12.4 results

So at least for my specific kernel and use case, the toolchain version made no measurable difference. Your code reaches the same effective performance on 12.4. That’s actually a useful data point it suggests ptxas’s IMAD.WIDE coalescing for this idiom has been stable for a while, and the optimization isn’t dependent on a specific compiler version.

The diagram you shared (start summing at the dense tip of the triangle, work toward the sparse base) was a really elegant insight. Looking at the SASS dump from V3 vs V2, I can see the IADD3 helpers between IMAD.WIDE multiplies have visibly decreased. The triangle ordering seems to give ptxas more opportunities to fuse adjacent operations into the IMAD.WIDE accumulator slot rather than emitting separate carry-propagation instructions.

Genuinely, thank you for this. When I started this thread, I had a kernel running at 82.5% SM throughput and was stuck every attempt I made to optimize it either gave no improvement or actually slowed things down. Your willingness to engage deeply with the problem, including:

  1. Correcting your own initial recommendation about IMAD.{LO|HI} when the IMAD.WIDE picture became clearer

  2. Writing baseline implementations to establish realistic targets

  3. Iterating from V1 (composited) to V2 (monolithic) to V3 (triangle pattern), each time explaining the reasoning

  4. Sharing the partial product accumulation diagrams that make the geometric intuition clear

…has been a masterclass in GPU optimization methodology. The systematic approach you used establish a baseline first, then optimize is something I’ll apply to other problems beyond this kernel.

The kernel is now at ~84.6% SM throughput with measurably more work being done per cycle than before. The 15.4% remaining gap is, by your own assessment, the architectural ceiling from scheduler friction and register bank conflicts. Whatever’s left has to come from algorithmic changes elsewhere in the kernel, not from the modular multiply itself.

Looking forward to the monolithic nj_umul256wide whenever you have time. If the V2->V3 step for 128×128 brought ~1.3% additional throughput by reducing the helper count, the same approach applied to the full 256×256 has the potential to compound on top.

I finally got a monolithic implementation of nj_umul256wide() to work. I tested it with a bit of extra care as it is easy to introduce subtle bugs when writing hundreds of lines of code in assembly language. If you plan to use this in production code, you might want to subject it to industrial-strength testing first.

When I did library work at NVIDIA, I spent about 50% of my time writing code on test scaffolding. This allowed me to deliver code at an average rate of one customer-reported bug per year, and the maximum number of tickets open against me (both filed from within NVIDIA and from external customers and beta testers) at any given time was six, if I recall correctly. While I try to apply some reasonable level of care to my hobby projects now that I am retired, I don’t put that extreme amount of work into testing anymore.

The good news is that this monolithic version drops the instruction count to 108 when built for an sm_89 target with CUDA 12.8. That is a little bit more than half the baseline implementation.

I am not sure what the maximum throughput of IMAD.WIDE is, in particular because it writes two result registers, which I suspect might limit its throughput to half of the IADD3 throughput at best. In any event the code is now dominated by IMAD.WIDE execution. I don’t know whether there are interesting hardware counters implemented in Ada Lovelace (and exposed to the public) that would allow checking on the particular throughput limits of the IMAD.WIDE hardware. The granularity provided might not be fine enough.

/*
  Copyright (c) 2026, Norbert Juffa

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

__device__ void nj_umul256wide (ulonglong4 a, ulonglong4 b, unsigned long long int *res)
{
    ulonglong4 lo, hi;
    asm ("{\n\t"
         ".reg .u64 p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14;\n\t"
         ".reg .u64 p15, p16, p17, p18, p19, p20, p21, p22, p23, p24, p25, p26, p27;\n\t"
         ".reg .u64 p28, p29, p30, p31, p32, p33, p34, p35, p36, p37, p38, p39, p40;\n\t"
         ".reg .u64 p41, p42, p43, p44, p45, p46, p47, p48, p49, p50, p51, p52, p53;\n\t"
         ".reg .u64 p54, p55, p56, p57, p58, p59, p60, p61, p62, p63, p64, lcy;\n\t"
         ".reg .u32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;\n\t"
         ".reg .u32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, cy;\n\t"
         ".reg .u32 a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;\n\t"
         "mov.b64       {a0,a1}, %8;\n\t"
         "mov.b64       {a2,a3}, %9;\n\t"
         "mov.b64       {a4,a5}, %10;\n\t"
         "mov.b64       {a6,a7}, %11;\n\t"
         "mov.b64       {b0,b1}, %12;\n\t"
         "mov.b64       {b2,b3}, %13;\n\t"
         "mov.b64       {b4,b5}, %14;\n\t"
         "mov.b64       {b6,b7}, %15;\n\t"
         "mul.wide.u32  p1, a6, b0;\n\t"
         "mul.wide.u32  p2, a7, b1;\n\t"
         "mul.wide.u32  p3, a5, b1;\n\t"
         "mul.wide.u32  p4, a6, b2;\n\t"
         "mul.wide.u32  p5, a4, b0;\n\t"
         "mul.wide.u32  p6, a4, b2;\n\t"
         "mul.wide.u32  p7, a5, b3;\n\t"
         "mul.wide.u32  p8, a7, b3;\n\t"
         "mul.wide.u32  p9, a3, b1;\n\t"
         "mul.wide.u32  p10, a3, b3;\n\t"
         "mul.wide.u32  p11, a4, b4;\n\t"
         "mul.wide.u32  p12, a6, b4;\n\t"
         "mul.wide.u32  p13, a2, b0;\n\t"
         "mul.wide.u32  p14, a2, b2;\n\t"
         "mul.wide.u32  p15, a2, b4;\n\t"
         "mul.wide.u32  p16, a3, b5;\n\t"
         "mul.wide.u32  p17, a5, b5;\n\t"
         "mul.wide.u32  p18, a7, b5;\n\t"
         "mul.wide.u32  p19, a1, b1;\n\t"
         "mul.wide.u32  p20, a1, b3;\n\t"
         "mul.wide.u32  p21, a1, b5;\n\t"
         "mul.wide.u32  p22, a2, b6;\n\t"
         "mul.wide.u32  p23, a4, b6;\n\t"
         "mul.wide.u32  p24, a6, b6;\n\t"
         "mul.wide.u32  p25, a0, b0;\n\t"
         "mul.wide.u32  p26, a0, b2;\n\t"
         "mul.wide.u32  p27, a0, b4;\n\t"
         "mul.wide.u32  p28, a0, b6;\n\t"
         "mul.wide.u32  p29, a1, b7;\n\t"
         "mul.wide.u32  p30, a3, b7;\n\t"
         "mul.wide.u32  p31, a5, b7;\n\t"
         "mul.wide.u32  p32, a7, b7;\n\t"
         "add.u64.cc    p3, p3, p1;\n\t"
         "addc.u64.cc   p4, p4, p2;\n\t"
         "addc.u32      cy, 0, 0;\n\t"
         "add.u64.cc    p6, p6, p3;\n\t"
         "addc.u64.cc   p7, p7, p4;\n\t"
         "mov.b64       lcy, {cy,0};\n\t"
         "addc.u64.cc   p8, p8, lcy;\n\t"
         "addc.u32      cy, 0, 0;\n\t"
         "add.u64.cc    p9, p9, p5;\n\t"
         "addc.u64.cc   p10, p10, p6;\n\t"
         "addc.u64.cc   p11, p11, p7;\n\t"
         "addc.u64.cc   p12, p12, p8;\n\t"
         "addc.u32      cy, cy, 0;\n\t"
         "add.u64.cc    p14, p14, p9;\n\t"
         "addc.u64.cc   p15, p15, p10;\n\t"
         "addc.u64.cc   p16, p16, p11;\n\t"
         "addc.u64.cc   p17, p17, p12;\n\t"
         "mov.b64       lcy, {cy,0};\n\t"
         "addc.u64.cc   p18, p18, lcy;\n\t"
         "addc.u32      cy, 0, 0;\n\t"
         "add.u64.cc    p19, p19, p13;\n\t"
         "addc.u64.cc   p20, p20, p14;\n\t"
         "addc.u64.cc   p21, p21, p15;\n\t"
         "addc.u64.cc   p22, p22, p16;\n\t"
         "addc.u64.cc   p23, p23, p17;\n\t"
         "addc.u64.cc   p24, p24, p18;\n\t"
         "addc.u32      cy, cy, 0;\n\t"
         "add.u64.cc    p26, p26, p19;\n\t"
         "addc.u64.cc   p27, p27, p20;\n\t"
         "addc.u64.cc   p28, p28, p21;\n\t"
         "addc.u64.cc   p29, p29, p22;\n\t"
         "addc.u64.cc   p30, p30, p23;\n\t"
         "addc.u64.cc   p31, p31, p24;\n\t"
         "mov.b64       lcy, {cy,0};\n\t"
         "addc.u64      p32, p32, lcy;\n\t"
         "mul.wide.u32  p33, a7, b0;\n\t"
         "mul.wide.u32  p34, a6, b1;\n\t"
         "mul.wide.u32  p35, a5, b0;\n\t"
         "mul.wide.u32  p36, a5, b2;\n\t"
         "mul.wide.u32  p37, a7, b2;\n\t"
         "mul.wide.u32  p38, a4, b1;\n\t"
         "mul.wide.u32  p39, a4, b3;\n\t"
         "mul.wide.u32  p40, a6, b3;\n\t"
         "mul.wide.u32  p41, a3, b0;\n\t"
         "mul.wide.u32  p42, a3, b2;\n\t"
         "mul.wide.u32  p43, a3, b4;\n\t"
         "mul.wide.u32  p44, a5, b4;\n\t"
         "mul.wide.u32  p45, a7, b4;\n\t"
         "mul.wide.u32  p46, a2, b1;\n\t"
         "mul.wide.u32  p47, a2, b3;\n\t"
         "mul.wide.u32  p48, a2, b5;\n\t"
         "mul.wide.u32  p49, a4, b5;\n\t"
         "mul.wide.u32  p50, a6, b5;\n\t"
         "mul.wide.u32  p51, a1, b0;\n\t"
         "mul.wide.u32  p52, a1, b2;\n\t"
         "mul.wide.u32  p53, a1, b4;\n\t"
         "mul.wide.u32  p54, a1, b6;\n\t"
         "mul.wide.u32  p55, a3, b6;\n\t"
         "mul.wide.u32  p56, a5, b6;\n\t"
         "mul.wide.u32  p57, a7, b6;\n\t"
         "mul.wide.u32  p58, a0, b1;\n\t"
         "mul.wide.u32  p59, a0, b3;\n\t"
         "mul.wide.u32  p60, a0, b5;\n\t"
         "mul.wide.u32  p61, a0, b7;\n\t"
         "mul.wide.u32  p62, a2, b7;\n\t"
         "mul.wide.u32  p63, a4, b7;\n\t"
         "mul.wide.u32  p64, a6, b7;\n\t"
         "add.u64.cc    p34, p34, p33;\n\t"
         "addc.u32      cy, 0, 0;\n\t"
         "add.u64.cc    p36, p36, p34;\n\t"
         "mov.b64       lcy, {cy,0};\n\t"
         "addc.u64.cc   p37, p37, lcy;\n\t"
         "addc.u32      cy, 0, 0;\n\t"
         "add.u64.cc    p38, p38, p35;\n\t"
         "addc.u64.cc   p39, p39, p36;\n\t"
         "addc.u64.cc   p40, p40, p37;\n\t"
         "addc.u32      cy, cy, 0;\n\t"
         "add.u64.cc    p42, p42, p38;\n\t"
         "addc.u64.cc   p43, p43, p39;\n\t"
         "addc.u64.cc   p44, p44, p40;\n\t"
         "mov.b64       lcy, {cy,0};\n\t"
         "addc.u64.cc   p45, p45, lcy;\n\t"
         "addc.u32      cy, 0, 0;\n\t"
         "add.u64.cc    p46, p46, p41;\n\t"
         "addc.u64.cc   p47, p47, p42;\n\t"
         "addc.u64.cc   p48, p48, p43;\n\t"
         "addc.u64.cc   p49, p49, p44;\n\t"
         "addc.u64.cc   p50, p50, p45;\n\t"
         "addc.u32      cy, cy, 0;\n\t"
         "add.u64.cc    p52, p52, p46;\n\t"
         "addc.u64.cc   p53, p53, p47;\n\t"
         "addc.u64.cc   p54, p54, p48;\n\t"
         "addc.u64.cc   p55, p55, p49;\n\t"
         "addc.u64.cc   p56, p56, p50;\n\t"
         "mov.b64       lcy, {cy,0};\n\t"
         "addc.u64.cc   p57, p57, lcy;\n\t"
         "addc.u32      cy, 0, 0;\n\t"
         "add.u64.cc    p58, p58, p51;\n\t"
         "addc.u64.cc   p59, p59, p52;\n\t"
         "addc.u64.cc   p60, p60, p53;\n\t"
         "addc.u64.cc   p61, p61, p54;\n\t"
         "addc.u64.cc   p62, p62, p55;\n\t"
         "addc.u64.cc   p63, p63, p56;\n\t"
         "addc.u64.cc   p64, p64, p57;\n\t"
         "addc.u32      cy, cy, 0;\n\t"
         "mov.b64       {r0,r1}, p25;\n\t"
         "mov.b64       {r2,r3}, p26;\n\t"
         "mov.b64       {r4,r5}, p27;\n\t"
         "mov.b64       {r6,r7}, p28;\n\t"
         "mov.b64       {r8,r9}, p29;\n\t"
         "mov.b64       {r10,r11}, p30;\n\t"
         "mov.b64       {r12,r13}, p31;\n\t"
         "mov.b64       {r14,r15}, p32;\n\t"
         "mov.b64       {t0,t1}, p58;\n\t"
         "mov.b64       {t2,t3}, p59;\n\t"
         "mov.b64       {t4,t5}, p60;\n\t"
         "mov.b64       {t6,t7}, p61;\n\t"
         "mov.b64       {t8,t9}, p62;\n\t"
         "mov.b64       {t10,t11}, p63;\n\t"
         "mov.b64       {t12,t13}, p64;\n\t"
         "add.u32.cc    r1, r1, t0;\n\t"
         "addc.u32.cc   r2, r2, t1;\n\t"
         "addc.u32.cc   r3, r3, t2;\n\t"
         "addc.u32.cc   r4, r4, t3;\n\t"
         "addc.u32.cc   r5, r5, t4;\n\t"
         "addc.u32.cc   r6, r6, t5;\n\t"
         "addc.u32.cc   r7, r7, t6;\n\t"
         "addc.u32.cc   r8, r8, t7;\n\t"
         "addc.u32.cc   r9, r9, t8;\n\t"
         "addc.u32.cc   r10, r10, t9;\n\t"
         "addc.u32.cc   r11, r11, t10;\n\t"
         "addc.u32.cc   r12, r12, t11;\n\t"
         "addc.u32.cc   r13, r13, t12;\n\t"
         "addc.u32.cc   r14, r14, t13;\n\t"
         "addc.u32      r15, r15, cy;\n\t"
         "mov.b64       %0, {r0,r1};\n\t"
         "mov.b64       %1, {r2,r3};\n\t"
         "mov.b64       %2, {r4,r5};\n\t"
         "mov.b64       %3, {r6,r7};\n\t"
         "mov.b64       %4, {r8,r9};\n\t"
         "mov.b64       %5, {r10,r11};\n\t"
         "mov.b64       %6, {r12,r13};\n\t"
         "mov.b64       %7, {r14,r15};\n\t"
         "}"
         : "=l"(lo.x), "=l"(lo.y), "=l"(lo.z), "=l"(lo.w),
           "=l"(hi.x), "=l"(hi.y), "=l"(hi.z), "=l"(hi.w)
         : "l"(a.x), "l"(a.y), "l"(a.z), "l"(a.w),
           "l"(b.x), "l"(b.y), "l"(b.z), "l"(b.w));
    res[0] = lo.x; res[1] = lo.y; res[2] = lo.z; res[3] = lo.w;
    res[4] = hi.x; res[5] = hi.y; res[6] = hi.z; res[7] = hi.w;
}

Results and a sincere thank you

Metric V3 V4 Change
SM throughput 84.59% 82.20% -2.39%
DRAM throughput 20.40% 21.25% +0.85%
L1 throughput 4.98% 5.19% +0.21%

speedup over V3: ~+3.2%, bringing the cumulative gain over the original kernel to ~+8.5%.

The signature is unmistakable now: SM throughput visibly drops (less compute work per result), memory-side metrics rise (their relative share grows), and wall-clock throughput climbs. The V3->V4 step shows the largest single jump in SM throughput reduction (-2.4 percentage points), which makes sense — going from ~152 instructions to ~108 is the biggest single instruction count reduction in the entire optimization journey.

product testing: Yes, your code delivers real, validated results. Test correctness was verified at every integration step (1008/1008 tests passed including 1000 random 256×256 multiplications with comparison against an independent CPU reference). The kernel produces the expected output across many hours of continuous operation. The increased throughput is genuine work done faster, not numerical artifacts.

Where we stand: Looking at the full progression original baseline at 82.5% SM throughput producing X work units per second, now at 82.2% SM throughput producing ~1.085·X this is exactly what “minimize dynamic instruction count” looks like in practice. The SM utilization is back near the original level, but each cycle now produces measurably more useful output.

I think we’ve reached the point of diminishing returns on this specific function. The 256×256 multiply has gone from ~200 instructions to 108 about as compact as it can reasonably be without algorithmic changes (Karatsuba, Toom-Cook). Any remaining performance gap in the kernel is likely outside the multiply itself now: in the surrounding modular arithmetic, the carry-propagation across the full algorithm.

Genuinely, thank you for the depth of engagement here. Across multiple iterations spanning composited V1, monolithic V2, triangle-pattern V3, and now the full monolithic V4, you delivered code that progressively reduced dynamic instruction count and translated each reduction into measurable wall-clock gains in production. The methodology establish baseline, iterate, measure, refine was as valuable as the code itself.

For any future readers of this thread: the journey from ~200 instructions for 256×256 multiply down to 108, and the corresponding ~+8.5% kernel-level speedup, is documented here as a complete worked example of GPU integer optimization on Ada Lovelace.

Yeah, I think we are done here. At least I cannot think of anything else to try at this time. This was a fun exercise that allowed me to refresh some of my skills. Thank you for asking a detailed well-reasoned question in the forum that got the thread started.

BTW, I mentioned a researcher earlier in the thread who I recalled being active in the 2010-2020 timeframe who worked on long-integer arithmetic on GPUs. I mistakenly remembered him as Irish, maybe on account of the name, but it appears he may actually be American:

Niall Emmart, A study of high performance multiple precision arithmetic on graphic processing units. PhD dissertation, University of Massachusetts Amherst, February 2018.

The thesis is open access and available for download. Chapter 6 of the thesis is entitled “Modular exponentiation across multiple generations of GPUs.” Given the publication date, the specifics covered may no longer apply to modern GPUs, but it may contain some ideas that are still of general interest. Google Scholar shows the author as still being somewhat active in topics related to cryptography; I see one publication on implementing ECC with double-precision computation, for example.

Thank you so much @njuffa, both for the deep technical engagement throughout this thread and for the literature pointer. The Emmart thesis looks like a treasure trove Chapter 6 on modular exponentiation across GPU generations is directly relevant to my domain, and Chapter 7’s exploration of double-precision floating point for ECC is a genuinely surprising angle I hadn’t considered.

The improvements you contributed translate to real, sustained throughput gains in production. The methodology establish baseline first, iterate with measurement at each step, use the SASS-level evidence to guide decisions was as instructive as the code. I’ll be applying that approach to other GPU optimization problems.

Wishing you continued enjoyable retirement projects. Threads like this are why the NVIDIA developer forum remains valuable.

@njuffa In your closing message, you recommended Niall Emmart’s dissertation Chapter 7 (FP64 for ECC). For a 256-bit ECC kernel on RTX 4090 (1.29 TFLOPS FP64 vs 82.6 TFLOPS FP32), would you expect FP64-based multiplication to outperform the current IMAD.WIDE-based implementation? Or is the FP64 throughput penalty on consumer Ada cards a deal-breaker?

I have not looked at ECC based on double-precision computation, but the double-precision throughput on modern consumer GPUs is so low that it is highly unlikely to be useful for this. Some while ago I posted example code in this forum that demonstrated that integer-based emulation of double-precision computation can be competitive with or even faster than the FP64 hardware, at a substantial cost in register usage. I think I tried multiplication, division, and exp(). OK, found the thread:

For ECC using double-precision computation on GPUs it would probably be better to consult this publication which is more recent than Emmart’s PhD thesis:

Lili Gao, Fangyu Zheng, Rong Wei, Jiankuo Dong, Niall Emmart, Yuan Ma, “DPF-ECC: A Framework for Efficient ECC With Double Precision Floating-Point Computing Power.” IEEE Transactions on Information Forensics and Security, Vol. 16, Jul. 2021, pp. 3988 - 4002. From the abstract:

The experimental result in Tesla P100 achieves a record-setting performance that outperforms the existing fastest integer work with 2x to 3x throughput.

Thank you for the DPF-ECC paper reference. Quick clarification before I invest weeks studying it: the paper reports results on Tesla P100, which has FP64:FP32 ratio of 1:2. My target hardware is RTX 4090 with FP64:FP32 ratio of 1:64 (about 1.29 TFLOPS FP64). Given this 32× FP64 throughput penalty on consumer Ada cards, would you expect the DPF-ECC approach to still outperform optimized integer ECC on RTX 4090, or is this technique only viable on datacenter GPUs (P100/V100/A100/H100)?

Correct. I have not read the paper, but the abstract claims a 2x - 3x advantage from use of FP64 over an integer-based implementation on a data center GPU where FP64 runs at 1/2 the throughput of FP32 / INT32. Now, if instead you were to use the FP64 implementation on a consumer GPU where FP64 runs at a throughput of 1/32 (on older models) or 1/64 (on newer models) of FP32 / INT32 this would be slower than an integer-based implementation by roughly a decimal order of magnitude.

@njuffa the kernel as a whole now shows a clear
INT-pipeline imbalance in the SASS instruction mix:

IMAD (FMA pipe): 3753
IADD3 (INT pipe): 3504
LOP3 (INT pipe): 2279
SHF (INT pipe): 3252

INT total: 9035
FMA total: 3753
Ratio: 2.4 : 1 (INT-heavy)

The Math Pipe Throttle stall (2.23 cycles/inst) confirms the INT
pipeline is the bottleneck, exactly as another forum participant
predicted.

Looking at WHERE the INT operations come from outside of nj_umul256wide():

  • ModSub256: 4× IADD3 carry chain per call (~8000 calls per launch)
  • ShiftR62 in _DivStep62 / _ModInv: pure SHF chains
  • The 32-bit reduction step in _ModMult after nj_umul256wide:
    8× mad.lo + 8× madc.hi + 8× addc carry chain (uses both pipes
    but the addc tail ends in INT)

Specific question: in the 32-bit reduction code I added after
nj_umul256wide() to fold the high 256 bits using the secp256k1
prime form (2^256 ≡ 977 + 2^32 mod p), the relevant chunk looks
like this in PTX:

mul.lo.u32 t0, h0, 977;
mul.hi.u32 t1, h0, 977;
mad.lo.cc.u32 t1, h1, 977, t1;
madc.hi.u32 t2, h1, 977, 0;

// followed by add.cc / addc.cc carry chain across t1..t8

That carry chain is in the INT pipeline. Is there a way to fold the
“add t1, t1, h0” / “addc t2, t2, h1” / etc. step (the second-pass
carry propagation) into the multiplications themselves via
mad.lo.cc.u32 chains, or does the dependency on the prior multiply’s
result fundamentally serialize this?

In other words: I’m asking if the secp256k1-specific reduction
(special-form prime, single multiply by 977 + add high half shifted
by 32) can be restructured so more of it executes on the FMA pipe
instead of the INT pipe.

Even small gains here matter - at 80.7% ALU pipeline saturation,
shifting a few hundred cycles per kernel launch from INT to FMA
could measurably help.

Or is the conclusion just: “subtractions and final carry-merge
chains stay on INT, accept it”?

Without source code to look at, I am having a hard time envisioning what is being described. In general, other than for the odd-even column-shifting addition at the end, an n x nn bit multiplication can be mapped completely into IMAD.WIDE primitives, whereas an n x n2n bit multiplication requires additions for carry propagation besides the IMAD.WIDEs. With plain IMAD.cc without double-wide result, an n x nn bit multiplication requires no separate additions at all, but as we found out earlier, these IMADs require emulation on Ampere.

There are techniques for minimizing carry propagation that use chunks that are smaller than the word size. For example, if the chunks were 28 bits each, then columns of up to 256 partial products could be accumulated by IMAD.WIDE in a 64-bit sum, requiring only the occasional transfer of a multi-bit carry, possibly even just once at the end of the operation. A floating-point variant of this is to use float chunks in combination with double accumulation of partial products and was practically used around the year 2000, if I recall correctly. But these techniques have a bit of additional overhead for the non-word size chunking which make them useful for operands in the thousands of bits (think RSA with 2048-bit keys) rather than operands length in the low hundreds of bits, as we have here.

Here’s the relevant source. The reduction step lives in _ModMult after
the call to your nj_umul256wide(). It takes the upper 256 bits of the
512-bit product and folds them back via the secp256k1 special-form
prime: p = 2^256 - 2^32 - 977, so 2^256 ≡ 977 + 2^32 (mod p).

The fold is: r_low_256 + (r_high_256 * 977) + (r_high_256 << 32)
followed by one extra round of folding in case the addition produced
a top-byte carry.

__device__ __forceinline__ void _ModMult(uint64_t* r_out, uint64_t* a_in, uint64_t* b_in)
{
    uint64_t r512[8];
    ulonglong4 a4 = make_ulonglong4(a_in[0], a_in[1], a_in[2], a_in[3]);
    ulonglong4 b4 = make_ulonglong4(b_in[0], b_in[1], b_in[2], b_in[3]);
    nj_umul256wide(a4, b4, r512);

    // Split high 256 bits into 8 × u32 for 32-bit reduction
    uint32_t high32[8];
    high32[0] = (uint32_t)r512[4];
    high32[1] = (uint32_t)(r512[4] >> 32);
    high32[2] = (uint32_t)r512[5];
    high32[3] = (uint32_t)(r512[5] >> 32);
    high32[4] = (uint32_t)r512[6];
    high32[5] = (uint32_t)(r512[6] >> 32);
    high32[6] = (uint32_t)r512[7];
    high32[7] = (uint32_t)(r512[7] >> 32);

    uint32_t r0 = (uint32_t)r512[0];
    uint32_t r1 = (uint32_t)(r512[0] >> 32);
    uint32_t r2 = (uint32_t)r512[1];
    uint32_t r3 = (uint32_t)(r512[1] >> 32);
    uint32_t r4 = (uint32_t)r512[2];
    uint32_t r5 = (uint32_t)(r512[2] >> 32);
    uint32_t r6 = (uint32_t)r512[3];
    uint32_t r7 = (uint32_t)(r512[3] >> 32);

    asm(
        "{\n\t"
        ".reg .u32 h0,h1,h2,h3,h4,h5,h6,h7;\n\t"
        ".reg .u32 t0,t1,t2,t3,t4,t5,t6,t7,t8;\n\t"
        ".reg .u32 c;\n\t"
        "mov.u32 h0, %8;\n\t"  "mov.u32 h1, %9;\n\t"
        "mov.u32 h2, %10;\n\t" "mov.u32 h3, %11;\n\t"
        "mov.u32 h4, %12;\n\t" "mov.u32 h5, %13;\n\t"
        "mov.u32 h6, %14;\n\t" "mov.u32 h7, %15;\n\t"

        // First pass: t = h * 977  (this part is already FMA-pipe heavy via mad.lo/madc.hi)
        "mul.lo.u32      t0, h0, 977;\n\t"
        "mul.hi.u32      t1, h0, 977;\n\t"
        "mad.lo.cc.u32   t1, h1, 977, t1;\n\t"
        "madc.hi.u32     t2, h1, 977, 0;\n\t"
        "mad.lo.cc.u32   t2, h2, 977, t2;\n\t"
        "madc.hi.u32     t3, h2, 977, 0;\n\t"
        "mad.lo.cc.u32   t3, h3, 977, t3;\n\t"
        "madc.hi.u32     t4, h3, 977, 0;\n\t"
        "mad.lo.cc.u32   t4, h4, 977, t4;\n\t"
        "madc.hi.u32     t5, h4, 977, 0;\n\t"
        "mad.lo.cc.u32   t5, h5, 977, t5;\n\t"
        "madc.hi.u32     t6, h5, 977, 0;\n\t"
        "mad.lo.cc.u32   t6, h6, 977, t6;\n\t"
        "madc.hi.u32     t7, h6, 977, 0;\n\t"
        "mad.lo.cc.u32   t7, h7, 977, t7;\n\t"
        "madc.hi.u32     t8, h7, 977, 0;\n\t"

        // Now add the high half shifted by 32 bits: t += (h << 32)
        // This is the part that ends up purely on the INT pipeline
        "add.cc.u32      t1, t1, h0;\n\t"
        "addc.cc.u32     t2, t2, h1;\n\t"
        "addc.cc.u32     t3, t3, h2;\n\t"
        "addc.cc.u32     t4, t4, h3;\n\t"
        "addc.cc.u32     t5, t5, h4;\n\t"
        "addc.cc.u32     t6, t6, h5;\n\t"
        "addc.cc.u32     t7, t7, h6;\n\t"
        "addc.u32        t8, t8, h7;\n\t"

        // Add the folded high to the low 256 bits
        "add.cc.u32      %0, %0, t0;\n\t"
        "addc.cc.u32     %1, %1, t1;\n\t"
        "addc.cc.u32     %2, %2, t2;\n\t"
        "addc.cc.u32     %3, %3, t3;\n\t"
        "addc.cc.u32     %4, %4, t4;\n\t"
        "addc.cc.u32     %5, %5, t5;\n\t"
        "addc.cc.u32     %6, %6, t6;\n\t"
        "addc.cc.u32     %7, %7, t7;\n\t"
        "addc.u32        t8, t8, 0;\n\t"

        // Second pass: fold any top-word overflow back via *977
        "mul.lo.u32      t0, t8, 977;\n\t"
        "mul.hi.u32      t1, t8, 977;\n\t"
        "add.cc.u32      %0, %0, t0;\n\t"
        "addc.cc.u32     %1, %1, t1;\n\t"
        "addc.u32        c, 0, 0;\n\t"
        "add.cc.u32      %1, %1, t8;\n\t"
        "addc.cc.u32     %2, %2, c;\n\t"
        "addc.cc.u32     %3, %3, 0;\n\t"
        "addc.cc.u32     %4, %4, 0;\n\t"
        "addc.cc.u32     %5, %5, 0;\n\t"
        "addc.cc.u32     %6, %6, 0;\n\t"
        "addc.u32        %7, %7, 0;\n\t"
        "}\n\t"
        : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3),
          "+r"(r4), "+r"(r5), "+r"(r6), "+r"(r7)
        : "r"(high32[0]), "r"(high32[1]), "r"(high32[2]), "r"(high32[3]),
          "r"(high32[4]), "r"(high32[5]), "r"(high32[6]), "r"(high32[7])
    );

    r_out[0] = ((uint64_t)r1 << 32) | r0;
    r_out[1] = ((uint64_t)r3 << 32) | r2;
    r_out[2] = ((uint64_t)r5 << 32) | r4;
    r_out[3] = ((uint64_t)r7 << 32) | r6;
}

The first block (mul.lo + mad.lo.cc / madc.hi.u32 chain doing h × 977)
uses the FMA pipe for the multiplications, with only the carry
propagation between mad.lo.cc and madc.hi.u32 going through INT.

The second block (add.cc.u32 / addc.cc.u32 chain doing t += h_shifted_32)
is pure INT pipe, ~16 sequential addc operations chained together.

The third block (folding final overflow t8 × 977 + carry out) is short
but again ends in an addc chain.

Specific question: is there a way to fold the “t += h_shifted_by_32”
step into mad.lo.cc.u32 chains so it executes on FMA? Conceptually it
would be something like:

mad.lo.cc.u32 t1, h1, 977, t1; // already there
// …could we add h0 here too somehow?

Or does the structure of needing “result = mul_lo + carry_from_prior +
shifted_word” fundamentally require a separate add.cc step on INT?

If this can’t be folded, I’ll accept the 80.7% ALU as the ceiling.
But I wanted to ask before trying restructuring on my own — last time
I tried “obvious” optimizations without checking, I broke compiler
fusion and lost 1.5%.

Note that IMAD.WIDE has half the throughput of IMAD/IMAD.HI and occupies the FMA pipeline for 4 cycles, so your pipeline utilisation ratio should be closer to 1.2 : 1.

What are the reported FMA and ALU pipeline utilisations in Nsight Compute?