Potential bug in CUDA 13.2

Hi,

Whilst compiling Ginkgo and investigating test failures seen on the Blackwell architecture with my RTX 5070 Ti GPU, I think I’ve uncovered a potential bug with register carry over in NVCC. I’ve pasted a self-contained .cu file below which can be used to reproduce the bug.

It would be nice to work out how to debug this further and confirm whether it really is a bug in NVCC, or just a misunderstanding about the best way to structure the code.

Apologies if this isn’t quite the best place to post the issue, as it is my first posting.

Thank you

David

// scan_bug_repro.cu
//
// ============================================================
// Standalone CUDA reproduction of a parallel segment-inclusive-prefix-scan
// correctness bug on NVIDIA Blackwell (sm_120) with NVCC 13.2.
//
// BACKGROUND
//   Ginkgo's merge-path CSR SpMV kernel uses a device function
//   "block_segment_scan_reverse" — a Hillis-Steele parallel inclusive prefix
//   sum within same-indexed contiguous segments of a 128-element shared-memory
//   array.  On sm_120 / NVCC 13.2 this function, marked __device__
//   __forceinline__, produces wrong accumulated values.  An equivalent serial
//   scan (thread 0 walks the array sequentially) gives correct results on the
//   same GPU.  The function lives in:
//     common/cuda_hip/matrix/csr_kernels.template.cpp
//     block_segment_scan_reverse()
//
// WHAT THIS FILE DOES
//   Two primary code paths (serial vs. parallel) are run on two test
//   scenarios and four data types.  For debugging, two additional variants of
//   the parallel path are provided:
//     A  — parallel, __forceinline__, #pragma unroll  (verbatim Ginkgo)
//     B  — parallel, __forceinline__, no #pragma unroll
//     C  — parallel, __noinline__,   #pragma unroll
//     D  — serial reference (known correct)
//
// ── HOW TO COMPILE ────────────────────────────────────────────────────────────
//
//   Blackwell (to reproduce the bug):
//     nvcc -arch=sm_120 -O2 -std=c++17 -o scan_bug scan_bug_repro.cu
//
//   With debug info (add -G, disables most optimisation):
//     nvcc -arch=sm_120 -G -std=c++17 -o scan_bug_g scan_bug_repro.cu
//
//   Ampere / older GPU (expected to pass on all variants):
//     nvcc -arch=sm_80 -O2 -std=c++17 -o scan_bug_sm80 scan_bug_repro.cu
//
//   Dump PTX for inspection (look for the bar.sync / __syncthreads codegen):
//     nvcc -arch=sm_120 -O2 -std=c++17 --ptx -o scan_bug.ptx scan_bug_repro.cu
//
//   Dump SASS (post-link machine code):
//     nvcc -arch=sm_120 -O2 -std=c++17 -cubin -o scan_bug.cubin scan_bug_repro.cu
//     cuobjdump --dump-sass scan_bug.cubin
//
// ── EXPECTED (correct) OUTPUT ─────────────────────────────────────────────────
//   All rows (serial AND parallel) should print PASS.
//
// ── OBSERVED OUTPUT on sm_120 / NVCC 13.2 ────────────────────────────────────
//
//   Serial (D) rows:               ALL PASS
//   Parallel A, B, C rows:         ALL FAIL
//
//   Scenario 1 (128-element single segment, all val=1.0):
//     Expected val[127] = 128  (inclusive prefix sum of 128 ones)
//     Observed val[127] =   8
//     Note: 8 = 1 + 7 = initial_value + log2(BLOCK_N)
//     This means every iteration adds exactly 1 (the original seed value).
//     Iteration i reads val[tid-i] = 1 (stale original) instead of the
//     accumulated sum written by the previous iteration.
//
//   Scenario 2 (multi-segment SpMV carry):
//     Expected val[127] = 117
//     Observed val[127] =  60
//     Verbose dump of segment-0 shows [1,2,3,3,4,4,4,4,5,5,5,5,...] instead of
//     [1,2,3,4,5,...,12].  val[3]=3 instead of 4: the i=2 iteration reads
//     val[1]=1 (original) not val[1]=2 (result of the i=1 iteration).
//     The pattern is identical for all three parallel variants (A, B, C), ruling
//     out #pragma unroll and __forceinline__ as sole causes.
//
// ── ROOT CAUSE HYPOTHESIS ─────────────────────────────────────────────────────
//
//   The Hillis-Steele body is:
//     auto temp = val[threadIdx.x - i];   // read neighbour
//     __syncthreads();
//     val[threadIdx.x] += temp;           // update self
//     __syncthreads();
//
//   The __restrict__ qualifier on the "val" parameter (a __shared__ pointer)
//   tells the compiler the pointer is unaliased.  NVCC 13.2 / sm_120 appears to
//   perform REGISTER PROMOTION of val[threadIdx.x]: the += is kept in a register
//   and no "st.shared" instruction is generated before the barrier.  Threads
//   that read val[tid-i] therefore see the original (stale) shared memory value
//   from the previous store — i.e., every iteration effectively accumulates from
//   the initial array rather than from the prior iteration's result.
//
//   Evidence:
//     val[127] = 1 + 7·1 = 8   (each of 7 iterations contributes only the
//                                initial value 1, never the running sum)
//
//   This is consistent with a missing "st.shared" / "bar.sync" sequence in the
//   PTX/SASS.  To confirm, disassemble the cubin and look for:
//     (correct)  ST.SHARED ... [val+tid*4]     followed by MEMBAR.CTA
//     (buggy)    no store to shared before bar.sync
//
//   Removing __restrict__ from the val parameter may be sufficient to disable
//   register promotion and restore correctness — test this hypothesis by editing
//   the scan_A/B/C declarations below.
//
// ── NOTES ─────────────────────────────────────────────────────────────────────
//   * The original Ginkgo code uses cooperative_groups::this_thread_block().sync()
//     instead of __syncthreads().  Replacing it with bare __syncthreads() does
//     NOT fix the bug on sm_120, confirming this is not a cooperative-groups
//     compilation issue.
//   * All four 32-bit and 64-bit floating-point variants (float, double,
//     complex<float>, complex<double>) fail with identical wrong values.
//     The bug is not dtype-specific.
//   * Parallel variants A (forceinline+unroll), B (forceinline-nounroll), and
//     C (noinline+unroll) all produce the same wrong values (8 and 60),
//     confirming neither __forceinline__ nor #pragma unroll is the sole trigger.
//   * Variant E is identical to A but with __restrict__ removed from "val".
//     If E passes while A/B/C fail, __restrict__ on a __shared__ pointer is
//     the direct cause.  This would confirm a compiler bug: NVCC should not
//     allow register promotion of val[threadIdx.x] to eliminate a shared memory
//     store before a __syncthreads() barrier, regardless of aliasing hints.
//   * The bug was confirmed absent on sm_80 (Ampere A100), only appearing on
//     sm_120 (Blackwell RTX 5070 Ti) with NVCC 13.2.
//   * Workaround applied in Ginkgo: replaced Hillis-Steele with a serial scan
//     executed by thread 0, with explicit __syncthreads() before and after.
// ============================================================

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// ── Constants ────────────────────────────────────────────────────────────────
//   BLOCK_N must be a power of 2: the Hillis-Steele loop doubles its stride
//   (i <<= 1) and assumes it covers the whole array after log2(BLOCK_N) steps.
//   Ginkgo uses 128 (= 4 warps * warp_size); this is also the launch block size.
static constexpr int BLOCK_N = 128;

// ── CUDA error checking ───────────────────────────────────────────────────────
//   Aborts with file/line context on any failed runtime call.  Used after
//   every API call below; kernel execution errors surface through the
//   cudaDeviceSynchronize() wrapped by this macro in the RUN_* helpers.
#define CUDA_CHECK(call)                                                       \
    do {                                                                       \
        cudaError_t _e = (call);                                               \
        if (_e != cudaSuccess) {                                               \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",                      \
                    __FILE__, __LINE__, cudaGetErrorString(_e));                \
            exit(1);                                                           \
        }                                                                      \
    } while (0)

// ============================================================
// SCAN IMPLEMENTATIONS
// ============================================================

// ── Variant A: Parallel Hillis-Steele, __forceinline__, #pragma unroll ────────
//   Verbatim copy of Ginkgo's block_segment_scan_reverse.
//   __syncthreads() replaces cooperative_groups::this_thread_block().sync()
//   (the two are functionally identical; the replacement does not change the
//   failure mode).
//
//   ind[] : non-decreasing segment labels (shared memory, size BLOCK_N)
//   val[] : per-thread values; on return holds inclusive prefix sum per segment
//   returns true if this thread is the last in its segment
template <typename T>
__device__ __forceinline__ bool scan_A_parallel_unroll(
    const int* __restrict__ ind, T* __restrict__ val)
{
    bool last       = true;
    const int my_row = ind[threadIdx.x];   // this thread's segment label
#pragma unroll
    for (int i = 1; i < BLOCK_N; i <<= 1) {
        // First pass only: if the next element shares my label, I am not the
        // last element of my segment.
        if (i == 1 && threadIdx.x < BLOCK_N - 1 &&
            my_row == ind[threadIdx.x + 1]) {
            last = false;
        }
        // Read the value i positions back, but only if that position exists
        // and lies in the same segment.  All reads complete before the first
        // barrier and all writes happen after it, so within one iteration
        // there is no read/write race — the classic read/sync/write/sync
        // Hillis-Steele shape.  All threads reach both barriers (the
        // conditionals only guard the assignments), so the barriers are safe.
        T temp = T(0);
        if (threadIdx.x >= i && my_row == ind[threadIdx.x - i]) {
            temp = val[threadIdx.x - i];
        }
        __syncthreads();
        val[threadIdx.x] += temp;   // must be stored to shared memory here …
        __syncthreads();            // … and published before the next stride
    }
    return last;                    // true iff last element of its segment
}

// ── Variant B: Parallel Hillis-Steele, __forceinline__, NO #pragma unroll ─────
//   Does removing unrolling change the behaviour?
template <typename T>
__device__ __forceinline__ bool scan_B_parallel_nounroll(
    const int* __restrict__ ind, T* __restrict__ val)
{
    bool last       = true;
    const int my_row = ind[threadIdx.x];   // this thread's segment label
    // Identical to variant A except the loop is NOT #pragma unroll'd;
    // isolates unrolling as a possible trigger of the miscompile.
    for (int i = 1; i < BLOCK_N; i <<= 1) {
        if (i == 1 && threadIdx.x < BLOCK_N - 1 &&
            my_row == ind[threadIdx.x + 1]) {
            last = false;
        }
        // read neighbour (if same segment) before the barrier …
        T temp = T(0);
        if (threadIdx.x >= i && my_row == ind[threadIdx.x - i]) {
            temp = val[threadIdx.x - i];
        }
        __syncthreads();
        val[threadIdx.x] += temp;   // … write self after it
        __syncthreads();
    }
    return last;
}

// ── Variant C: Parallel Hillis-Steele, __noinline__, #pragma unroll ───────────
//   Does preventing inlining into the caller change the behaviour?
template <typename T>
__device__ __noinline__ bool scan_C_parallel_noinline(
    const int* __restrict__ ind, T* __restrict__ val)
{
    bool last       = true;
    const int my_row = ind[threadIdx.x];   // this thread's segment label
    // Identical to variant A except __noinline__ replaces __forceinline__;
    // isolates inlining into the caller as a possible trigger.
#pragma unroll
    for (int i = 1; i < BLOCK_N; i <<= 1) {
        if (i == 1 && threadIdx.x < BLOCK_N - 1 &&
            my_row == ind[threadIdx.x + 1]) {
            last = false;
        }
        // read neighbour (if same segment) before the barrier …
        T temp = T(0);
        if (threadIdx.x >= i && my_row == ind[threadIdx.x - i]) {
            temp = val[threadIdx.x - i];
        }
        __syncthreads();
        val[threadIdx.x] += temp;   // … write self after it
        __syncthreads();
    }
    return last;
}

// ── Variant E: Parallel Hillis-Steele, __forceinline__, #pragma unroll ────────
//   Identical to Variant A EXCEPT __restrict__ is removed from the "val"
//   parameter.  Tests the hypothesis that register promotion of val[threadIdx.x]
//   (enabled by __restrict__ on a shared pointer) is the root cause of the bug.
//   If this variant passes while A fails, __restrict__ is the trigger.
template <typename T>
__device__ __forceinline__ bool scan_E_parallel_norestrict(
    const int* __restrict__ ind, T* val)   // <-- no __restrict__ on val
{
    bool last       = true;
    const int my_row = ind[threadIdx.x];   // this thread's segment label
    // Identical to variant A except "val" is NOT __restrict__-qualified;
    // the one-token difference is the entire experiment — it tests whether
    // the aliasing hint enables the suspected register promotion of
    // val[threadIdx.x] across the barrier.
#pragma unroll
    for (int i = 1; i < BLOCK_N; i <<= 1) {
        if (i == 1 && threadIdx.x < BLOCK_N - 1 &&
            my_row == ind[threadIdx.x + 1]) {
            last = false;
        }
        // read neighbour (if same segment) before the barrier …
        T temp = T(0);
        if (threadIdx.x >= i && my_row == ind[threadIdx.x - i]) {
            temp = val[threadIdx.x - i];
        }
        __syncthreads();
        val[threadIdx.x] += temp;   // … write self after it
        __syncthreads();
    }
    return last;
}

// ── Variant D: Serial scan by thread 0 (reference — known correct) ────────────
//   Thread 0 walks the array sequentially; all other threads wait.
//   Produces correct results on every tested GPU.
template <typename T>
__device__ __forceinline__ bool scan_D_serial(
    const int* __restrict__ ind, T* __restrict__ val)
{
    const int my_row = ind[threadIdx.x];
    // Last-in-segment test: true for the final thread, or whenever the next
    // element carries a different label.
    bool last = (threadIdx.x >= BLOCK_N - 1) ||
                (my_row != ind[threadIdx.x + 1]);

    // Note: ind[threadIdx.x + 1] is guarded by (threadIdx.x >= BLOCK_N - 1)
    // via short-circuit evaluation of ||.  The above is written this way to
    // match the original Ginkgo code; the safe form is an explicit if() branch.

    __syncthreads();                 // all initial val[] writes visible to thread 0
    if (threadIdx.x == 0) {
        // Sequential in-place inclusive scan within each run of equal labels.
        for (int t = 1; t < BLOCK_N; t++)
            if (ind[t] == ind[t - 1]) val[t] += val[t - 1];
    }
    __syncthreads();                 // thread 0's results visible to everyone
    return last;
}

// ============================================================
// KERNEL TEMPLATE
//
// ScanVariant selects which scan implementation to call:
//   0 = serial (D)  — reference
//   1 = parallel A  — __forceinline__ + #pragma unroll   (verbatim Ginkgo)
//   2 = parallel B  — __forceinline__ + no unroll
//   3 = parallel C  — __noinline__    + #pragma unroll
//   4 = parallel E  — __forceinline__ + #pragma unroll, NO __restrict__ on val
//
// Scenario:
//   1 = single segment: all 128 threads share ind=0, val[t]=1
//       expected val[127] = 128
//   2 = multi-segment matching Ginkgo SpMV carry block:
//       ind[0..117] = t/12  (groups of ~12, rows 0-9)
//       ind[118..127] = 99  (the "carry" segment, distinct from others)
//       val[118]      = 9
//       val[119..127] = 12 each
//       val[0..117]   = 1   (these don't affect the last segment's result)
//       expected val[127] = 9 + 9*12 = 117
//
// output[] receives all BLOCK_N post-scan values for inspection.
// ============================================================
template <typename T, int ScanVariant, int Scenario>
__global__ void test_kernel(T* output)
{
    // One block of BLOCK_N threads; one shared-memory slot per thread.
    __shared__ int s_ind[BLOCK_N];
    __shared__ T   s_val[BLOCK_N];

    const int t = static_cast<int>(threadIdx.x);

    // ── Populate shared memory ──────────────────────────────────────────────
    if constexpr (Scenario == 1) {
        // Scenario 1 — one segment spanning the whole block, all ones.
        s_ind[t] = 0;
        s_val[t] = T(1);
    } else {
        // Scenario 2 — multi-segment SpMV carry
        s_ind[t] = (t < 118) ? (t / 12) : 99;
        if      (t == 118) s_val[t] = T(9);
        else if (t >  118) s_val[t] = T(12);
        else               s_val[t] = T(1);
    }
    __syncthreads();   // initial values must be visible before scanning

    // ── Run selected scan ───────────────────────────────────────────────────
    // Exactly one branch survives compilation; the scan's bool result
    // (last-in-segment flag) is deliberately discarded here.
    if constexpr (ScanVariant == 0) scan_D_serial<T>              (s_ind, s_val);
    if constexpr (ScanVariant == 1) scan_A_parallel_unroll<T>     (s_ind, s_val);
    if constexpr (ScanVariant == 2) scan_B_parallel_nounroll<T>   (s_ind, s_val);
    if constexpr (ScanVariant == 3) scan_C_parallel_noinline<T>   (s_ind, s_val);
    if constexpr (ScanVariant == 4) scan_E_parallel_norestrict<T> (s_ind, s_val);

    // ── Write all results to global memory ──────────────────────────────────
    // Both scan families end on a __syncthreads(), so s_val[] is coherent here.
    output[t] = s_val[t];
}

// ============================================================
// HELPERS
// ============================================================

static int g_pass = 0, g_fail = 0;

// Compare one scanned value against its reference and record the outcome in
// the global pass/fail tallies.  An absolute tolerance of 1e-5 (with both
// operands widened to double) is ample, since every expected value in this
// repro is a small exact integer.
template <typename T>
static void check(const char* label, T got, T expected)
{
    const double g = static_cast<double>(got);
    const double e = static_cast<double>(expected);
    const bool matched = std::fabs(g - e) < 1e-5;
    if (matched) {
        ++g_pass;
    } else {
        ++g_fail;
    }
    printf("  %-60s  got %10.4f  exp %10.4f  %s\n",
           label, g, e, matched ? "PASS" : "FAIL ***");
}

// Dump all BLOCK_N post-scan values to stdout (for detailed inspection)
// Print every post-scan element of val[] so the exact point of divergence
// from the expected sequence can be located by eye.
template <typename T>
static void dump_all(const char* label, const T* hbuf)
{
    printf("  [%s] full val[] after scan:\n", label);
    int idx = 0;
    while (idx < BLOCK_N) {
        printf("    val[%3d] = %g\n", idx, static_cast<double>(hbuf[idx]));
        ++idx;
    }
}

// ============================================================
// MAIN
// ============================================================
// Drives every (variant, scenario, dtype) combination, checks the sentinel
// values, and reports a pass/fail summary.  Returns 0 iff all checks pass.
// Pass "-v" to also dump all 128 post-scan values for selected runs.
int main(int argc, char* argv[])
{
    // Any first argument starting with "-v" enables the verbose dumps.
    const bool verbose = (argc > 1 && argv[1][0] == '-' && argv[1][1] == 'v');

    // Print GPU info
    cudaDeviceProp prop{};
    CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
    printf("GPU : %s  (sm_%d%d)\n", prop.name, prop.major, prop.minor);
    printf("BLOCK_N = %d\n\n", BLOCK_N);

    // Device buffers (allocated once, reused across all runs)
    float  *d_f32 = nullptr;
    double *d_f64 = nullptr;
    CUDA_CHECK(cudaMalloc(&d_f32, BLOCK_N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_f64, BLOCK_N * sizeof(double)));

    float  h_f32[BLOCK_N];
    double h_f64[BLOCK_N];

    // Convenience macros: launch, check, synchronise, copy back.
    // cudaGetLastError() right after the launch catches launch-configuration
    // failures that cudaDeviceSynchronize() alone would not clearly attribute
    // to the launch; the synchronize then surfaces any execution error.
#define RUN_F32(Variant, Scen) \
    test_kernel<float,  Variant, Scen><<<1, BLOCK_N>>>(d_f32); \
    CUDA_CHECK(cudaGetLastError()); \
    CUDA_CHECK(cudaDeviceSynchronize()); \
    CUDA_CHECK(cudaMemcpy(h_f32, d_f32, BLOCK_N*sizeof(float),  cudaMemcpyDeviceToHost))

#define RUN_F64(Variant, Scen) \
    test_kernel<double, Variant, Scen><<<1, BLOCK_N>>>(d_f64); \
    CUDA_CHECK(cudaGetLastError()); \
    CUDA_CHECK(cudaDeviceSynchronize()); \
    CUDA_CHECK(cudaMemcpy(h_f64, d_f64, BLOCK_N*sizeof(double), cudaMemcpyDeviceToHost))

    // ── SCENARIO 1: single segment ──────────────────────────────────────────
    // ind[t]=0 for all t, val[t]=1  =>  expected val[127] = 128
    printf("══════════════════════════════════════════════════════════════\n");
    printf("Scenario 1 — single segment (all ind=0, val=1, expect 128)\n");
    printf("══════════════════════════════════════════════════════════════\n");

    RUN_F32(0, 1); check("serial              float32  val[127]", h_f32[127], 128.0f);
    RUN_F32(1, 1); check("parallel A unroll   float32  val[127]", h_f32[127], 128.0f);
    RUN_F32(2, 1); check("parallel B nounrl   float32  val[127]", h_f32[127], 128.0f);
    RUN_F32(3, 1); check("parallel C noinln   float32  val[127]", h_f32[127], 128.0f);
    RUN_F32(4, 1); check("parallel E norestr  float32  val[127]", h_f32[127], 128.0f);

    RUN_F64(0, 1); check("serial              float64  val[127]", h_f64[127], 128.0);
    RUN_F64(1, 1); check("parallel A unroll   float64  val[127]", h_f64[127], 128.0);
    RUN_F64(2, 1); check("parallel B nounrl   float64  val[127]", h_f64[127], 128.0);
    RUN_F64(3, 1); check("parallel C noinln   float64  val[127]", h_f64[127], 128.0);
    RUN_F64(4, 1); check("parallel E norestr  float64  val[127]", h_f64[127], 128.0);

    // ── SCENARIO 2: multi-segment SpMV carry ────────────────────────────────
    // Matches block 0 of the Ginkgo 128x128 all-ones SpMV with IPT=12.
    //   ind[0..117] = t/12   (rows 0-9, 10-12 threads per row)
    //   ind[118..127] = 99   (the carry segment; row 11 in Ginkgo)
    //   val[118]      = 9    (thread 118 partially processed row 11)
    //   val[119..127] = 12   (threads 119-127 each processed 12 nnz in row 11)
    //   val[0..117]   = 1    (irrelevant for val[127])
    // Expected val[127] = 9 + 9*12 = 117
    printf("\n");
    printf("══════════════════════════════════════════════════════════════\n");
    printf("Scenario 2 — multi-segment SpMV carry (expect val[127]=117)\n");
    printf("  ind[0..117]=t/12, ind[118..127]=99\n");
    printf("  val[118]=9, val[119..127]=12, val[0..117]=1\n");
    printf("══════════════════════════════════════════════════════════════\n");

    RUN_F32(0, 2); check("serial              float32  val[127]", h_f32[127], 117.0f);
    if (verbose) dump_all("serial float32", h_f32);

    RUN_F32(1, 2); check("parallel A unroll   float32  val[127]", h_f32[127], 117.0f);
    if (verbose) dump_all("parallel A float32", h_f32);

    RUN_F32(2, 2); check("parallel B nounrl   float32  val[127]", h_f32[127], 117.0f);
    RUN_F32(3, 2); check("parallel C noinln   float32  val[127]", h_f32[127], 117.0f);
    RUN_F32(4, 2); check("parallel E norestr  float32  val[127]", h_f32[127], 117.0f);
    if (verbose) dump_all("parallel E norestr float32", h_f32);

    RUN_F64(0, 2); check("serial              float64  val[127]", h_f64[127], 117.0);
    RUN_F64(1, 2); check("parallel A unroll   float64  val[127]", h_f64[127], 117.0);
    RUN_F64(2, 2); check("parallel B nounrl   float64  val[127]", h_f64[127], 117.0);
    RUN_F64(3, 2); check("parallel C noinln   float64  val[127]", h_f64[127], 117.0);
    RUN_F64(4, 2); check("parallel E norestr  float64  val[127]", h_f64[127], 117.0);

    // ── Also check some intermediate values in scenario 2 ───────────────────
    // For the carry segment (threads 118..127), the correct inclusive prefix
    // sums are:  118->9, 119->21, 120->33, 121->45, 122->57,
    //            123->69, 124->81, 125->93, 126->105, 127->117
    printf("\n");
    printf("── Intermediate values in carry segment (scenario 2, float32) ──\n");
    RUN_F32(0, 2);
    check("serial  float32  val[118]", h_f32[118],   9.0f);
    check("serial  float32  val[122]", h_f32[122],  57.0f);
    check("serial  float32  val[126]", h_f32[126], 105.0f);
    check("serial  float32  val[127]", h_f32[127], 117.0f);

    RUN_F32(1, 2);
    check("parallelA float32 val[118]", h_f32[118],   9.0f);
    check("parallelA float32 val[122]", h_f32[122],  57.0f);
    check("parallelA float32 val[126]", h_f32[126], 105.0f);
    check("parallelA float32 val[127]", h_f32[127], 117.0f);

    RUN_F32(4, 2);
    check("parallelE float32 val[118]", h_f32[118],   9.0f);
    check("parallelE float32 val[122]", h_f32[122],  57.0f);
    check("parallelE float32 val[126]", h_f32[126], 105.0f);
    check("parallelE float32 val[127]", h_f32[127], 117.0f);

    // The convenience macros are scoped to this function; undefine them so
    // they cannot leak if code is ever appended below.
#undef RUN_F32
#undef RUN_F64

    // ── Summary ─────────────────────────────────────────────────────────────
    printf("\n");
    printf("══════════════════════════════════════════════════════════════\n");
    printf("Total: %d PASS, %d FAIL\n", g_pass, g_fail);
    printf("══════════════════════════════════════════════════════════════\n");
    printf("Run with -v for a full dump of all 128 post-scan values.\n");

    CUDA_CHECK(cudaFree(d_f32));
    CUDA_CHECK(cudaFree(d_f64));
    return g_fail > 0 ? 1 : 0;
}