Hi,
Whilst compiling Ginkgo and investigating test failures seen on the Blackwell architecture with my RTX 5070 Ti GPU, I think I’ve uncovered a potential bug with register carry over in NVCC. I’ve pasted a self-contained .cu file below which can be used to reproduce the bug.
It would be nice to work out how to debug this further and determine whether it really is a bug in NVCC or just a misunderstanding about the best way to structure the code.
Apologies if this isn’t quite the best place to post the issue, as it is my first posting.
Thank you
David
// scan_bug_repro.cu
//
// ============================================================
// Standalone CUDA reproduction of a parallel segment-inclusive-prefix-scan
// correctness bug on NVIDIA Blackwell (sm_120) with NVCC 13.2.
//
// BACKGROUND
// Ginkgo's merge-path CSR SpMV kernel uses a device function
// "block_segment_scan_reverse" — a Hillis-Steele parallel inclusive prefix
// sum within same-indexed contiguous segments of a 128-element shared-memory
// array. On sm_120 / NVCC 13.2 this function, marked __device__
// __forceinline__, produces wrong accumulated values. An equivalent serial
// scan (thread 0 walks the array sequentially) gives correct results on the
// same GPU. The function lives in:
// common/cuda_hip/matrix/csr_kernels.template.cpp
// block_segment_scan_reverse()
//
// WHAT THIS FILE DOES
// Two primary code paths (serial vs. parallel) are run on two test
// scenarios and two data types (float, double). For debugging, additional
// variants of the parallel path are provided:
// A — parallel, __forceinline__, #pragma unroll (verbatim Ginkgo)
// B — parallel, __forceinline__, no #pragma unroll
// C — parallel, __noinline__, #pragma unroll
// D — serial reference (known correct)
// E — parallel, identical to A but without __restrict__ on "val"
//
// ── HOW TO COMPILE ────────────────────────────────────────────────────────────
//
// Blackwell (to reproduce the bug):
// nvcc -arch=sm_120 -O2 -std=c++17 -o scan_bug scan_bug_repro.cu
//
// With debug info (add -G, disables most optimisation):
// nvcc -arch=sm_120 -G -std=c++17 -o scan_bug_g scan_bug_repro.cu
//
// Ampere / older GPU (expected to pass on all variants):
// nvcc -arch=sm_80 -O2 -std=c++17 -o scan_bug_sm80 scan_bug_repro.cu
//
// Dump PTX for inspection (look for the bar.sync / __syncthreads codegen):
// nvcc -arch=sm_120 -O2 -std=c++17 --ptx -o scan_bug.ptx scan_bug_repro.cu
//
// Dump SASS (post-link machine code):
// nvcc -arch=sm_120 -O2 -std=c++17 -cubin -o scan_bug.cubin scan_bug_repro.cu
// cuobjdump --dump-sass scan_bug.cubin
//
// ── EXPECTED (correct) OUTPUT ─────────────────────────────────────────────────
// All rows (serial AND parallel) should print PASS.
//
// ── OBSERVED OUTPUT on sm_120 / NVCC 13.2 ────────────────────────────────────
//
// Serial (D) rows: ALL PASS
// Parallel A, B, C rows: ALL FAIL
//
// Scenario 1 (128-element single segment, all val=1.0):
// Expected val[127] = 128 (inclusive prefix sum of 128 ones)
// Observed val[127] = 8
// Note: 8 = 1 + 7 = initial_value + log2(BLOCK_N)
// This means every iteration adds exactly 1 (the original seed value).
// Iteration i reads val[tid-i] = 1 (stale original) instead of the
// accumulated sum written by the previous iteration.
//
// Scenario 2 (multi-segment SpMV carry):
// Expected val[127] = 117
// Observed val[127] = 60
// Verbose dump of segment-0 shows [1,2,3,3,4,4,4,4,5,5,5,5,...] instead of
// [1,2,3,4,5,...,12]. val[3]=3 instead of 4: the i=2 iteration reads
// val[1]=1 (original) not val[1]=2 (result of the i=1 iteration).
// The pattern is identical for all three parallel variants (A, B, C), ruling
// out #pragma unroll and __forceinline__ as sole causes.
//
// ── ROOT CAUSE HYPOTHESIS ─────────────────────────────────────────────────────
//
// The Hillis-Steele body is:
// auto temp = val[threadIdx.x - i]; // read neighbour
// __syncthreads();
// val[threadIdx.x] += temp; // update self
// __syncthreads();
//
// The __restrict__ qualifier on the "val" parameter (a __shared__ pointer)
// tells the compiler the pointer is unaliased. NVCC 13.2 / sm_120 appears to
// perform REGISTER PROMOTION of val[threadIdx.x]: the += is kept in a register
// and no "st.shared" instruction is generated before the barrier. Threads
// that read val[tid-i] therefore see the original (stale) shared memory value
// from the previous store — i.e., every iteration effectively accumulates from
// the initial array rather than from the prior iteration's result.
//
// Evidence:
// val[127] = 1 + 7·1 = 8 (each of 7 iterations contributes only the
// initial value 1, never the running sum)
//
// This is consistent with a missing "st.shared" / "bar.sync" sequence in the
// PTX/SASS. To confirm, disassemble the cubin and look for:
// (correct) ST.SHARED ... [val+tid*4] followed by MEMBAR.CTA
// (buggy) no store to shared before bar.sync
//
// Removing __restrict__ from the val parameter may be sufficient to disable
// register promotion and restore correctness — test this hypothesis by editing
// the scan_A/B/C declarations below.
//
// ── NOTES ─────────────────────────────────────────────────────────────────────
// * The original Ginkgo code uses cooperative_groups::this_thread_block().sync()
// instead of __syncthreads(). Replacing it with bare __syncthreads() does
// NOT fix the bug on sm_120, confirming this is not a cooperative-groups
// compilation issue.
// * All four 32-bit and 64-bit floating-point variants (float, double,
// complex<float>, complex<double>) fail with identical wrong values.
// The bug is not dtype-specific.
// * Parallel variants A (forceinline+unroll), B (forceinline-nounroll), and
// C (noinline+unroll) all produce the same wrong values (8 and 60),
// confirming neither __forceinline__ nor #pragma unroll is the sole trigger.
// * Variant E is identical to A but with __restrict__ removed from "val".
// If E passes while A/B/C fail, __restrict__ on a __shared__ pointer is
// the direct cause. This would confirm a compiler bug: NVCC should not
// allow register promotion of val[threadIdx.x] to eliminate a shared memory
// store before a __syncthreads() barrier, regardless of aliasing hints.
// * The bug was confirmed absent on sm_80 (Ampere A100), only appearing on
// sm_120 (Blackwell RTX 5070 Ti) with NVCC 13.2.
// * Workaround applied in Ginkgo: replaced Hillis-Steele with a serial scan
// executed by thread 0, with explicit __syncthreads() before and after.
// ============================================================
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
// ── Constants ────────────────────────────────────────────────────────────────
// BLOCK_N must be a power of 2. Ginkgo uses 128 (= 4 warps * warp_size).
// It is both the block size of every kernel launch and the shared-array size.
static constexpr int BLOCK_N = 128;
// ── CUDA error checking ───────────────────────────────────────────────────────
// Wraps every CUDA runtime call: on failure, print file/line plus the error
// string and exit(1). The do { } while (0) makes the macro a single statement,
// safe inside unbraced if/else bodies.
#define CUDA_CHECK(call) \
do { \
cudaError_t _e = (call); \
if (_e != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d: %s\n", \
__FILE__, __LINE__, cudaGetErrorString(_e)); \
exit(1); \
} \
} while (0)
// ============================================================
// SCAN IMPLEMENTATIONS
// ============================================================
// ── Variant A: Parallel Hillis-Steele, __forceinline__, #pragma unroll ────────
// Verbatim copy of Ginkgo's block_segment_scan_reverse.
// __syncthreads() replaces cooperative_groups::this_thread_block().sync()
// (the two are functionally identical; the replacement does not change the
// failure mode).
//
// ind[] : non-decreasing segment labels (shared memory, size BLOCK_N)
// val[] : per-thread values; on return holds inclusive prefix sum per segment
// returns true if this thread is the last in its segment
template <typename T>
__device__ __forceinline__ bool scan_A_parallel_unroll(
const int* __restrict__ ind, T* __restrict__ val)
{
// NOTE: must be called by all BLOCK_N threads of the block — the loop body
// contains __syncthreads() barriers outside any divergent branch.
bool last = true;
const int my_row = ind[threadIdx.x];
#pragma unroll
for (int i = 1; i < BLOCK_N; i <<= 1) {
// First pass only: a same-labelled right neighbour means this thread is
// not the last element of its segment.
if (i == 1 && threadIdx.x < BLOCK_N - 1 &&
my_row == ind[threadIdx.x + 1]) {
last = false;
}
// Read the value i positions to the left, but only if it belongs to the
// same segment; otherwise contribute the additive identity.
T temp = T(0);
if (threadIdx.x >= i && my_row == ind[threadIdx.x - i]) {
temp = val[threadIdx.x - i];
}
__syncthreads();  // all reads of val[] complete before any write below
val[threadIdx.x] += temp;
__syncthreads();  // writes must be visible before the next iteration reads
}
return last;
}
// ── Variant B: Parallel Hillis-Steele, __forceinline__, NO #pragma unroll ─────
// Does removing unrolling change the behaviour?
template <typename T>
__device__ __forceinline__ bool scan_B_parallel_nounroll(
const int* __restrict__ ind, T* __restrict__ val)
{
// Same algorithm as scan_A_parallel_unroll, minus #pragma unroll.
// Must be called by all BLOCK_N threads (contains block-wide barriers).
bool last = true;
const int my_row = ind[threadIdx.x];
for (int i = 1; i < BLOCK_N; i <<= 1) {
// First pass only: detect a same-labelled right neighbour.
if (i == 1 && threadIdx.x < BLOCK_N - 1 &&
my_row == ind[threadIdx.x + 1]) {
last = false;
}
// Gather the same-segment value i positions to the left (or identity).
T temp = T(0);
if (threadIdx.x >= i && my_row == ind[threadIdx.x - i]) {
temp = val[threadIdx.x - i];
}
__syncthreads();  // reads done before any write
val[threadIdx.x] += temp;
__syncthreads();  // writes published before next iteration's reads
}
return last;
}
// ── Variant C: Parallel Hillis-Steele, __noinline__, #pragma unroll ───────────
// Does preventing inlining into the caller change the behaviour?
template <typename T>
__device__ __noinline__ bool scan_C_parallel_noinline(
const int* __restrict__ ind, T* __restrict__ val)
{
// Same algorithm as scan_A_parallel_unroll, but __noinline__ so the scan
// stays a separate device function in the generated code.
// Must be called by all BLOCK_N threads (contains block-wide barriers).
bool last = true;
const int my_row = ind[threadIdx.x];
#pragma unroll
for (int i = 1; i < BLOCK_N; i <<= 1) {
// First pass only: detect a same-labelled right neighbour.
if (i == 1 && threadIdx.x < BLOCK_N - 1 &&
my_row == ind[threadIdx.x + 1]) {
last = false;
}
// Gather the same-segment value i positions to the left (or identity).
T temp = T(0);
if (threadIdx.x >= i && my_row == ind[threadIdx.x - i]) {
temp = val[threadIdx.x - i];
}
__syncthreads();  // reads done before any write
val[threadIdx.x] += temp;
__syncthreads();  // writes published before next iteration's reads
}
return last;
}
// ── Variant E: Parallel Hillis-Steele, __forceinline__, #pragma unroll ────────
// Identical to Variant A EXCEPT __restrict__ is removed from the "val"
// parameter. Tests the hypothesis that register promotion of val[threadIdx.x]
// (enabled by __restrict__ on a shared pointer) is the root cause of the bug.
// If this variant passes while A fails, __restrict__ is the trigger.
template <typename T>
__device__ __forceinline__ bool scan_E_parallel_norestrict(
const int* __restrict__ ind, T* val) // <-- no __restrict__ on val
{
// Identical to scan_A_parallel_unroll except "val" carries no __restrict__,
// isolating the aliasing hint as a possible trigger of the miscompile.
// Must be called by all BLOCK_N threads (contains block-wide barriers).
bool last = true;
const int my_row = ind[threadIdx.x];
#pragma unroll
for (int i = 1; i < BLOCK_N; i <<= 1) {
// First pass only: detect a same-labelled right neighbour.
if (i == 1 && threadIdx.x < BLOCK_N - 1 &&
my_row == ind[threadIdx.x + 1]) {
last = false;
}
// Gather the same-segment value i positions to the left (or identity).
T temp = T(0);
if (threadIdx.x >= i && my_row == ind[threadIdx.x - i]) {
temp = val[threadIdx.x - i];
}
__syncthreads();  // reads done before any write
val[threadIdx.x] += temp;
__syncthreads();  // writes published before next iteration's reads
}
return last;
}
// ── Variant D: Serial scan by thread 0 (reference — known correct) ────────────
// Thread 0 walks the array sequentially; all other threads wait.
// Produces correct results on every tested GPU.
template <typename T>
__device__ __forceinline__ bool scan_D_serial(
const int* __restrict__ ind, T* __restrict__ val)
{
const int my_row = ind[threadIdx.x];
bool last = (threadIdx.x >= BLOCK_N - 1) ||
(my_row != ind[threadIdx.x + 1]);
// Note: ind[threadIdx.x + 1] is guarded by (threadIdx.x >= BLOCK_N - 1)
// via short-circuit evaluation of ||. The above is written this way to
// match the original Ginkgo code; the safe form is an explicit if() branch.
__syncthreads();  // make every thread's initial val[]/ind[] store visible
if (threadIdx.x == 0) {
// Sequential in-place inclusive scan: accumulate across each run of
// equal segment labels. No races — a single thread does all the work.
for (int t = 1; t < BLOCK_N; t++)
if (ind[t] == ind[t - 1]) val[t] += val[t - 1];
}
__syncthreads();  // publish thread 0's results to the whole block
return last;
}
// ============================================================
// KERNEL TEMPLATE
//
// ScanVariant selects which scan implementation to call:
// 0 = serial (D) — reference
// 1 = parallel A — __forceinline__ + #pragma unroll (verbatim Ginkgo)
// 2 = parallel B — __forceinline__ + no unroll
// 3 = parallel C — __noinline__ + #pragma unroll
// 4 = parallel E — __forceinline__ + #pragma unroll, NO __restrict__ on val
//
// Scenario:
// 1 = single segment: all 128 threads share ind=0, val[t]=1
// expected val[127] = 128
// 2 = multi-segment matching Ginkgo SpMV carry block:
// ind[0..117] = t/12 (groups of ~12, rows 0-9)
// ind[118..127] = 99 (the "carry" segment, distinct from others)
// val[118] = 9
// val[119..127] = 12 each
// val[0..117] = 1 (these don't affect the last segment's result)
// expected val[127] = 9 + 9*12 = 117
//
// output[] receives all BLOCK_N post-scan values for inspection.
// ============================================================
template <typename T, int ScanVariant, int Scenario>
__global__ void test_kernel(T* output)
{
// Launch contract: <<<1, BLOCK_N>>>; output must hold BLOCK_N elements.
__shared__ int s_ind[BLOCK_N];
__shared__ T s_val[BLOCK_N];
const int t = static_cast<int>(threadIdx.x);
// ── Populate shared memory ──────────────────────────────────────────────
if constexpr (Scenario == 1) {
// Scenario 1 — one segment of 128 ones; correct scan yields 1..128.
s_ind[t] = 0;
s_val[t] = T(1);
} else {
// Scenario 2 — multi-segment SpMV carry
// ind[0..117]=t/12, ind[118..127]=99; val seeded so that the carry
// segment's correct inclusive sums end in val[127] = 9 + 9*12 = 117.
s_ind[t] = (t < 118) ? (t / 12) : 99;
if (t == 118) s_val[t] = T(9);
else if (t > 118) s_val[t] = T(12);
else s_val[t] = T(1);
}
__syncthreads();  // initialisation must be visible before the scan starts
// ── Run selected scan ───────────────────────────────────────────────────
if constexpr (ScanVariant == 0) scan_D_serial<T> (s_ind, s_val);
if constexpr (ScanVariant == 1) scan_A_parallel_unroll<T> (s_ind, s_val);
if constexpr (ScanVariant == 2) scan_B_parallel_nounroll<T> (s_ind, s_val);
if constexpr (ScanVariant == 3) scan_C_parallel_noinline<T> (s_ind, s_val);
if constexpr (ScanVariant == 4) scan_E_parallel_norestrict<T> (s_ind, s_val);
// ── Write all results to global memory ──────────────────────────────────
// Every scan variant ends with __syncthreads(), so s_val[] is settled here.
output[t] = s_val[t];
}
// ============================================================
// HELPERS
// ============================================================
// Running tallies of PASS / FAIL results across all check() calls.
static int g_pass = 0, g_fail = 0;
// Compare a scalar result against its expected value (absolute tolerance
// 1e-5, after widening to double), print one PASS/FAIL report line, and
// update the global tallies.
template <typename T>
static void check(const char* label, T got, T expected)
{
    const double g = static_cast<double>(got);
    const double e = static_cast<double>(expected);
    const bool ok = std::fabs(g - e) < 1e-5;
    printf(" %-60s got %10.4f exp %10.4f %s\n",
           label, g, e, ok ? "PASS" : "FAIL ***");
    if (ok) {
        ++g_pass;
    } else {
        ++g_fail;
    }
}
// Dump all BLOCK_N post-scan values to stdout (for detailed inspection)
// Print every post-scan value val[0..BLOCK_N-1] to stdout for detailed
// inspection (used with the -v flag).
template <typename T>
static void dump_all(const char* label, const T* hbuf)
{
    printf(" [%s] full val[] after scan:\n", label);
    int t = 0;
    while (t < BLOCK_N) {
        printf(" val[%3d] = %g\n", t, static_cast<double>(hbuf[t]));
        ++t;
    }
}
// ============================================================
// MAIN
// ============================================================
// Drives every (variant, scenario, dtype) combination, checks val[127] and a
// few interior carry-segment values, and returns non-zero if anything failed.
int main(int argc, char* argv[])
{
// "-v" as the first argument enables full 128-value dumps after key runs.
const bool verbose = (argc > 1 && argv[1][0] == '-' && argv[1][1] == 'v');
// Print GPU info
cudaDeviceProp prop{};
CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
printf("GPU : %s (sm_%d%d)\n", prop.name, prop.major, prop.minor);
printf("BLOCK_N = %d\n\n", BLOCK_N);
// Device buffers
float *d_f32 = nullptr;
double *d_f64 = nullptr;
CUDA_CHECK(cudaMalloc(&d_f32, BLOCK_N * sizeof(float)));
CUDA_CHECK(cudaMalloc(&d_f64, BLOCK_N * sizeof(double)));
float h_f32[BLOCK_N];
double h_f64[BLOCK_N];
// Convenience macros: launch, check the launch, synchronise, copy back.
// cudaGetLastError() right after the launch catches launch-configuration /
// no-kernel-image errors that would otherwise be reported (confusingly) by
// the following cudaDeviceSynchronize().
#define RUN_F32(Variant, Scen) \
test_kernel<float, Variant, Scen><<<1, BLOCK_N>>>(d_f32); \
CUDA_CHECK(cudaGetLastError()); \
CUDA_CHECK(cudaDeviceSynchronize()); \
CUDA_CHECK(cudaMemcpy(h_f32, d_f32, BLOCK_N*sizeof(float), cudaMemcpyDeviceToHost))
#define RUN_F64(Variant, Scen) \
test_kernel<double, Variant, Scen><<<1, BLOCK_N>>>(d_f64); \
CUDA_CHECK(cudaGetLastError()); \
CUDA_CHECK(cudaDeviceSynchronize()); \
CUDA_CHECK(cudaMemcpy(h_f64, d_f64, BLOCK_N*sizeof(double), cudaMemcpyDeviceToHost))
// ── SCENARIO 1: single segment ──────────────────────────────────────────
// ind[t]=0 for all t, val[t]=1 => expected val[127] = 128
printf("══════════════════════════════════════════════════════════════\n");
printf("Scenario 1 — single segment (all ind=0, val=1, expect 128)\n");
printf("══════════════════════════════════════════════════════════════\n");
RUN_F32(0, 1); check("serial float32 val[127]", h_f32[127], 128.0f);
RUN_F32(1, 1); check("parallel A unroll float32 val[127]", h_f32[127], 128.0f);
RUN_F32(2, 1); check("parallel B nounrl float32 val[127]", h_f32[127], 128.0f);
RUN_F32(3, 1); check("parallel C noinln float32 val[127]", h_f32[127], 128.0f);
RUN_F32(4, 1); check("parallel E norestr float32 val[127]", h_f32[127], 128.0f);
RUN_F64(0, 1); check("serial float64 val[127]", h_f64[127], 128.0);
RUN_F64(1, 1); check("parallel A unroll float64 val[127]", h_f64[127], 128.0);
RUN_F64(2, 1); check("parallel B nounrl float64 val[127]", h_f64[127], 128.0);
RUN_F64(3, 1); check("parallel C noinln float64 val[127]", h_f64[127], 128.0);
RUN_F64(4, 1); check("parallel E norestr float64 val[127]", h_f64[127], 128.0);
// ── SCENARIO 2: multi-segment SpMV carry ────────────────────────────────
// Matches block 0 of the Ginkgo 128x128 all-ones SpMV with IPT=12.
// ind[0..117] = t/12 (rows 0-9, 10-12 threads per row)
// ind[118..127] = 99 (the carry segment; row 11 in Ginkgo)
// val[118] = 9 (thread 118 partially processed row 11)
// val[119..127] = 12 (threads 119-127 each processed 12 nnz in row 11)
// val[0..117] = 1 (irrelevant for val[127])
// Expected val[127] = 9 + 9*12 = 117
printf("\n");
printf("══════════════════════════════════════════════════════════════\n");
printf("Scenario 2 — multi-segment SpMV carry (expect val[127]=117)\n");
printf(" ind[0..117]=t/12, ind[118..127]=99\n");
printf(" val[118]=9, val[119..127]=12, val[0..117]=1\n");
printf("══════════════════════════════════════════════════════════════\n");
RUN_F32(0, 2); check("serial float32 val[127]", h_f32[127], 117.0f);
if (verbose) dump_all("serial float32", h_f32);
RUN_F32(1, 2); check("parallel A unroll float32 val[127]", h_f32[127], 117.0f);
if (verbose) dump_all("parallel A float32", h_f32);
RUN_F32(2, 2); check("parallel B nounrl float32 val[127]", h_f32[127], 117.0f);
RUN_F32(3, 2); check("parallel C noinln float32 val[127]", h_f32[127], 117.0f);
RUN_F32(4, 2); check("parallel E norestr float32 val[127]", h_f32[127], 117.0f);
if (verbose) dump_all("parallel E norestr float32", h_f32);
RUN_F64(0, 2); check("serial float64 val[127]", h_f64[127], 117.0);
RUN_F64(1, 2); check("parallel A unroll float64 val[127]", h_f64[127], 117.0);
RUN_F64(2, 2); check("parallel B nounrl float64 val[127]", h_f64[127], 117.0);
RUN_F64(3, 2); check("parallel C noinln float64 val[127]", h_f64[127], 117.0);
RUN_F64(4, 2); check("parallel E norestr float64 val[127]", h_f64[127], 117.0);
// ── Also check some intermediate values in scenario 2 ───────────────────
// For the carry segment (threads 118..127), the correct inclusive prefix
// sums are: 118->9, 119->21, 120->33, 121->45, 122->57,
// 123->69, 124->81, 125->93, 126->105, 127->117
printf("\n");
printf("── Intermediate values in carry segment (scenario 2, float32) ──\n");
RUN_F32(0, 2);
check("serial float32 val[118]", h_f32[118], 9.0f);
check("serial float32 val[122]", h_f32[122], 57.0f);
check("serial float32 val[126]", h_f32[126], 105.0f);
check("serial float32 val[127]", h_f32[127], 117.0f);
RUN_F32(1, 2);
check("parallelA float32 val[118]", h_f32[118], 9.0f);
check("parallelA float32 val[122]", h_f32[122], 57.0f);
check("parallelA float32 val[126]", h_f32[126], 105.0f);
check("parallelA float32 val[127]", h_f32[127], 117.0f);
RUN_F32(4, 2);
check("parallelE float32 val[118]", h_f32[118], 9.0f);
check("parallelE float32 val[122]", h_f32[122], 57.0f);
check("parallelE float32 val[126]", h_f32[126], 105.0f);
check("parallelE float32 val[127]", h_f32[127], 117.0f);
// ── Summary ─────────────────────────────────────────────────────────────
printf("\n");
printf("══════════════════════════════════════════════════════════════\n");
printf("Total: %d PASS, %d FAIL\n", g_pass, g_fail);
printf("══════════════════════════════════════════════════════════════\n");
printf("Run with -v for a full dump of all 128 post-scan values.\n");
CUDA_CHECK(cudaFree(d_f32));
CUDA_CHECK(cudaFree(d_f64));
return g_fail > 0 ? 1 : 0;
}