compiler segfault on code with many multiplications

nvcc, release 4.2 on Linux, segfaults on the code below. It only fails with “-arch=sm_20” or “-arch=sm_30”. When run with the “--verbose” flag, a call to “cicc” appears to be the last operation before the crash. I was not able to reproduce the problem on a system running release 4.0.

The bug is triggered whether or not the code is ever called, so to simplify the test case I removed all __global__ functions (kernels), and main() just does a printf.

% nvcc -arch=sm_20 cicc_crash.cu -o cicc_crash

Segmentation fault (core dumped)

% nvcc -arch=sm_30 cicc_crash.cu -o cicc_crash

Segmentation fault (core dumped)

% nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver

Copyright (c) 2005-2012 NVIDIA Corporation

Built on Thu_Apr__5_00:24:31_PDT_2012

Cuda compilation tools, release 4.2, V0.2.1221

% uname -a

Linux NeoBlue 3.2.0-24-generic #39-Ubuntu SMP Mon May 21 16:52:17 UTC 2012 x86_64 x86_64 x86_64 GNU/Linux

% gcc --version

gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
#include <stdio.h>

// Shorthand for the unsigned long long type used for every operand and result below.
typedef long long unsigned u64;

// Multiplies a by b and returns the product as two u64 words through the
// reference parameters:
//   lo - receives a * b, i.e. the part of the product that fits in a u64
//        (unsigned multiplication wraps around).
//   hi - receives __umul64hi(a, b), the upper word of the full product.
// lo is written first and hi second, so when a caller passes the same object
// for both outputs (as Int192::multiply does) that object ends up holding the
// hi value. Do not reorder the two statements.
__device__  void mult64(u64 a, u64 b, u64 &hi, u64 &lo) {

  lo = a * b;

  hi = __umul64hi(a, b);

}

// Multiplies a by b via mult64, then:
//   hi - is overwritten with the upper word of the product.
//   lo - has the lower word of the product added to its existing value.
// NOTE(review): a carry out of "lo += tmplo" is not propagated into hi, and hi
// is replaced rather than accumulated. This code is a reduced compiler-crash
// reproducer, so that is presumably deliberate -- confirm before reusing it as
// real multi-word arithmetic.
__device__ void muladd64(u64 a, u64 b, u64 &hi, u64 &lo) {

  // Scratch for the lower product word so it can be added to lo afterwards.
  u64 tmplo;

  mult64(a, b, hi, tmplo);

  lo += tmplo;

}

// Three u64 words (part0, part1, part2) plus static helpers that drive
// mult64/muladd64 many times.
// NOTE(review): the helpers below only ever multiply a.part0 by b.part0 and
// leave several fields unwritten, so this looks like a stripped-down stand-in
// for a real 192-bit multiply kept only to trigger the compiler crash --
// confirm before treating any of it as working arithmetic.
struct Int192 {

  u64 part0, part1, part2;

// Default constructor: leaves part0, part1 and part2 uninitialized.
__device__ Int192() {}

// Initializes all three words. Note the argument order: part2, part1, part0.
__device__ Int192(u64 p2, u64 p1, u64 p0)

    : part0(p0), part1(p1), part2(p2) {}

// Writes only hi.part0 and lo.part0, using a.part0 and b.part0 as the sole
// inputs; part1/part2 of both outputs are left untouched.
__device__ static void multiply(Int192 a, Int192 b, Int192 &hi, Int192 &lo) {

    // These three calls pass lo.part0 as both outputs of mult64; because mult64
    // writes its hi output last, each call leaves the upper product word in
    // lo.part0.
    mult64(a.part0, b.part0, lo.part0, lo.part0);

    mult64(a.part0, b.part0, lo.part0, lo.part0);

    mult64(a.part0, b.part0, lo.part0, lo.part0);

    // Upper product word -> hi.part0, lower product word -> lo.part0.
    mult64(a.part0, b.part0, hi.part0, lo.part0);

    // Outputs are crossed relative to the parameter names: muladd64's "hi"
    // output is lo.part0 (overwritten with the upper word) and its "lo" output
    // is hi.part0 (incremented by the lower word).
    muladd64(a.part0, b.part0, lo.part0, hi.part0);

  }

// Runs multiply(a, b, ...) and returns its hi output.
// NOTE(review): hi is default-constructed and multiply writes only hi.part0,
// so part1 and part2 of the returned value are uninitialized.
__device__ static Int192 mhi(Int192 a, Int192 b) {

    Int192 hi, lo;

    multiply(a, b, hi, lo);

    return hi;

  }

// Builds a result from a.part0 * b.part0 only: r.part1 ends up holding the
// upper product word, and r.part0 the lower product word accumulated three
// times (one mult64 plus two muladd64 calls, wrapping on overflow).
// NOTE(review): r.part2 is never written and is returned uninitialized.
__device__ static Int192 mlo(Int192 a, Int192 b) {

    Int192 r;

    mult64(a.part0, b.part0, r.part1, r.part0);

    muladd64(a.part0, b.part0, r.part1, r.part0);

    muladd64(a.part0, b.part0, r.part1, r.part0);

    return r;

  }

// Despite the name, this performs no addition: it overwrites part0 with
// x.part0 and leaves part1 and part2 unchanged.
__device__ void add(Int192 x) {

    part0 = x.part0;

  }

// Starts from x with part2 = 1, part1 = 2, part0 = 3 and applies the same
// x.add(mhi(x, mlo(x, x))) statement five times, then returns x.
__device__ static Int192 manyMults() {

    Int192 x(1,2,3);

    x.add(mhi(x, mlo(x, x)));

    x.add(mhi(x, mlo(x, x)));

    x.add(mhi(x, mlo(x, x)));

    x.add(mhi(x, mlo(x, x)));

    x.add(mhi(x, mlo(x, x)));

    return x;

  }

};

// Device-side entry point that calls Int192::manyMults() and discards the
// returned value. Nothing in this file calls doMults.
__device__ void doMults() {

  Int192::manyMults();

}

// Host entry point. Launches no device code: it only prints a fixed greeting
// to stdout and exits with status 0.
int main(int argc, char **argv) {
  // The command-line arguments are not used.
  (void)argc;
  (void)argv;

  fputs("Hello world\n", stdout);
  return 0;
}

The compiler should not segfault. Please file a bug, attaching a self-contained repro case. A link to the bug reporting form is on the registered developer website. Thank you very much for your help, and sorry for the inconvenience.

You may want to try the CUDA 5.0 preview available to registered developers to see whether that allows you to make forward progress.