nvcc, release 4.2 on Linux, segfaults on the code below. It only fails with "-arch=sm_20" or "-arch=sm_30". When run with the "--verbose" flag, a call to "cicc" appears to be the last operation before the crash. I was not able to reproduce the problem on a system running release 4.0.
The bug is triggered at compile time, whether or not the device code is ever called, so for the sake of simplification I removed all __global__ functions (kernels), and main() just does a printf.
% nvcc -arch=sm_20 cicc_crash.cu -o cicc_crash
Segmentation fault (core dumped)
% nvcc -arch=sm_30 cicc_crash.cu -o cicc_crash
Segmentation fault (core dumped)
% nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2012 NVIDIA Corporation
Built on Thu_Apr__5_00:24:31_PDT_2012
Cuda compilation tools, release 4.2, V0.2.1221
% uname -a
Linux NeoBlue 3.2.0-24-generic #39-Ubuntu SMP Mon May 21 16:52:17 UTC 2012 x86_64 x86_64 x86_64 GNU/Linux
% gcc --version
gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
#include <stdio.h>
typedef long long unsigned u64;
// Full 64x64 -> 128-bit unsigned multiply.
//   a, b : the two 64-bit factors.
//   hi   : receives the upper 64 bits of a * b (via __umul64hi).
//   lo   : receives the lower 64 bits of a * b (wrapping 64-bit multiply).
// lo is written before hi, so if a caller passes the same object for both
// outputs (as Int192::multiply does), the object ends up holding the hi value.
__device__ void mult64(u64 a, u64 b, u64 &hi, u64 &lo) {
lo = a * b;
hi = __umul64hi(a, b);
}
// Multiplies a by b and accumulates the low half of the product into lo.
//   hi : overwritten with the upper 64 bits of a * b.
//   lo : incremented by the lower 64 bits of a * b.
// NOTE(review): a carry out of `lo += tmplo` is not propagated into hi.
// Left untouched because this file is a reduced compiler-crash reproducer.
__device__ void muladd64(u64 a, u64 b, u64 &hi, u64 &lo) {
// Scratch for the low half of the product; hi is written directly by mult64.
u64 tmplo;
mult64(a, b, hi, tmplo);
lo += tmplo;
}
// Three 64-bit words, named as a 192-bit integer (part0 .. part2).
// This struct is a reduced test case for a compiler crash: the method bodies
// are deliberately stripped down and repetitive, and most of them only touch
// part0. Do not "simplify" or reorder statements without re-checking that the
// crash still reproduces.
struct Int192 {
u64 part0, part1, part2;
// Default constructor: leaves all three parts uninitialized.
__device__ Int192() {}
// Note the argument order: p2 first, p0 last.
__device__ Int192(u64 p2, u64 p1, u64 p0)
: part0(p0), part1(p1), part2(p2) {}
// Reduced stand-in for a wide multiply. Only a.part0 and b.part0 are read and
// only hi.part0 / lo.part0 are written; part1 and part2 of hi and lo are not
// touched.
__device__ static void multiply(Int192 a, Int192 b, Int192 &hi, Int192 &lo) {
// The first three calls pass lo.part0 as both outputs; because mult64 writes
// lo before hi, each leaves the high 64 bits of the product in lo.part0.
mult64(a.part0, b.part0, lo.part0, lo.part0);
mult64(a.part0, b.part0, lo.part0, lo.part0);
mult64(a.part0, b.part0, lo.part0, lo.part0);
// hi.part0 <- high 64 bits, lo.part0 <- low 64 bits of the product.
mult64(a.part0, b.part0, hi.part0, lo.part0);
// Outputs are swapped relative to the call above: lo.part0 is overwritten
// with the high 64 bits, and the low 64 bits are added onto hi.part0.
muladd64(a.part0, b.part0, lo.part0, hi.part0);
}
// Runs multiply() and returns its hi output. hi and lo start out
// default-constructed (uninitialized), and multiply() only writes part0, so
// the returned part1 and part2 are uninitialized.
__device__ static Int192 mhi(Int192 a, Int192 b) {
Int192 hi, lo;
multiply(a, b, hi, lo);
return hi;
}
// Builds a result from a.part0 * b.part0: part1 holds the high 64 bits and
// part0 accumulates the low 64 bits three times over (one mult64 plus two
// muladd64 calls). r.part2 is never written and is returned uninitialized.
__device__ static Int192 mlo(Int192 a, Int192 b) {
Int192 r;
mult64(a.part0, b.part0, r.part1, r.part0);
muladd64(a.part0, b.part0, r.part1, r.part0);
muladd64(a.part0, b.part0, r.part1, r.part0);
return r;
}
// Despite the name, this assigns rather than adds: part0 is replaced by
// x.part0, and part1 / part2 are left unchanged.
__device__ void add(Int192 x) {
part0 = x.part0;
}
// Starts from x with part2 = 1, part1 = 2, part0 = 3 and applies the same
// mhi(x, mlo(x, x)) update five times, returning the final x.
__device__ static Int192 manyMults() {
Int192 x(1,2,3);
x.add(mhi(x, mlo(x, x)));
x.add(mhi(x, mlo(x, x)));
x.add(mhi(x, mlo(x, x)));
x.add(mhi(x, mlo(x, x)));
x.add(mhi(x, mlo(x, x)));
return x;
}
};
// Device-side entry into the reproducer: calls Int192::manyMults() and
// discards the result. Nothing in this file calls doMults.
__device__ void doMults() {
Int192::manyMults();
}
// Host entry point. No device code is launched from here; the program only
// prints a greeting via printf and reports success.
int main(int argc, char **argv) {
    // Command-line arguments are accepted but not used.
    (void)argc;
    (void)argv;

    const char *greeting = "Hello world\n";
    printf("%s", greeting);
    return 0;
}