nvcc Segfault

I’m trying to convert a C code to CUDA, and my conversion is causing nvcc to segfault. The offending function is as follows:

[codebox]/*
 * RotateAccumulateKernel
 *
 * One thread per baseline: iBaseline = threadIdx.x + blockDim.x*blockIdx.x.
 * Threads with iBaseline >= nBaselines return immediately. Only time index 0
 * is processed (iTime is a compile-time constant here).
 *
 * Per baseline the kernel:
 *   1. forms the baseline vector from the X/Y/Z entries of the two stations
 *      given by st1/st2 and passes it to HADec2uvwGPU to obtain (u, v, w);
 *   2. builds a complex phase factor from u*l + v*m + w*n, scaled by the
 *      fringe-rate factor and SourceFactor;
 *   3. weights each of the MAX_POLS input visibilities by cv_invvar and the
 *      phase factor;
 *   4. builds a 2x2 matrix per station as identity + dJinvJ0df*df +
 *      dJinvJ0dt*dt_sec (the station-2 matrix is conjugated element-wise),
 *      inverts both, and applies them on either side of the visibilities;
 *   5. accumulates the result into bv_visdata.
 *
 * Visibility arrays are indexed as
 *   iBaseline + iTime*nBaselines + iPol*nTimes*nBaselines.
 *
 * NOTE(review): the arithmetic operators on cuFloatComplex (+, *, *=, +=) are
 * not provided by cuComplex.h; they are assumed to be overloaded elsewhere in
 * the project, as are HADec2uvwGPU, invert2x2, cgemm2x2, C2R_SIGND2PI,
 * MAX_POLS and the pp/pq/qp/qq indices.
 */
__global__ void RotateAccumulateKernel( const cuFloatComplex *dJinvJ0df,
				const cuFloatComplex *dJinvJ0dt,
				const float df, const float dt_sec,
				const int *st1, const int *st2,
				const float sha0, const float sdec0,
				const float cha0, const float cdec0,
				const float l, const float m, const float n,
				const float decorrelationConst,
				const float decorrelationTemp,
				const float SourceFactor,
				const float *X, const float *Y, const float *Z,
				const int nStations, const int nTimes,
				const int nBaselines,
				const cuFloatComplex *cv_visdata, const float *cv_invvar,
				cuFloatComplex *bv_visdata ) {

    float B_x, B_y, B_z;
    float u, v, w;
    float FringeRateFactor;
    cuFloatComplex PhaseShift;
    cuFloatComplex VIS[MAX_POLS], tmp1[MAX_POLS], tmp2[MAX_POLS];
    cuFloatComplex tmp3[MAX_POLS], inv[MAX_POLS];

    int iBaseline = threadIdx.x + (blockDim.x*blockIdx.x);
    const int iTime = 0;

    // Guard the tail of the grid: surplus threads have no baseline to process.
    if( iBaseline >= nBaselines ) {
        return;
    }

    int iStation1 = st1[iBaseline];
    int iStation2 = st2[iBaseline];

    // Baseline vector is the difference of the two station positions.
    B_x = X[iStation1]-X[iStation2];
    B_y = Y[iStation1]-Y[iStation2];
    B_z = Z[iStation1]-Z[iStation2];

    // u, v, w are outputs here (presumably taken by reference -- confirm
    // against the HADec2uvwGPU declaration).
    HADec2uvwGPU( sha0, sdec0, cha0, cdec0, B_x, B_y, B_z, u, v, w);

    // The ratio below is 0/0 at u == 0, so that case is handled explicitly.
    if ( u == 0 ) {
        FringeRateFactor = 1.0;
    } else {
        FringeRateFactor = ( ( decorrelationConst * u ) / sinf( decorrelationConst * u ) );
        FringeRateFactor /= ( ( decorrelationTemp * u ) / sinf( decorrelationTemp * u ) );
    }

    float temp = C2R_SIGND2PI( u*l + v*m + w*n );

    PhaseShift = make_cuFloatComplex( cosf(temp), sinf(temp) ) * FringeRateFactor * SourceFactor;

    // Weight and phase-rotate every polarisation product of this baseline.
    for( int i=0; i<MAX_POLS; i++ ) {
        VIS[i] = cv_visdata[iBaseline + (iTime*nBaselines) + (i*nTimes*nBaselines)];
        VIS[i] *= cv_invvar[iBaseline + (iTime*nBaselines) + (i*nTimes*nBaselines)];
        VIS[i] *= PhaseShift;
    }

    // Station 1 matrix: identity plus the frequency and time derivative terms.
    tmp1[0] = 1.0f + dJinvJ0df[0+(iStation1*MAX_POLS)]*df + dJinvJ0dt[0+(iStation1*MAX_POLS)]*dt_sec;
    tmp1[1] = 0.0f + dJinvJ0df[1+(iStation1*MAX_POLS)]*df + dJinvJ0dt[1+(iStation1*MAX_POLS)]*dt_sec;
    tmp1[2] = 0.0f + dJinvJ0df[2+(iStation1*MAX_POLS)]*df + dJinvJ0dt[2+(iStation1*MAX_POLS)]*dt_sec;
    tmp1[3] = 1.0f + dJinvJ0df[3+(iStation1*MAX_POLS)]*df + dJinvJ0dt[3+(iStation1*MAX_POLS)]*dt_sec;

    // Station 2 matrix, built the same way.
    tmp2[0] = 1.0f + dJinvJ0df[0+(iStation2*MAX_POLS)]*df + dJinvJ0dt[0+(iStation2*MAX_POLS)]*dt_sec;
    tmp2[1] = 0.0f + dJinvJ0df[1+(iStation2*MAX_POLS)]*df + dJinvJ0dt[1+(iStation2*MAX_POLS)]*dt_sec;
    tmp2[2] = 0.0f + dJinvJ0df[2+(iStation2*MAX_POLS)]*df + dJinvJ0dt[2+(iStation2*MAX_POLS)]*dt_sec;
    tmp2[3] = 1.0f + dJinvJ0df[3+(iStation2*MAX_POLS)]*df + dJinvJ0dt[3+(iStation2*MAX_POLS)]*dt_sec;

    // Element-wise complex conjugate of the station 2 matrix (no transpose).
    for( int i=0; i<MAX_POLS; i++ ) {
        tmp2[i] = cuConjf( tmp2[i] );
    }

    invert2x2( tmp1, inv );

    // tmp1 = inv * [ VIS[pp] VIS[pq] ; VIS[qp] VIS[qq] ], written out by hand.
    tmp1[0] = inv[0]*VIS[pp] + inv[1]*VIS[qp];
    tmp1[1] = inv[0]*VIS[pq] + inv[1]*VIS[qq];
    tmp1[2] = inv[2]*VIS[pp] + inv[3]*VIS[qp];
    tmp1[3] = inv[2]*VIS[pq] + inv[3]*VIS[qq];

    invert2x2( tmp2, inv );

    cgemm2x2<NoTrans,NoTrans>( 1, tmp1, inv, tmp3 );

    // Accumulate into the output. Row-major tmp3[0..3] goes to polarisation
    // slots 0, 2, 3, 1 -- NOTE(review): presumably pp=0, qq=1, pq=2, qp=3;
    // confirm against the enum definition.
    bv_visdata[iBaseline + (iTime*nBaselines) + (0*nTimes*nBaselines)] += tmp3[0];
    bv_visdata[iBaseline + (iTime*nBaselines) + (2*nTimes*nBaselines)] += tmp3[1];
    bv_visdata[iBaseline + (iTime*nBaselines) + (3*nTimes*nBaselines)] += tmp3[2];
    bv_visdata[iBaseline + (iTime*nBaselines) + (1*nTimes*nBaselines)] += tmp3[3];

}[/codebox]

The value of [font=“Courier New”]MAX_POLS[/font] is 4, and the cgemm2x2 functions are working fine in other kernels, as is [font=“Courier New”]HADec2uvwGPU[/font]. When I try compiling it, the compiler complains:

[font=“Courier New”]/usr/local/cuda/bin/nvcc --compiler-options -fno-strict-aliasing -I. -I/usr/local/cuda/include -I/usr/local/NVIDIA_CUDA_SDK//common//../common/inc -DUNIX -O3 -I/home/user/prog/src -I/home/user/prog/include -I/usr/local/include/wcslib -I/usr/include/atlas/ -I/usr/include/cfitsio/ -I/home/user/HPlib/include/ -o obj/release/myfile.cu_o -c ../src/myfile.cu

Signal: Segmentation fault in Global Optimization – LPRE: Var phi placement phase.

(0): Error: Signal Segmentation fault in phase Global Optimization – LPRE: Var phi placement – processing aborted

*** Internal stack backtrace:

/usr/local/cuda/open64/lib//be [0x69d232]

/usr/local/cuda/open64/lib//be [0x69df15]

/usr/local/cuda/open64/lib//be [0x69d5ef]

/usr/local/cuda/open64/lib//be [0x69e8c8]

/lib64/libc.so.6 [0x31d8430f30]

/usr/local/cuda/open64/lib//be [0x4dcd9a]

/usr/local/cuda/open64/lib//be [0x52280b]

/usr/local/cuda/open64/lib//be [0x5231d8]

/usr/local/cuda/open64/lib//be [0x50f2ec]

/usr/local/cuda/open64/lib//be [0x50f948]

/usr/local/cuda/open64/lib//be [0x434888]

/usr/local/cuda/open64/lib//be [0x4889da]

/usr/local/cuda/open64/lib//be [0x4181aa]

/usr/local/cuda/open64/lib//be [0x418f81]

/usr/local/cuda/open64/lib//be [0x41a160]

/usr/local/cuda/open64/lib//be [0x41b708]

/lib64/libc.so.6(__libc_start_main+0xf4) [0x31d841e074]

/usr/local/cuda/open64/lib//be [0x41791a]

nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be died due to signal 4

[/font]

Now, I can see that this rather large kernel could give the compiler some grief, but a segfault isn’t a very helpful error message :'(

I have found that if I delete the final four lines which store the results, then the code compiles. Also, if I delete some of the other portions of the routine, it will compile. Is this kernel simply too big? This code is running on CUDA 2.0:

[font=“Courier New”]$ nvcc --version

nvcc: NVIDIA ® Cuda compiler driver

Copyright © 2005-2007 NVIDIA Corporation

Built on Thu_Jun_19_03:38:28_PDT_2008

Cuda compilation tools, release 2.0, V0.2.1221[/font]

Any suggestions would be gratefully received.

Does this reproduce with the CUDA_2.1-beta ?

I don’t know - and I don’t have a spare machine to install the beta on, unfortunately.

A follow-up note on this… I came up with a workaround - instead of doing the accumulation in the same kernel, I wrote the results of the last cgemm call to a temporary array, and then had another kernel perform the final addition. I’m guessing that the number of memory accesses involved at the end of the original kernel exceeded some internal limit.

I’m now using a machine which has been upgraded to CUDA 2.1, and another file is now making nvcc segfault. I’m trying to identify the precise problem routine, but the error is:
[font=“Courier New”]/usr/local/cuda/bin/nvcc -D_DEBUG -I. -I/usr/local/cuda/include -I/usr/local/NVIDIA_CUDA_SDK_2.10//common//../common/inc -DUNIX -g -I/data/user/project/src -I/data/user/project/include -I/usr/local/include/wcslib -I/usr/local/ATLAS/include -I/data/user2/Healpix_2.10/include_GFOR -I/data/user/project/src/correlator -o obj/debug/ftgm.cu.o -c ../src/ftgm.cu
Signal: Segmentation fault in Global Optimization – New PRE: Expr hoisting. phase.
(0): Error: Signal Segmentation fault in phase Global Optimization – New PRE: Expr hoisting. – processing aborted
*** Internal stack backtrace:
/usr/local/cuda/open64/lib//be [0x6ad582]
/usr/local/cuda/open64/lib//be [0x6ae265]
/usr/local/cuda/open64/lib//be [0x6ad93f]
/usr/local/cuda/open64/lib//be [0x6aec18]
/lib64/libc.so.6 [0x3c548301b0]
/usr/local/cuda/open64/lib//be [0x527769]
/usr/local/cuda/open64/lib//be [0x5280f5]
/usr/local/cuda/open64/lib//be [0x4dc3b8]
/usr/local/cuda/open64/lib//be [0x4dc64b]
/usr/local/cuda/open64/lib//be [0x4347a2]
/usr/local/cuda/open64/lib//be [0x48df8a]
/usr/local/cuda/open64/lib//be [0x418682]
/usr/local/cuda/open64/lib//be [0x419451]
/usr/local/cuda/open64/lib//be [0x41a630]
/usr/local/cuda/open64/lib//be [0x41bbd8]
/lib64/libc.so.6(__libc_start_main+0xf4) [0x3c5481d8b4]
/usr/local/cuda/open64/lib//be [0x417dda]
nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be died due to signal 4
[/font]
For completeness:
[font=“Courier New”]$ /usr/local/cuda/bin/nvcc --version
nvcc: NVIDIA ® Cuda compiler driver
Copyright © 2005-2007 NVIDIA Corporation
Built on Wed_Dec__3_16:25:17_PST_2008
Cuda compilation tools, release 2.1, V0.2.1221
[/font]

This compiled without issue under CUDA 2.0. The file in question is large, and contains several kernels and device functions - I’m going to try isolating the problem routine in a moment… I may be some time. However, suggestions as to what may be causing this error are welcome :'(

Did you ever figure this issue out? My program was working fine under numerous executions, then I simply added

[codebox]#include <signal.h>

(void) signal(SIGINT, ex_program);[/codebox]

Then I received the same kind of error you have.

Signal: Floating point exception in Code_Expansion phase.

(0): Error: Signal Floating point exception in phase Code_Expansion – processing aborted

*** Internal stack backtrace:

/usr/local/cuda/open64/lib//be [0x6d1c6f]

/usr/local/cuda/open64/lib//be [0x6d28b9]

/usr/local/cuda/open64/lib//be [0x6d200d]

/usr/local/cuda/open64/lib//be [0x6d3256]

/lib64/libc.so.6 [0x35b6630280]

/usr/local/cuda/open64/lib//be [0x592941]

/usr/local/cuda/open64/lib//be [0x592f74]

/usr/local/cuda/open64/lib//be [0x5932ef]

/usr/local/cuda/open64/lib//be [0x565d6e]

/usr/local/cuda/open64/lib//be [0x57085e]

/usr/local/cuda/open64/lib//be [0x571fb1]

/usr/local/cuda/open64/lib//be [0x570b28]

/usr/local/cuda/open64/lib//be [0x57573b]

/usr/local/cuda/open64/lib//be [0x576269]

/usr/local/cuda/open64/lib//be [0x57688c]

/usr/local/cuda/open64/lib//be [0x552722]

/usr/local/cuda/open64/lib//be [0x405443]

/usr/local/cuda/open64/lib//be [0x4061f1]

/usr/local/cuda/open64/lib//be [0x40752d]

/lib64/libc.so.6(__libc_start_main+0xf4) [0x35b661d974]

/usr/local/cuda/open64/lib//be [0x4038da]

nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be died due to signal 4

Let me know,

Thanks

Did you ever figure this issue out? My program was working fine under numerous executions, then I simply added

[codebox]#include <signal.h>

(void) signal(SIGINT, ex_program);[/codebox]

Then I received the same kind of error you have.

Signal: Floating point exception in Code_Expansion phase.

(0): Error: Signal Floating point exception in phase Code_Expansion – processing aborted

*** Internal stack backtrace:

/usr/local/cuda/open64/lib//be [0x6d1c6f]

/usr/local/cuda/open64/lib//be [0x6d28b9]

/usr/local/cuda/open64/lib//be [0x6d200d]

/usr/local/cuda/open64/lib//be [0x6d3256]

/lib64/libc.so.6 [0x35b6630280]

/usr/local/cuda/open64/lib//be [0x592941]

/usr/local/cuda/open64/lib//be [0x592f74]

/usr/local/cuda/open64/lib//be [0x5932ef]

/usr/local/cuda/open64/lib//be [0x565d6e]

/usr/local/cuda/open64/lib//be [0x57085e]

/usr/local/cuda/open64/lib//be [0x571fb1]

/usr/local/cuda/open64/lib//be [0x570b28]

/usr/local/cuda/open64/lib//be [0x57573b]

/usr/local/cuda/open64/lib//be [0x576269]

/usr/local/cuda/open64/lib//be [0x57688c]

/usr/local/cuda/open64/lib//be [0x552722]

/usr/local/cuda/open64/lib//be [0x405443]

/usr/local/cuda/open64/lib//be [0x4061f1]

/usr/local/cuda/open64/lib//be [0x40752d]

/lib64/libc.so.6(__libc_start_main+0xf4) [0x35b661d974]

/usr/local/cuda/open64/lib//be [0x4038da]

nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be died due to signal 4

Let me know,

Thanks