nvcc Segfault

YDD · December 5, 2008, 2:57pm

I’m trying to convert a C code to CUDA, and my conversion is causing nvcc to segfault. The offending function is as follows:

[codebox]global void RotateAccumulateKernel( const cuFloatComplex *dJinvJ0df,

				const cuFloatComplex *dJinvJ0dt,

				const float df, const float dt_sec,

				const int *st1, const int *st2,

				const float sha0, const float sdec0,

				const float cha0, const float cdec0,

				const float l, const float m, const float n,

				const float decorrelationConst,

				const float decorrelationTemp,

				const float SourceFactor,

				const float *X, const float *Y, const float *Z,

				const int nStations, const int nTimes,

				const int nBaselines,

				const cuFloatComplex *cv_visdata, const float *cv_invvar,

				cuFloatComplex *bv_visdata ) {

float B_x, B_y, B_z;

float u, v, w;

float FringeRateFactor;

cuFloatComplex PhaseShift;

cuFloatComplex VIS[MAX_POLS], tmp1[MAX_POLS], tmp2[MAX_POLS];

cuFloatComplex tmp3[MAX_POLS], inv[MAX_POLS];

int iBaseline = threadIdx.x + (blockDim.x*blockIdx.x);

const int iTime = 0;

if( iBaseline >= nBaselines ) {

return;

}

int iStation1 = st1[iBaseline];

int iStation2 = st2[iBaseline];

B_x = X[iStation1]-X[iStation2];

B_y = Y[iStation1]-Y[iStation2];

B_z = Z[iStation1]-Z[iStation2];

HADec2uvwGPU( sha0, sdec0, cha0, cdec0, B_x, B_y, B_z, u, v, w);

if ( u == 0 ) {

FringeRateFactor = 1.0;

} else {

FringeRateFactor = ( ( decorrelationConst * u ) / sinf( decorrelationConst * u ) );

FringeRateFactor /= ( ( decorrelationTemp * u ) / sinf( decorrelationTemp * u ) );

}

float temp = C2R_SIGND2PI( ul + vm + w*n );

PhaseShift = make_cuFloatComplex( cosf(temp), sinf(temp) ) * FringeRateFactor * SourceFactor;

for( int i=0; i<MAX_POLS; i++ ) {

VIS[i] = cv_visdata[iBaseline + (iTime*nBaselines) + (i*nTimes*nBaselines)];

VIS[i] *= cv_invvar[iBaseline + (iTime*nBaselines) + (i*nTimes*nBaselines)];

VIS[i] *= PhaseShift;

}

tmp1[0] = 1.0f + dJinvJ0df[0+(iStation1*MAX_POLS)]df + dJinvJ0dt[0+(iStation1MAX_POLS)]*dt_sec;

tmp1[1] = 0.0f + dJinvJ0df[1+(iStation1*MAX_POLS)]df + dJinvJ0dt[1+(iStation1MAX_POLS)]*dt_sec;

tmp1[2] = 0.0f + dJinvJ0df[2+(iStation1*MAX_POLS)]df + dJinvJ0dt[2+(iStation1MAX_POLS)]*dt_sec;

tmp1[3] = 1.0f + dJinvJ0df[3+(iStation1*MAX_POLS)]df + dJinvJ0dt[3+(iStation1MAX_POLS)]*dt_sec;

tmp2[0] = 1.0f + dJinvJ0df[0+(iStation2*MAX_POLS)]df + dJinvJ0dt[0+(iStation2MAX_POLS)]*dt_sec;

tmp2[1] = 0.0f + dJinvJ0df[1+(iStation2*MAX_POLS)]df + dJinvJ0dt[1+(iStation2MAX_POLS)]*dt_sec;

tmp2[2] = 0.0f + dJinvJ0df[2+(iStation2*MAX_POLS)]df + dJinvJ0dt[2+(iStation2MAX_POLS)]*dt_sec;

tmp2[3] = 1.0f + dJinvJ0df[3+(iStation2*MAX_POLS)]df + dJinvJ0dt[3+(iStation2MAX_POLS)]*dt_sec;

for( int i=0; i<MAX_POLS; i++ ) {

tmp2[i] = cuConjf( tmp2[i] );

}

invert2x2( tmp1, inv );

tmp1[0] = inv[0]*VIS[pp] + inv[1]*VIS[qp];

tmp1[1] = inv[0]*VIS[pq] + inv[1]*VIS[qq];

tmp1[2] = inv[2]*VIS[pp] + inv[3]*VIS[qp];

tmp1[3] = inv[2]*VIS[pq] + inv[3]*VIS[qq];

invert2x2( tmp2, inv );

cgemm2x2<NoTrans,NoTrans>( 1, tmp1, inv, tmp3 );

bv_visdata[iBaseline + (iTimenBaselines) + (0nTimes*nBaselines)] += tmp3[0];

bv_visdata[iBaseline + (iTimenBaselines) + (2nTimes*nBaselines)] += tmp3[1];

bv_visdata[iBaseline + (iTimenBaselines) + (3nTimes*nBaselines)] += tmp3[2];

bv_visdata[iBaseline + (iTimenBaselines) + (1nTimes*nBaselines)] += tmp3[3];

}[/codebox]

The value of [font=“Courier New”]MAX_POLS[/font] is 4, and the cgemm2x2 functions are working fine in other kernels, as is [font=“Courier New”]HADec2uvwGPU[/font]. When I try compiling it, the compiler complains:

[font=“Courier New”]/usr/local/cuda/bin/nvcc --compiler-options -fno-strict-aliasing -I. -I/usr/local/cuda/include -I/usr/local/NVIDIA_CUDA_SDK//common//…/common/inc -DUNIX -O3 -I/home/user/prog/src -I/home/user/prog/include -I/usr/local/include/wcslib -I/usr/include/atlas/ -I/usr/include/cfitsio/ -I/home/user/HPlib/include/ -o obj/release/myfile.cu_o -c …/src/myfile.cu

Signal: Segmentation fault in Global Optimization – LPRE: Var phi placement phase.

(0): Error: Signal Segmentation fault in phase Global Optimization – LPRE: Var phi placement – processing aborted

*** Internal stack backtrace:

/usr/local/cuda/open64/lib//be [0x69d232]

/usr/local/cuda/open64/lib//be [0x69df15]

/usr/local/cuda/open64/lib//be [0x69d5ef]

/usr/local/cuda/open64/lib//be [0x69e8c8]

/lib64/libc.so.6 [0x31d8430f30]

/usr/local/cuda/open64/lib//be [0x4dcd9a]

/usr/local/cuda/open64/lib//be [0x52280b]

/usr/local/cuda/open64/lib//be [0x5231d8]

/usr/local/cuda/open64/lib//be [0x50f2ec]

/usr/local/cuda/open64/lib//be [0x50f948]

/usr/local/cuda/open64/lib//be [0x434888]

/usr/local/cuda/open64/lib//be [0x4889da]

/usr/local/cuda/open64/lib//be [0x4181aa]

/usr/local/cuda/open64/lib//be [0x418f81]

/usr/local/cuda/open64/lib//be [0x41a160]

/usr/local/cuda/open64/lib//be [0x41b708]

/lib64/libc.so.6(__libc_start_main+0xf4) [0x31d841e074]

/usr/local/cuda/open64/lib//be [0x41791a]

nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be died due to signal 4

[/font]

Now, I can see that this rather large kernel could give the compiler some grief, but a segfault isn’t a very helpful error message <img src=‘http://hqnveipbwb20/public/style_emoticons/<#EMO_DIR#>/crying.gif’ class=‘bbc_emoticon’ alt=‘:’(’ />

I have found that if I delete the final four lines which store the results, then the code compiles. Also, if I delete some of the other portions of the routine, it will compile. Is this kernel simply too big? This code is running on CUDA 2.0:

[font=“Courier New”]$ nvcc --version

nvcc: NVIDIA ® Cuda compiler driver

Built on Thu_Jun_19_03:38:28_PDT_2008

Cuda compilation tools, release 2.0, V0.2.1221[/font]

Any suggestions would be gratefully received.

netllama · December 5, 2008, 3:05pm

Does this reproduce with the CUDA_2.1-beta ?

YDD · December 5, 2008, 3:10pm

I don’t know - and I don’t have a spare machine to install the beta on, unfortunately.

YDD · December 8, 2008, 8:48pm

A follow-up note on this… I came up with a workaround - instead of doing the accumulation in the same kernel, I wrote the results of the last cgemm call to a temporary array, and then had another kernel perform the final addition. I’m guessing that the number of memory accesses involved at the end of the original kernel exceeded some internal limit.

YDD · February 3, 2009, 4:52pm

I’m now using a machine which has been upgraded to CUDA 2.1, and another file is now making nvcc segfault. I’m trying to identify the precise problem routine, but the error is:
[font=“Courier New”]/usr/local/cuda/bin/nvcc -D_DEBUG -I. -I/usr/local/cuda/include -I/usr/local/NVIDIA_CUDA_SDK_2.10//common//…/common/inc -DUNIX -g -I/data/user/project/src -I/data/user/project/include -I/usr/local/include/wcslib -I/usr/local/ATLAS/include -I/data/user2/Healpix_2.10/include_GFOR -I/data/user/project/src/correlator -o obj/debug/ftgm.cu.o -c …/src/ftgm.cu
Signal: Segmentation fault in Global Optimization – New PRE: Expr hoisting. phase.
(0): Error: Signal Segmentation fault in phase Global Optimization – New PRE: Expr hoisting. – processing aborted
*** Internal stack backtrace:
/usr/local/cuda/open64/lib//be [0x6ad582]
/usr/local/cuda/open64/lib//be [0x6ae265]
/usr/local/cuda/open64/lib//be [0x6ad93f]
/usr/local/cuda/open64/lib//be [0x6aec18]
/lib64/libc.so.6 [0x3c548301b0]
/usr/local/cuda/open64/lib//be [0x527769]
/usr/local/cuda/open64/lib//be [0x5280f5]
/usr/local/cuda/open64/lib//be [0x4dc3b8]
/usr/local/cuda/open64/lib//be [0x4dc64b]
/usr/local/cuda/open64/lib//be [0x4347a2]
/usr/local/cuda/open64/lib//be [0x48df8a]
/usr/local/cuda/open64/lib//be [0x418682]
/usr/local/cuda/open64/lib//be [0x419451]
/usr/local/cuda/open64/lib//be [0x41a630]
/usr/local/cuda/open64/lib//be [0x41bbd8]
/lib64/libc.so.6(__libc_start_main+0xf4) [0x3c5481d8b4]
/usr/local/cuda/open64/lib//be [0x417dda]
nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be died due to signal 4
[/font]
For completeness:
[font=“Courier New”]$ /usr/local/cuda/bin/nvcc --version
nvcc: NVIDIA ® Cuda compiler driver
Copyright © 2005-2007 NVIDIA Corporation
Built on Wed_Dec__3_16:25:17_PST_2008
Cuda compilation tools, release 2.1, V0.2.1221
[/font]

This compiled without issue under CUDA 2.0. The file in question is large, and contains several kernels and device functions - I’m going to try isolating the problem routine in a moment… I may be some time. However, suggestions as to what may be causing this error are welcome <img src=‘http://hqnveipbwb20/public/style_emoticons/<#EMO_DIR#>/crying.gif’ class=‘bbc_emoticon’ alt=‘:’(’ />

shreddn · October 14, 2010, 5:30pm

I’m now using a machine which has been upgraded to CUDA 2.1, and another file is now making nvcc segfault. I’m trying to identify the precise problem routine, but the error is:

[font=“Courier New”]/usr/local/cuda/bin/nvcc -D_DEBUG -I. -I/usr/local/cuda/include -I/usr/local/NVIDIA_CUDA_SDK_2.10//common//…/common/inc -DUNIX -g -I/data/user/project/src -I/data/user/project/include -I/usr/local/include/wcslib -I/usr/local/ATLAS/include -I/data/user2/Healpix_2.10/include_GFOR -I/data/user/project/src/correlator -o obj/debug/ftgm.cu.o -c …/src/ftgm.cu

Signal: Segmentation fault in Global Optimization – New PRE: Expr hoisting. phase.

(0): Error: Signal Segmentation fault in phase Global Optimization – New PRE: Expr hoisting. – processing aborted

*** Internal stack backtrace:
/usr/local/cuda/open64/lib//be [0x6ad582]

/usr/local/cuda/open64/lib//be [0x6ae265]

/usr/local/cuda/open64/lib//be [0x6ad93f]

/usr/local/cuda/open64/lib//be [0x6aec18]

/lib64/libc.so.6 [0x3c548301b0]

/usr/local/cuda/open64/lib//be [0x527769]

/usr/local/cuda/open64/lib//be [0x5280f5]

/usr/local/cuda/open64/lib//be [0x4dc3b8]

/usr/local/cuda/open64/lib//be [0x4dc64b]

/usr/local/cuda/open64/lib//be [0x4347a2]

/usr/local/cuda/open64/lib//be [0x48df8a]

/usr/local/cuda/open64/lib//be [0x418682]

/usr/local/cuda/open64/lib//be [0x419451]

/usr/local/cuda/open64/lib//be [0x41a630]

/usr/local/cuda/open64/lib//be [0x41bbd8]

/lib64/libc.so.6(__libc_start_main+0xf4) [0x3c5481d8b4]

/usr/local/cuda/open64/lib//be [0x417dda]
nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be died due to signal 4

[/font]

For completeness:

[font=“Courier New”]$ /usr/local/cuda/bin/nvcc --version

nvcc: NVIDIA Â® Cuda compiler driver

Copyright Â© 2005-2007 NVIDIA Corporation

Built on Wed_Dec__3_16:25:17_PST_2008

Cuda compilation tools, release 2.1, V0.2.1221

[/font]

This compiled without issue under CUDA 2.0. The file in question is large, and contains several kernels and device functions - I’m going to try isolating the problem routine in a moment… I may be some time. However, suggestions as to what may be causing this error are welcome <img src=‘http://hqnveipbwb20/public/style_emoticons/<#EMO_DIR#>/crying.gif’ class=‘bbc_emoticon’ alt=‘:’(’ />

Did you ever figure this issue out? My program was working fine under numerous executions, then I simply added

[codebox]include <signal.h>

…

(void) signal(SIGINT, ex_program);[/codebox]

Then I recieved the same error you have.

Signal: Floating point exception in Code_Expansion phase.

(0): Error: Signal Floating point exception in phase Code_Expansion – processing aborted

*** Internal stack backtrace:

/usr/local/cuda/open64/lib//be [0x6d1c6f]

/usr/local/cuda/open64/lib//be [0x6d28b9]

/usr/local/cuda/open64/lib//be [0x6d200d]

/usr/local/cuda/open64/lib//be [0x6d3256]

/lib64/libc.so.6 [0x35b6630280]

/usr/local/cuda/open64/lib//be [0x592941]

/usr/local/cuda/open64/lib//be [0x592f74]

/usr/local/cuda/open64/lib//be [0x5932ef]

/usr/local/cuda/open64/lib//be [0x565d6e]

/usr/local/cuda/open64/lib//be [0x57085e]

/usr/local/cuda/open64/lib//be [0x571fb1]

/usr/local/cuda/open64/lib//be [0x570b28]

/usr/local/cuda/open64/lib//be [0x57573b]

/usr/local/cuda/open64/lib//be [0x576269]

/usr/local/cuda/open64/lib//be [0x57688c]

/usr/local/cuda/open64/lib//be [0x552722]

/usr/local/cuda/open64/lib//be [0x405443]

/usr/local/cuda/open64/lib//be [0x4061f1]

/usr/local/cuda/open64/lib//be [0x40752d]

/lib64/libc.so.6(__libc_start_main+0xf4) [0x35b661d974]

/usr/local/cuda/open64/lib//be [0x4038da]

nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be died due to signal 4

Let me know,

Thanks

shreddn · October 14, 2010, 5:30pm

I’m now using a machine which has been upgraded to CUDA 2.1, and another file is now making nvcc segfault. I’m trying to identify the precise problem routine, but the error is:

[font=“Courier New”]/usr/local/cuda/bin/nvcc -D_DEBUG -I. -I/usr/local/cuda/include -I/usr/local/NVIDIA_CUDA_SDK_2.10//common//…/common/inc -DUNIX -g -I/data/user/project/src -I/data/user/project/include -I/usr/local/include/wcslib -I/usr/local/ATLAS/include -I/data/user2/Healpix_2.10/include_GFOR -I/data/user/project/src/correlator -o obj/debug/ftgm.cu.o -c …/src/ftgm.cu

Signal: Segmentation fault in Global Optimization – New PRE: Expr hoisting. phase.

(0): Error: Signal Segmentation fault in phase Global Optimization – New PRE: Expr hoisting. – processing aborted

*** Internal stack backtrace:
/usr/local/cuda/open64/lib//be [0x6ad582]

/usr/local/cuda/open64/lib//be [0x6ae265]

/usr/local/cuda/open64/lib//be [0x6ad93f]

/usr/local/cuda/open64/lib//be [0x6aec18]

/lib64/libc.so.6 [0x3c548301b0]

/usr/local/cuda/open64/lib//be [0x527769]

/usr/local/cuda/open64/lib//be [0x5280f5]

/usr/local/cuda/open64/lib//be [0x4dc3b8]

/usr/local/cuda/open64/lib//be [0x4dc64b]

/usr/local/cuda/open64/lib//be [0x4347a2]

/usr/local/cuda/open64/lib//be [0x48df8a]

/usr/local/cuda/open64/lib//be [0x418682]

/usr/local/cuda/open64/lib//be [0x419451]

/usr/local/cuda/open64/lib//be [0x41a630]

/usr/local/cuda/open64/lib//be [0x41bbd8]

/lib64/libc.so.6(__libc_start_main+0xf4) [0x3c5481d8b4]

/usr/local/cuda/open64/lib//be [0x417dda]
nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be died due to signal 4

[/font]

For completeness:

[font=“Courier New”]$ /usr/local/cuda/bin/nvcc --version

nvcc: NVIDIA Â® Cuda compiler driver

Copyright Â© 2005-2007 NVIDIA Corporation

Built on Wed_Dec__3_16:25:17_PST_2008

Cuda compilation tools, release 2.1, V0.2.1221

[/font]

This compiled without issue under CUDA 2.0. The file in question is large, and contains several kernels and device functions - I’m going to try isolating the problem routine in a moment… I may be some time. However, suggestions as to what may be causing this error are welcome <img src=‘http://hqnveipbwb20/public/style_emoticons/<#EMO_DIR#>/crying.gif’ class=‘bbc_emoticon’ alt=‘:’(’ />

Did you ever figure this issue out? My program was working fine under numerous executions, then I simply added

[codebox]include <signal.h>

…

(void) signal(SIGINT, ex_program);[/codebox]

Then I recieved the same error you have.

Signal: Floating point exception in Code_Expansion phase.

(0): Error: Signal Floating point exception in phase Code_Expansion – processing aborted

*** Internal stack backtrace:

/usr/local/cuda/open64/lib//be [0x6d1c6f]

/usr/local/cuda/open64/lib//be [0x6d28b9]

/usr/local/cuda/open64/lib//be [0x6d200d]

/usr/local/cuda/open64/lib//be [0x6d3256]

/lib64/libc.so.6 [0x35b6630280]

/usr/local/cuda/open64/lib//be [0x592941]

/usr/local/cuda/open64/lib//be [0x592f74]

/usr/local/cuda/open64/lib//be [0x5932ef]

/usr/local/cuda/open64/lib//be [0x565d6e]

/usr/local/cuda/open64/lib//be [0x57085e]

/usr/local/cuda/open64/lib//be [0x571fb1]

/usr/local/cuda/open64/lib//be [0x570b28]

/usr/local/cuda/open64/lib//be [0x57573b]

/usr/local/cuda/open64/lib//be [0x576269]

/usr/local/cuda/open64/lib//be [0x57688c]

/usr/local/cuda/open64/lib//be [0x552722]

/usr/local/cuda/open64/lib//be [0x405443]

/usr/local/cuda/open64/lib//be [0x4061f1]

/usr/local/cuda/open64/lib//be [0x40752d]

/lib64/libc.so.6(__libc_start_main+0xf4) [0x35b661d974]

/usr/local/cuda/open64/lib//be [0x4038da]

nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be died due to signal 4

Let me know,

Thanks

Topic		Replies	Views
nvcc Segmentation Fault (addressing issue with stack vars?) local to global gather CUDA Programming and Performance	8	9951	August 15, 2010
Segfault in nvcc Trouble with CUDA 2.1 CUDA Programming and Performance	0	1473	February 5, 2009
CUDA compile trouble CUDA Programming and Performance	47	5099	November 8, 2010
<input>(0): Error: Signal Segmentation fault in phase Global Optimization -- processing aborte CUDA Programming and Performance	10	3672	July 2, 2009
Nv-nsight-cu-cli segfault Nsight Compute cuda	6	1733	May 5, 2020
NVCC Segfault on boost::format in Host side code in .cu file CUDA Programming and Performance	8	2260	February 8, 2011
NVCC forces c++ compilation of .cu files CUDA Programming and Performance	11	25421	December 11, 2011
Simple CUDA Wizard for Visual Studio 2005 CUDA Programming and Performance	100	174507	April 8, 2012
gcc 4.4 support anytime soon? CUDA Programming and Performance	24	108097	April 9, 2010
Weird error when compiling CUDA Programming and Performance	3	1261	August 2, 2011

nvcc Segfault

Related topics