I’m trying to convert a C code to CUDA, and my conversion is causing nvcc to segfault. The offending function is as follows:
[codebox]global void RotateAccumulateKernel( const cuFloatComplex *dJinvJ0df,
const cuFloatComplex *dJinvJ0dt,
const float df, const float dt_sec,
const int *st1, const int *st2,
const float sha0, const float sdec0,
const float cha0, const float cdec0,
const float l, const float m, const float n,
const float decorrelationConst,
const float decorrelationTemp,
const float SourceFactor,
const float *X, const float *Y, const float *Z,
const int nStations, const int nTimes,
const int nBaselines,
const cuFloatComplex *cv_visdata, const float *cv_invvar,
cuFloatComplex *bv_visdata ) {
float B_x, B_y, B_z;
float u, v, w;
float FringeRateFactor;
cuFloatComplex PhaseShift;
cuFloatComplex VIS[MAX_POLS], tmp1[MAX_POLS], tmp2[MAX_POLS];
cuFloatComplex tmp3[MAX_POLS], inv[MAX_POLS];
int iBaseline = threadIdx.x + (blockDim.x*blockIdx.x);
const int iTime = 0;
if( iBaseline >= nBaselines ) {
return;
}
int iStation1 = st1[iBaseline];
int iStation2 = st2[iBaseline];
B_x = X[iStation1]-X[iStation2];
B_y = Y[iStation1]-Y[iStation2];
B_z = Z[iStation1]-Z[iStation2];
HADec2uvwGPU( sha0, sdec0, cha0, cdec0, B_x, B_y, B_z, u, v, w);
if ( u == 0 ) {
FringeRateFactor = 1.0;
} else {
FringeRateFactor = ( ( decorrelationConst * u ) / sinf( decorrelationConst * u ) );
FringeRateFactor /= ( ( decorrelationTemp * u ) / sinf( decorrelationTemp * u ) );
}
float temp = C2R_SIGND2PI( ul + vm + w*n );
PhaseShift = make_cuFloatComplex( cosf(temp), sinf(temp) ) * FringeRateFactor * SourceFactor;
for( int i=0; i<MAX_POLS; i++ ) {
VIS[i] = cv_visdata[iBaseline + (iTime*nBaselines) + (i*nTimes*nBaselines)];
VIS[i] *= cv_invvar[iBaseline + (iTime*nBaselines) + (i*nTimes*nBaselines)];
VIS[i] *= PhaseShift;
}
tmp1[0] = 1.0f + dJinvJ0df[0+(iStation1*MAX_POLS)]df + dJinvJ0dt[0+(iStation1MAX_POLS)]*dt_sec;
tmp1[1] = 0.0f + dJinvJ0df[1+(iStation1*MAX_POLS)]df + dJinvJ0dt[1+(iStation1MAX_POLS)]*dt_sec;
tmp1[2] = 0.0f + dJinvJ0df[2+(iStation1*MAX_POLS)]df + dJinvJ0dt[2+(iStation1MAX_POLS)]*dt_sec;
tmp1[3] = 1.0f + dJinvJ0df[3+(iStation1*MAX_POLS)]df + dJinvJ0dt[3+(iStation1MAX_POLS)]*dt_sec;
tmp2[0] = 1.0f + dJinvJ0df[0+(iStation2*MAX_POLS)]df + dJinvJ0dt[0+(iStation2MAX_POLS)]*dt_sec;
tmp2[1] = 0.0f + dJinvJ0df[1+(iStation2*MAX_POLS)]df + dJinvJ0dt[1+(iStation2MAX_POLS)]*dt_sec;
tmp2[2] = 0.0f + dJinvJ0df[2+(iStation2*MAX_POLS)]df + dJinvJ0dt[2+(iStation2MAX_POLS)]*dt_sec;
tmp2[3] = 1.0f + dJinvJ0df[3+(iStation2*MAX_POLS)]df + dJinvJ0dt[3+(iStation2MAX_POLS)]*dt_sec;
for( int i=0; i<MAX_POLS; i++ ) {
tmp2[i] = cuConjf( tmp2[i] );
}
invert2x2( tmp1, inv );
tmp1[0] = inv[0]*VIS[pp] + inv[1]*VIS[qp];
tmp1[1] = inv[0]*VIS[pq] + inv[1]*VIS[qq];
tmp1[2] = inv[2]*VIS[pp] + inv[3]*VIS[qp];
tmp1[3] = inv[2]*VIS[pq] + inv[3]*VIS[qq];
invert2x2( tmp2, inv );
cgemm2x2<NoTrans,NoTrans>( 1, tmp1, inv, tmp3 );
bv_visdata[iBaseline + (iTimenBaselines) + (0nTimes*nBaselines)] += tmp3[0];
bv_visdata[iBaseline + (iTimenBaselines) + (2nTimes*nBaselines)] += tmp3[1];
bv_visdata[iBaseline + (iTimenBaselines) + (3nTimes*nBaselines)] += tmp3[2];
bv_visdata[iBaseline + (iTimenBaselines) + (1nTimes*nBaselines)] += tmp3[3];
}[/codebox]
The value of [font=“Courier New”]MAX_POLS[/font] is 4, and the cgemm2x2 functions are working fine in other kernels, as is [font=“Courier New”]HADec2uvwGPU[/font]. When I try compiling it, the compiler complains:
[font=“Courier New”]/usr/local/cuda/bin/nvcc --compiler-options -fno-strict-aliasing -I. -I/usr/local/cuda/include -I/usr/local/NVIDIA_CUDA_SDK//common//…/common/inc -DUNIX -O3 -I/home/user/prog/src -I/home/user/prog/include -I/usr/local/include/wcslib -I/usr/include/atlas/ -I/usr/include/cfitsio/ -I/home/user/HPlib/include/ -o obj/release/myfile.cu_o -c …/src/myfile.cu
Signal: Segmentation fault in Global Optimization – LPRE: Var phi placement phase.
(0): Error: Signal Segmentation fault in phase Global Optimization – LPRE: Var phi placement – processing aborted
*** Internal stack backtrace:
/usr/local/cuda/open64/lib//be [0x69d232]
/usr/local/cuda/open64/lib//be [0x69df15]
/usr/local/cuda/open64/lib//be [0x69d5ef]
/usr/local/cuda/open64/lib//be [0x69e8c8]
/lib64/libc.so.6 [0x31d8430f30]
/usr/local/cuda/open64/lib//be [0x4dcd9a]
/usr/local/cuda/open64/lib//be [0x52280b]
/usr/local/cuda/open64/lib//be [0x5231d8]
/usr/local/cuda/open64/lib//be [0x50f2ec]
/usr/local/cuda/open64/lib//be [0x50f948]
/usr/local/cuda/open64/lib//be [0x434888]
/usr/local/cuda/open64/lib//be [0x4889da]
/usr/local/cuda/open64/lib//be [0x4181aa]
/usr/local/cuda/open64/lib//be [0x418f81]
/usr/local/cuda/open64/lib//be [0x41a160]
/usr/local/cuda/open64/lib//be [0x41b708]
/lib64/libc.so.6(__libc_start_main+0xf4) [0x31d841e074]
/usr/local/cuda/open64/lib//be [0x41791a]
nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be died due to signal 4
[/font]
Now, I can see that this rather large kernel could give the compiler some grief, but a segfault isn’t a very helpful error message <img src=‘http://hqnveipbwb20/public/style_emoticons/<#EMO_DIR#>/crying.gif’ class=‘bbc_emoticon’ alt=‘:’(’ />
I have found that if I delete the final four lines which store the results, then the code compiles. Also, if I delete some of the other portions of the routine, it will compile. Is this kernel simply too big? This code is running on CUDA 2.0:
[font=“Courier New”]$ nvcc --version
nvcc: NVIDIA ® Cuda compiler driver
Copyright © 2005-2007 NVIDIA Corporation
Built on Thu_Jun_19_03:38:28_PDT_2008
Cuda compilation tools, release 2.0, V0.2.1221[/font]
Any suggestions would be gratefully received.