Segfault in nvcc Trouble with CUDA 2.1

A machine around here has recently been upgraded to CUDA 2.1, and nvcc now segfaults when attempting to compile some (previously working) code. A minimal file to reproduce this is:

[codebox]#define CU_USE_NATIVE_COMPLEX

#include </usr/include/complex.h>

#include <cuComplex.h>

#include <cutil.h>

#include "cudacomplex.h"

#include "cudablas2x2.h"

#define NPOLS 4

#define Segfault_BLOCKSIZE 128

// ==========================================================

constant cuFloatComplex dc_rot[NPOLS];

// ==========================================================

// Minimal kernel that reproduces the nvopencc 2.1 segfault ("Global
// Optimization -- New PRE: Expr hoisting" phase).
//
// Expected launch: 1-D grid of 1-D blocks, one thread per station
// (see Segfaulter below).
//
// NOTE(review): TileGainMatrices, Jn, Jd, wgt and invFilterFactor are
// unused, and myJn/tmpmat are read uninitialised -- the surrounding code
// was deliberately excised to isolate the compiler crash, so the
// uninitialised-variable warnings are expected.
//
// The original post rendered "__global__" as "global" (forum markup ate
// the double underscores); restored here.
__global__ void SegfaultKernel( const int iCalSource,
                                const cuFloatComplex *TileGainMatrices,
                                const cuFloatComplex *Jn,
                                const cuFloatComplex *Jd,
                                const cuFloatComplex *wgt,
                                const float invFilterFactor,
                                cuFloatComplex *NewGainMatrices,
                                const int nStations ) {
  // Per-thread scratch for the 2x2 complex matrices (NPOLS == 4 entries).
  cuFloatComplex myJn[NPOLS], myJd[NPOLS];
  cuFloatComplex matNorm, tmpmat[NPOLS], tmpmat2[NPOLS];

  // Compute which station this thread will do
  int iStation = threadIdx.x + (blockDim.x*blockIdx.x);

  // Guard the grid tail: the last block may overrun nStations
  if( iStation >= nStations ) {
    return;
  }

  // The pair of cgemm2x2 calls that triggers the segfault; commenting out
  // either one lets the file compile.
  cgemm2x2<NoTrans, NoTrans>( 1, myJn, tmpmat, tmpmat2 );
  cgemm2x2<NoTrans, NoTrans>( 1, tmpmat2, dc_rot,
        &(NewGainMatrices[(iCalSource*nStations*NPOLS)+(iStation*NPOLS)]) );
}

// ==========================================================

// Host wrapper: configures a 1-D launch with Segfault_BLOCKSIZE threads per
// block, one thread per station, and invokes SegfaultKernel.
//
// NOTE(review): d_Jn, d_Jd and d_wgtSum reach the kernel uninitialised, and
// d_wgtAll is never used -- the cudaMalloc/setup code was excised for this
// minimal reproduction. A real build must allocate and fill them first.
void Segfaulter( const int iCalSource,
                 const float complex *d_TileGainMatrices,
                 const int nStations,
                 float complex *d_NewGainMatrices ) {
  cuFloatComplex *d_Jn, *d_Jd;
  cuFloatComplex *d_wgtAll, *d_wgtSum;
  dim3 grid, threads;
  float invFilterFactor = 0;

  threads.x = Segfault_BLOCKSIZE;
  threads.y = threads.z = 1;
  // Integer ceiling division: exact for any nStations >= 0, and avoids the
  // float round-trip of ceil((float)n / (float)b).
  grid.x = (nStations + threads.x - 1) / threads.x;
  grid.y = grid.z = 1;

  SegfaultKernel<<<grid,threads>>>( iCalSource,
                                    (const cuFloatComplex*)d_TileGainMatrices,
                                    d_Jn,
                                    d_Jd,
                                    d_wgtSum,
                                    invFilterFactor,
                                    (cuFloatComplex*)d_NewGainMatrices,
                                    nStations );
  // Surface launch/execution errors instead of failing silently later.
  // CUT_CHECK_ERROR comes from the already-included cutil.h (active in
  // debug builds in the 2.x SDK).
  CUT_CHECK_ERROR( "SegfaultKernel launch failed" );
}

[/codebox]

The two header files referenced are attached to this post. According to [font=“Courier New”]/usr/local/cuda/bin/nvcc --version[/font] I have [font=“Courier New”]Cuda compilation tools, release 2.1, V0.2.1221[/font]. My platform (according to [font=“Courier New”]uname -a[/font]) is

[font=“Courier New”]Linux privatehost.privatedomain 2.6.18-92.1.22.el5 #1 SMP Tue Dec 16 11:57:43 EST 2008 x86_64 x86_64 x86_64 GNU/Linux[/font].

My actual compile command is as follows:

[font=“Courier New”]/usr/local/cuda/bin/nvcc --compiler-options -fno-strict-aliasing -I. -I/usr/local/cuda/include -I/usr/local/NVIDIA_CUDA_SDK_2.10//common//../common/inc -DUNIX -O3 -o ftgm.cu.o -c ./ftgm.cu[/font]

After a few warnings about uninitialised variables (appearing because I’ve excised code which doesn’t segfault), the compiler segfaults with

[font=“Courier New”]Signal: Segmentation fault in Global Optimization – New PRE: Expr hoisting. phase.

(0): Error: Signal Segmentation fault in phase Global Optimization – New PRE: Expr hoisting. – processing aborted

*** Internal stack backtrace:

/usr/local/cuda/open64/lib//be [0x6ad582]

/usr/local/cuda/open64/lib//be [0x6ae265]

/usr/local/cuda/open64/lib//be [0x6ad93f]

/usr/local/cuda/open64/lib//be [0x6aec18]

/lib64/libc.so.6 [0x3c548301b0]

/usr/local/cuda/open64/lib//be [0x527769]

/usr/local/cuda/open64/lib//be [0x5280f5]

/usr/local/cuda/open64/lib//be [0x4dc3b8]

/usr/local/cuda/open64/lib//be [0x4dc64b]

/usr/local/cuda/open64/lib//be [0x4347a2]

/usr/local/cuda/open64/lib//be [0x48df8a]

/usr/local/cuda/open64/lib//be [0x418682]

/usr/local/cuda/open64/lib//be [0x419451]

/usr/local/cuda/open64/lib//be [0x41a630]

/usr/local/cuda/open64/lib//be [0x41bbd8]

/lib64/libc.so.6(__libc_start_main+0xf4) [0x3c5481d8b4]

/usr/local/cuda/open64/lib//be [0x417dda]

nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be died due to signal 4[/font]

The problem appears centred around the cgemm2x2 calls. If either is commented out as well, then the code compiles.

Is this a known problem? Are there any suggested work-arounds, short of going back to CUDA 2.0?
cudablas2x2.h (4.13 KB)
cudacomplex.h (2.3 KB)