Why I get Compiler Error ... during Register Allocation phase

Hello,

I have a little code to calculate some price of Options; the BSAmericanApprox2002CD_kernel.cuh file has 297 lines and main_BDCuda.cu file has 182 lines, but get the error

make
/tmp/tmpxft_0000710d_00000000-7_main_BDCuda.cpp3.i(0): Warning: Olimit was exceeded on function _Z23BSAmericanApprox2002GPUPfS_S_S_S_S_S_S_i; will not perform function-scope optimization.
To still perform function-scope optimization, use -OPT:Olimit=0 (no limit) or -OPT:Olimit=118245

Assertion failure at line 2498 of …/…/be/cg/NVISA/cgtarget.cxx:

Compiler Error in file /tmp/tmpxft_0000710d_00000000-7_main_BDCuda.cpp3.i during Register Allocation phase:

ran out of registers in float

nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be returned non-zero status 1
make: *** [obj/Release/main_BDCuda.cu.o] Error 255

here the two files in attachement

Please help.

Thank You.

Ciprian

#ifndef BSAMERICANAPPROX2002CD_KERNEL_CUH_
#define BSAMERICANAPPROX2002CD_KERNEL_CUH_

device inline float ArcSin(float X)
{
if (fabs(X)== 1)
{

	return ( (X<0) ? -1 : 1 ) * PI / ftwo;
}
else
	return atan(X / sqrt(fone - X *X));

}

device inline float CND(RegTemp* d_regData, float x )
{
d_regData->y = fabs(x);
if( d_regData->y > 37) return 0;
else
{
d_regData->Exponential = expf(-d_regData->y *d_regData->y / CND_c00);

	if (d_regData->y < CND_c01 )
	{
		d_regData->SumA = CND_c02;
		d_regData->SumA = d_regData->SumA * d_regData->y + CND_c03;
		d_regData->SumA = d_regData->SumA * d_regData->y + CND_c04;
		d_regData->SumA = d_regData->SumA * d_regData->y + CND_c05;
		d_regData->SumA = d_regData->SumA * d_regData->y + CND_c06;
		d_regData->SumA = d_regData->SumA * d_regData->y + CND_c07;
		d_regData->SumA = d_regData->SumA * d_regData->y + CND_c08;

		d_regData->SumB = CND_c09;
		d_regData->SumB = d_regData->SumB * d_regData->y + CND_c10;
		d_regData->SumB = d_regData->SumB * d_regData->y + CND_c11;
		d_regData->SumB = d_regData->SumB * d_regData->y + CND_c12;
		d_regData->SumB = d_regData->SumB * d_regData->y + CND_c13;
		d_regData->SumB = d_regData->SumB * d_regData->y + CND_c14;
		d_regData->SumB = d_regData->SumB * d_regData->y + CND_c15;
		d_regData->SumB = d_regData->SumB * d_regData->y + CND_c16;

		d_regData->cnd= d_regData->Exponential * d_regData->SumA / d_regData->SumB;
	}
	else
	{
		d_regData->SumA = d_regData->y + CND_c17;
		d_regData->SumA = d_regData->y + CND_c18 / d_regData->SumA;
		d_regData->SumA = d_regData->y + CND_c19 / d_regData->SumA;
		d_regData->SumA = d_regData->y + CND_c20 / d_regData->SumA;
		d_regData->SumA = d_regData->y + CND_c21 / d_regData->SumA;
		d_regData->cnd=d_regData->Exponential / (d_regData->SumA * CND_c22);
	}
}

if (x > 0) return d_regData->cnd =CND_c23 - d_regData->cnd;

return d_regData->cnd;

}

device inline float calcBSCallPrice (RegTemp* d_regData, float& price,float& strike, float& T,float& rate, float& b,float& vol )
{
d_regData->d1 = (logf(price/strike) + (b + (vol)*(vol)/2 )T)/(volsqrtf(T));

return price*expf((b-rate)*T)*CND(d_regData, d_regData->d1) - strike*expf(-rate*T)* CND(d_regData, d_regData->d1 - vol*sqrtf(T));

}

// The cumulative bivariate normal distribution function
device inline float CBND(RegTemp* d_regData, float X , float y ,float rho)
{
d_regData->BVN = 0;

d_regData->NG = 1;
d_regData->LG = 3;
if (fabs(rho) < 0.3)
{
	d_regData->NG = 1;
	d_regData->LG = 3;
}
else if( fabs(rho) < 0.75)
{
	d_regData->NG = 2;
	d_regData->LG = 6;
}
else
{
	d_regData->NG = 3;
	d_regData->LG = 10;
}

d_regData->h = -X;
d_regData->k = -y;
d_regData->hk = d_regData->h * d_regData->k;

if(fabs(rho) < 0.925 )
{
	if( fabs(rho) > 0 )
	{
		d_regData->hs = (d_regData->h * d_regData->h + d_regData->k * d_regData->k) / ftwo;
		d_regData->asr = ArcSin(rho);
		for (d_regData->i = 1;d_regData->i<= d_regData->LG;d_regData->i++)
			for( d_regData->ISs = -1;d_regData->ISs<=1;d_regData->ISs=d_regData->ISs+2)
			{
				d_regData->sn = sin(d_regData->asr * (d_regData->ISs * __XX[d_regData->i][ d_regData->NG] + 1) / ftwo);
				d_regData->BVN = d_regData->BVN + __W[d_regData->i][ d_regData->NG] * expf((d_regData->sn * d_regData->hk - d_regData->hs) / (1 - d_regData->sn * d_regData->sn));
			}//Next ISs
			//Next i

		d_regData->BVN = d_regData->BVN * d_regData->asr / (4.0 * PI);
	}
	d_regData->BVN = d_regData->BVN + CND(d_regData, -d_regData->h) * CND(d_regData, -d_regData->k);
}
else
{
	if( rho < 0 )
	{
		d_regData->k = -d_regData->k;
		d_regData->hk = -d_regData->hk;
	}

	if( fabs(rho) < 1)
	{
		d_regData->Ass = (fone - rho) * (fone + rho);
		d_regData->A = sqrtf(d_regData->Ass);
		d_regData->bs = powf( (d_regData->h - d_regData->k) , 2);
		d_regData->c = (4.0 - d_regData->hk) / 8.0;
		d_regData->d = (12.0 - d_regData->hk) / 16.0;
		d_regData->asr = -(d_regData->bs / d_regData->Ass + d_regData->hk) / ftwo;
		//} -1
		if( d_regData->asr > -100 ) d_regData->BVN = d_regData->A * expf(d_regData->asr) * (fone - d_regData->c * (d_regData->bs - d_regData->Ass) * (fone - d_regData->d * d_regData->bs / ffive) / 3.0 + d_regData->c * d_regData->d * d_regData->Ass * d_regData->Ass / ffive);
		if( -d_regData->hk < 100 )
		{
			d_regData->b = sqrtf(d_regData->bs);
			d_regData->BVN = d_regData->BVN - expf(-d_regData->hk / ftwo) * sqrtf(ftwo * PI) * CND(d_regData, -d_regData->b / d_regData->A) * d_regData->b * (fone - d_regData->c * d_regData->bs * (fone - d_regData->d * d_regData->bs / ffive) / 3.0);
		}
		d_regData->A = d_regData->A / ftwo;
		for(d_regData->i = 1;d_regData->i<= d_regData->LG;d_regData->i++)   //for(i = 1;i< LG
		{
			for (d_regData->ISs = -1;d_regData->ISs<=1 ;d_regData->ISs+=2)
			{
				d_regData->xs =powf( (d_regData->A * (d_regData->ISs * __XX[d_regData->i][d_regData->NG] + 1)) , 2);
				d_regData->rs = sqrtf(fone - d_regData->xs);
				d_regData->asr = -(d_regData->bs / d_regData->xs + d_regData->hk) / ftwo;
				if( d_regData->asr > -100)
					d_regData->BVN = d_regData->BVN + d_regData->A * __W[d_regData->i][ d_regData->NG] * expf(d_regData->asr) * (expf(-d_regData->hk * (fone - d_regData->rs) / (ftwo * (fone + d_regData->rs))) / d_regData->rs
					- (fone + d_regData->c * d_regData->xs * (fone + d_regData->d * d_regData->xs)));
			}
		}
		d_regData->BVN = -d_regData->BVN / (ftwo * PI);
	}//0
	if( rho > 0 )
	{
		//float max;
		//if(h>k) max=h;
		//else max=k;

		d_regData->BVN = d_regData->BVN + CND(d_regData, - ( (d_regData->h>d_regData->k) ? d_regData->h : d_regData->k) );
	}
	else
	{
		d_regData->BVN = -d_regData->BVN;

		if( d_regData->k > d_regData->h) d_regData->BVN = d_regData->BVN + CND(d_regData, d_regData->k) - CND(d_regData, d_regData->h);
	}
	//End if(
	//End if(
}

return d_regData->BVN;

}

device inline float phi(RegTemp* d_regData, float S, float T, float gamma, float h, float i, float r, float b, float v)
{
//lambda = (-r + gamma * b + 0.5 * gamma * (gamma - 1.0) * (v *v)) * T;
d_regData->d = -(logf(S / h) + (b + (gamma - 0.5) * (v *v)) * T) / (v * sqrtf(T));
//kappa = 2.0 * b /(v *v) + (2.0 * gamma - 1);

return (	exp((-r + gamma * b + 0.5 * gamma * (gamma - 1.0) * (v *v)) * T) *
			powf(S, gamma) *
			(	CND(d_regData, d_regData->d) - powf((i / S), 2.0 * b /(v *v) + (2.0 * gamma - 1))*
				CND(d_regData, d_regData->d - 2.0 * logf(i / S) / (v * sqrtf(T)))
			)
		);

}

device inline float ksi(RegTemp* d_regData, float S ,float T2 ,float gamma , float h ,float I2 ,float I1 ,float t1 ,float r ,float b ,float v )
{
d_regData->rho = sqrtf(t1 / T2);
//lambda = -r + gamma * b + fzpfive * gamma * (gamma - fone) * ( v*v);
d_regData->kappa = ftwo * b / (v *v) + (ftwo * gamma - fone);

return
	expf(-r + gamma * b + fzpfive * gamma * (gamma - fone) * ( v*v) * T2) *
	powf(S , gamma) *
	(	CBND(d_regData, 	-((log(S / I1) + (b + (gamma - fzpfive) * (v *v)) * t1) / (v * sqrtf(t1))),
				-(log(S / h) + (b + (gamma - fzpfive) * (v *v)) * T2) / (v * sqrtf(T2)),
				d_regData->rho) -
		powf( (I2 / S) , d_regData->kappa) *
			CBND(d_regData, 	-((log((I2 *I2) / (S * I1)) + (b + (gamma - fzpfive) * (v *v)) * t1) / (v * sqrtf(t1))),
					-(log((I2 * I2) / (S * h)) + (b + (gamma - fzpfive) * (v *v)) * T2) / (v * sqrtf(T2)),
					d_regData->rho) -
		powf( (I1 / S) , d_regData->kappa) *
			CBND(d_regData, 	-((log(S / I1) - (b + (gamma - fzpfive) * (v *v)) * t1) / (v * sqrtf(t1))),
					-(log((I1 *I1) / (S * h)) + (b + (gamma - fzpfive) * (v *v)) * T2) / (v * sqrtf(T2)),
					-d_regData->rho) +
        powf((I1 / I2) , d_regData->kappa) *
			CBND(d_regData, 	-((log((I2 *I2) / (S * I1)) - (b + (gamma - fzpfive) * (v *v)) * t1) / (v * sqrtf(t1))),
					-(log(S * (I1 *I1 ) / (h * (I2 *I2))) + (b + (gamma - fzpfive) * (v *v)) * T2) / (v * sqrtf(T2)),
					-d_regData->rho)
    );

}

device inline float BSAmericanCallApprox2002PC(RegTemp* d_regData, float S,float X ,float T ,float r,float b , float v )
{
d_regData->t1 = fone / ftwo * (sqrtf(ffive) - 1) * T;

if( b >= r )   // Never optimal to exersice before maturity
        return calcBSCallPrice(d_regData,  S, X, T, r, b, v);
else
{
    d_regData->Beta = (fone / ftwo - b / (v *v)) + sqrtf( powf((b /(v *v) - fone / ftwo), 2) + ftwo * r / (v *v));
    d_regData->BInfinity = d_regData->Beta / (d_regData->Beta - 1) * X;

    d_regData->m1=X;
    d_regData->m2=r/(r-B)*X;
    if(d_regData->m1>d_regData->m2) d_regData->B0=d_regData->m1;  //d_regData->B0 = max(X, r / (r - B) * X);
    else d_regData->B0=d_regData->m2;

    d_regData->ht1 = -(b * d_regData->t1 + ftwo * v * sqrtf(d_regData->t1)) * (X *X) / ((d_regData->BInfinity - d_regData->B0) * d_regData->B0);
    d_regData->ht2 = -(b * T + ftwo * v * sqrtf(T)) * (X *X) / ((d_regData->BInfinity - d_regData->B0) * d_regData->B0);
    d_regData->I1 = d_regData->B0 + (d_regData->BInfinity - d_regData->B0) * (fone - expf(d_regData->ht1));
    d_regData->I2 = d_regData->B0 + (d_regData->BInfinity - d_regData->B0) * (fone - expf(d_regData->ht2));
    d_regData->alfa1 = (d_regData->I1 - X) * powf( d_regData->I1 , (-d_regData->Beta));
    d_regData->alfa2 = (d_regData->I2 - X) * powf(d_regData->I2 , (-d_regData->Beta));

    if(S >= d_regData->I2)
        return S - X;
    else
    {
		return ( d_regData->alfa2 * powf(S , d_regData->Beta) -
				d_regData->alfa2 * phi(d_regData, S, d_regData->t1, d_regData->Beta, d_regData->I2, d_regData->I2, r, b, v) +
						phi(d_regData, S, d_regData->t1, 1, d_regData->I2, d_regData->I2, r, b, v) -
						phi(d_regData, S, d_regData->t1, 1, d_regData->I1, d_regData->I2, r, b, v) -
				X * 	phi(d_regData, S, d_regData->t1, 0, d_regData->I2, d_regData->I2, r, b, v) +
				X * 	phi(d_regData, S, d_regData->t1, 0, d_regData->I1, d_regData->I2, r, b, v) +
				d_regData->alfa1 * phi(d_regData, S, d_regData->t1, d_regData->Beta, d_regData->I1, d_regData->I2, r, b, v) -
				d_regData->alfa1 * ksi(d_regData, S, T, d_regData->Beta, d_regData->I1, d_regData->I2, d_regData->I1, d_regData->t1, r, b, v) +
						ksi(d_regData, S, T, 1, d_regData->I1, d_regData->I2, d_regData->I1, d_regData->t1, r, b, v) -
						ksi(d_regData, S, T, 1, X, d_regData->I2, d_regData->I1, d_regData->t1, r, b, v) -
				X * 	ksi(d_regData, S, T, 0, d_regData->I1, d_regData->I2, d_regData->I1, d_regData->t1, r, b, v) +
				X * 	ksi(d_regData, S, T, 0, X, d_regData->I2, d_regData->I1, d_regData->t1, r, b, v)  );
	}
}

}

// The cumulative bivariate normal distribution function
device inline void BSAmericanApprox2002(RegTemp* d_regData, float& call,float& put, float& S , float& X , float& T ,float& r ,float& b ,float& v )
{
call = BSAmericanCallApprox2002PC(d_regData, S, X, T, r, b, v);

// Use the Bjerksund and Stensland put-call transformation
put = BSAmericanCallApprox2002PC(d_regData, X, S, T, r-b, -b, v);

}

////////////////////////////////////////////////////////////////////////////////
//Process an array of optN options on GPU
////////////////////////////////////////////////////////////////////////////////
global void BSAmericanApprox2002GPU(float *d_CallResult, float *d_PutResult,
float *d_StockPrice, float *d_OptionStrike, float *d_OptionYears,
float *d_Riskfree, float *d_b, float *d_OptionVols, int optN)
{
//Thread index
const int tid = blockDim.x * blockIdx.x + threadIdx.x;
//Total number of threads in execution grid
const int THREAD_N = blockDim.x * gridDim.x;

//No matter how small is execution grid or how large OptN is,
//exactly OptN indices will be processed with perfect memory coalescing
for(int opt = tid; opt < optN; opt += THREAD_N)
{
	RegTemp data = d_regData[tid];

	BSAmericanApprox2002(
		&data,
		d_CallResult[opt],
		d_PutResult[opt],
		d_StockPrice[opt],
		d_OptionStrike[opt],
		d_OptionYears[opt],
		d_Riskfree[opt],
		d_b[opt],
		d_OptionVols[opt]
	);
}

}

#endif /* BSAMERICANAPPROX2002CD_KERNEL_CUH_ */

#include <cutil_inline.h>
#include

device constant float __XX[11][4] = {
{ 0,0,0,0 },
{ 0,-0.932469514203152,-0.981560634246719,-0.993128599185095 },
{ 0,-0.661209386466265,-0.904117256370475,-0.963971927277914 },
{ 0,-0.238619186083197,-0.769902674194305,-0.912234428251326 },
{ 0,0,-0.587317954286617,-0.839116971822219 },
{ 0,0,-0.36783149899818,-0.746331906460151 },
{ 0,0,-0.125233408511469,-0.636053680726515 },
{ 0,0,0,-0.510867001950827 },
{ 0,0,0,-0.37370608871542 },
{ 0,0,0,-0.227785851141645 },
{ 0,0,0,-0.0765265211334973 }};

device constant float __W[11][4] = {
{ 0,0,0,0 },
{ 0,0.17132449237917,0.0471753363865118,0.0176140071391521 },
{ 0,0.360761573048138,0.106939325995318,0.0406014298003869 },
{ 0,0.46791393457269,0.160078328543346,0.0626720483341091 },
{ 0,0,0.203167426723066,0.0832767415767048 },
{ 0,0,0.233492536538355,0.10193011981724 },
{ 0,0,0.249147045813403,0.118194531961518 },
{ 0,0,0,0.131688638449177 },
{ 0,0,0,0.142096109318382 },
{ 0,0,0,0.149172986472604 },
{ 0,0,0,0.152753387130726 }};

device constant float fone = 1.0;
device constant float ftwo = 2.0;
device constant float ffive = 5.0;
device constant float fzpfive = 0.5;

device constant float CND_c00 = 2.0;

device constant float CND_c01 = 7.07106781186547;
device constant float CND_c02 = 0.0352624965998911;
device constant float CND_c03 = 0.700383064443688;
device constant float CND_c04 = 6.37396220353165;
device constant float CND_c05 = 33.912866078383;
device constant float CND_c06 = 112.079291497871;
device constant float CND_c07 = 221.213596169931;
device constant float CND_c08 = 220.206867912376;

device constant float CND_c09 = 0.0883883476483184;
device constant float CND_c10 = 1.75566716318264;
device constant float CND_c11 = 16.064177579207;
device constant float CND_c12 = 86.7807322029461;
device constant float CND_c13 = 296.564248779674;
device constant float CND_c14 = 637.333633378831;
device constant float CND_c15 = 793.826512519948;
device constant float CND_c16 = 440.413735824752;

device constant float CND_c17 = 0.65;
device constant float CND_c18 = 4.0;
device constant float CND_c19 = 3.0;
device constant float CND_c20 = 2.0;
device constant float CND_c21 = 1.0;

device constant float CND_c22 = 2.506628274631;

device constant float CND_c23 = 1.0;

device constant const float PI = 3.141592653589793;

typedef struct align(16)
{
//CND
float y , Exponential , SumA, SumB ;
float cnd;

//calcBSCallPrice
float d1;

//CBND
int i , ISs ;
int  LG , NG ;
float h , k , hk , hs ,BVN ,Ass , asr , sn;
float A  ,b , bs , c , d ;
float xs , rs;
//ksi
float rho , kappa;

//phi
//float d;

//BSAmericanCallApprox2002PC
float BInfinity , B0 ;
float ht1 , ht2 , I1 , I2;
float alfa1 , alfa2 , Beta , t1 ;
float m1, m2;

} RegTemp;

device RegTemp d_regData[32*16];

#include “BSAmericanApprox2002CD_kernel.cuh”

extern “C”
void BSCD_Init(int algType, unsigned int len, float *S, float *X, float *T, float *r, float b, float v,
float
calls,float
puts)
{
int dimGrid = 32;
int dimBlock = 16;

int OPT_SZ = len * sizeof(float);

//'d_' prefix - GPU (device) memory space
float
    //Results calculated by GPU
    *d_CallResult,
    *d_PutResult,
    //GPU instance of input data
    *d_StockPrice,
    *d_OptionStrike,
    *d_OptionYears,
    *d_OptionRate,
    *d_OptionB,
    *d_OptionVols;

cudaSetDevice( cutGetMaxGflopsDeviceId() );

cutilSafeCall( cudaMalloc((void **)&d_CallResult,   OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_PutResult,    OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_StockPrice,   OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_OptionStrike, OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_OptionYears,  OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_OptionRate,  	OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_OptionB,  	OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_OptionVols, 	OPT_SZ) );

//Copy options data to GPU memory for further processing
cutilSafeCall( cudaMemcpy(d_StockPrice,  	S,	OPT_SZ, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_OptionStrike, 	X, 	OPT_SZ, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_OptionYears,  	T,	OPT_SZ, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_OptionRate,  	r,	OPT_SZ, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_OptionB,  		b,	OPT_SZ, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_OptionVols,  	v,	OPT_SZ, cudaMemcpyHostToDevice) );

cutilSafeCall( cudaThreadSynchronize() );
for(int i = 0; i < 1; i++)	//one iteration
{
		BSAmericanApprox2002GPU<<<dimGrid, dimBlock>>>(
			//d_regData,
			d_CallResult,
			d_PutResult,
			d_StockPrice,
			d_OptionStrike,
			d_OptionYears,
			d_OptionRate,
			d_OptionB,
			d_OptionVols,
			len
		);

}
cutilSafeCall( cudaThreadSynchronize() );

//Read back GPU results to compare them to CPU results
cutilSafeCall( cudaMemcpy(calls, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost) );
cutilSafeCall( cudaMemcpy(puts,  d_PutResult,  OPT_SZ, cudaMemcpyDeviceToHost) );

//Release memory
cutilSafeCall( cudaFree(d_PutResult)    );
cutilSafeCall( cudaFree(d_CallResult)   );
cutilSafeCall( cudaFree(d_StockPrice)  );
cutilSafeCall( cudaFree(d_OptionStrike) );
cutilSafeCall( cudaFree(d_OptionYears)  );
cutilSafeCall( cudaFree(d_OptionRate)  );
cutilSafeCall( cudaFree(d_OptionB)  );
cutilSafeCall( cudaFree(d_OptionVols)  );

cudaThreadExit();

}