Hello,
I have a little code to calculate some price of Options; the BSAmericanApprox2002CD_kernel.cuh file has 297 lines and main_BDCuda.cu file has 182 lines, but get the error
make
/tmp/tmpxft_0000710d_00000000-7_main_BDCuda.cpp3.i(0): Warning: Olimit was exceeded on function _Z23BSAmericanApprox2002GPUPfS_S_S_S_S_S_S_i; will not perform function-scope optimization.
To still perform function-scope optimization, use -OPT:Olimit=0 (no limit) or -OPT:Olimit=118245
Assertion failure at line 2498 of …/…/be/cg/NVISA/cgtarget.cxx:
Compiler Error in file /tmp/tmpxft_0000710d_00000000-7_main_BDCuda.cpp3.i during Register Allocation phase:
ran out of registers in float
nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be returned non-zero status 1
make: *** [obj/Release/main_BDCuda.cu.o] Error 255
here the two files in attachement
Please help.
Thank You.
Ciprian
#ifndef BSAMERICANAPPROX2002CD_KERNEL_CUH_
#define BSAMERICANAPPROX2002CD_KERNEL_CUH_
device inline float ArcSin(float X)
{
if (fabs(X)== 1)
{
return ( (X<0) ? -1 : 1 ) * PI / ftwo;
}
else
return atan(X / sqrt(fone - X *X));
}
device inline float CND(RegTemp* d_regData, float x )
{
d_regData->y = fabs(x);
if( d_regData->y > 37) return 0;
else
{
d_regData->Exponential = expf(-d_regData->y *d_regData->y / CND_c00);
if (d_regData->y < CND_c01 )
{
d_regData->SumA = CND_c02;
d_regData->SumA = d_regData->SumA * d_regData->y + CND_c03;
d_regData->SumA = d_regData->SumA * d_regData->y + CND_c04;
d_regData->SumA = d_regData->SumA * d_regData->y + CND_c05;
d_regData->SumA = d_regData->SumA * d_regData->y + CND_c06;
d_regData->SumA = d_regData->SumA * d_regData->y + CND_c07;
d_regData->SumA = d_regData->SumA * d_regData->y + CND_c08;
d_regData->SumB = CND_c09;
d_regData->SumB = d_regData->SumB * d_regData->y + CND_c10;
d_regData->SumB = d_regData->SumB * d_regData->y + CND_c11;
d_regData->SumB = d_regData->SumB * d_regData->y + CND_c12;
d_regData->SumB = d_regData->SumB * d_regData->y + CND_c13;
d_regData->SumB = d_regData->SumB * d_regData->y + CND_c14;
d_regData->SumB = d_regData->SumB * d_regData->y + CND_c15;
d_regData->SumB = d_regData->SumB * d_regData->y + CND_c16;
d_regData->cnd= d_regData->Exponential * d_regData->SumA / d_regData->SumB;
}
else
{
d_regData->SumA = d_regData->y + CND_c17;
d_regData->SumA = d_regData->y + CND_c18 / d_regData->SumA;
d_regData->SumA = d_regData->y + CND_c19 / d_regData->SumA;
d_regData->SumA = d_regData->y + CND_c20 / d_regData->SumA;
d_regData->SumA = d_regData->y + CND_c21 / d_regData->SumA;
d_regData->cnd=d_regData->Exponential / (d_regData->SumA * CND_c22);
}
}
if (x > 0) return d_regData->cnd =CND_c23 - d_regData->cnd;
return d_regData->cnd;
}
device inline float calcBSCallPrice (RegTemp* d_regData, float& price,float& strike, float& T,float& rate, float& b,float& vol )
{
d_regData->d1 = (logf(price/strike) + (b + (vol)*(vol)/2 )T)/(volsqrtf(T));
return price*expf((b-rate)*T)*CND(d_regData, d_regData->d1) - strike*expf(-rate*T)* CND(d_regData, d_regData->d1 - vol*sqrtf(T));
}
// The cumulative bivariate normal distribution function
device inline float CBND(RegTemp* d_regData, float X , float y ,float rho)
{
d_regData->BVN = 0;
d_regData->NG = 1;
d_regData->LG = 3;
if (fabs(rho) < 0.3)
{
d_regData->NG = 1;
d_regData->LG = 3;
}
else if( fabs(rho) < 0.75)
{
d_regData->NG = 2;
d_regData->LG = 6;
}
else
{
d_regData->NG = 3;
d_regData->LG = 10;
}
d_regData->h = -X;
d_regData->k = -y;
d_regData->hk = d_regData->h * d_regData->k;
if(fabs(rho) < 0.925 )
{
if( fabs(rho) > 0 )
{
d_regData->hs = (d_regData->h * d_regData->h + d_regData->k * d_regData->k) / ftwo;
d_regData->asr = ArcSin(rho);
for (d_regData->i = 1;d_regData->i<= d_regData->LG;d_regData->i++)
for( d_regData->ISs = -1;d_regData->ISs<=1;d_regData->ISs=d_regData->ISs+2)
{
d_regData->sn = sin(d_regData->asr * (d_regData->ISs * __XX[d_regData->i][ d_regData->NG] + 1) / ftwo);
d_regData->BVN = d_regData->BVN + __W[d_regData->i][ d_regData->NG] * expf((d_regData->sn * d_regData->hk - d_regData->hs) / (1 - d_regData->sn * d_regData->sn));
}//Next ISs
//Next i
d_regData->BVN = d_regData->BVN * d_regData->asr / (4.0 * PI);
}
d_regData->BVN = d_regData->BVN + CND(d_regData, -d_regData->h) * CND(d_regData, -d_regData->k);
}
else
{
if( rho < 0 )
{
d_regData->k = -d_regData->k;
d_regData->hk = -d_regData->hk;
}
if( fabs(rho) < 1)
{
d_regData->Ass = (fone - rho) * (fone + rho);
d_regData->A = sqrtf(d_regData->Ass);
d_regData->bs = powf( (d_regData->h - d_regData->k) , 2);
d_regData->c = (4.0 - d_regData->hk) / 8.0;
d_regData->d = (12.0 - d_regData->hk) / 16.0;
d_regData->asr = -(d_regData->bs / d_regData->Ass + d_regData->hk) / ftwo;
//} -1
if( d_regData->asr > -100 ) d_regData->BVN = d_regData->A * expf(d_regData->asr) * (fone - d_regData->c * (d_regData->bs - d_regData->Ass) * (fone - d_regData->d * d_regData->bs / ffive) / 3.0 + d_regData->c * d_regData->d * d_regData->Ass * d_regData->Ass / ffive);
if( -d_regData->hk < 100 )
{
d_regData->b = sqrtf(d_regData->bs);
d_regData->BVN = d_regData->BVN - expf(-d_regData->hk / ftwo) * sqrtf(ftwo * PI) * CND(d_regData, -d_regData->b / d_regData->A) * d_regData->b * (fone - d_regData->c * d_regData->bs * (fone - d_regData->d * d_regData->bs / ffive) / 3.0);
}
d_regData->A = d_regData->A / ftwo;
for(d_regData->i = 1;d_regData->i<= d_regData->LG;d_regData->i++) //for(i = 1;i< LG
{
for (d_regData->ISs = -1;d_regData->ISs<=1 ;d_regData->ISs+=2)
{
d_regData->xs =powf( (d_regData->A * (d_regData->ISs * __XX[d_regData->i][d_regData->NG] + 1)) , 2);
d_regData->rs = sqrtf(fone - d_regData->xs);
d_regData->asr = -(d_regData->bs / d_regData->xs + d_regData->hk) / ftwo;
if( d_regData->asr > -100)
d_regData->BVN = d_regData->BVN + d_regData->A * __W[d_regData->i][ d_regData->NG] * expf(d_regData->asr) * (expf(-d_regData->hk * (fone - d_regData->rs) / (ftwo * (fone + d_regData->rs))) / d_regData->rs
- (fone + d_regData->c * d_regData->xs * (fone + d_regData->d * d_regData->xs)));
}
}
d_regData->BVN = -d_regData->BVN / (ftwo * PI);
}//0
if( rho > 0 )
{
//float max;
//if(h>k) max=h;
//else max=k;
d_regData->BVN = d_regData->BVN + CND(d_regData, - ( (d_regData->h>d_regData->k) ? d_regData->h : d_regData->k) );
}
else
{
d_regData->BVN = -d_regData->BVN;
if( d_regData->k > d_regData->h) d_regData->BVN = d_regData->BVN + CND(d_regData, d_regData->k) - CND(d_regData, d_regData->h);
}
//End if(
//End if(
}
return d_regData->BVN;
}
device inline float phi(RegTemp* d_regData, float S, float T, float gamma, float h, float i, float r, float b, float v)
{
//lambda = (-r + gamma * b + 0.5 * gamma * (gamma - 1.0) * (v *v)) * T;
d_regData->d = -(logf(S / h) + (b + (gamma - 0.5) * (v *v)) * T) / (v * sqrtf(T));
//kappa = 2.0 * b /(v *v) + (2.0 * gamma - 1);
return ( exp((-r + gamma * b + 0.5 * gamma * (gamma - 1.0) * (v *v)) * T) *
powf(S, gamma) *
( CND(d_regData, d_regData->d) - powf((i / S), 2.0 * b /(v *v) + (2.0 * gamma - 1))*
CND(d_regData, d_regData->d - 2.0 * logf(i / S) / (v * sqrtf(T)))
)
);
}
device inline float ksi(RegTemp* d_regData, float S ,float T2 ,float gamma , float h ,float I2 ,float I1 ,float t1 ,float r ,float b ,float v )
{
d_regData->rho = sqrtf(t1 / T2);
//lambda = -r + gamma * b + fzpfive * gamma * (gamma - fone) * ( v*v);
d_regData->kappa = ftwo * b / (v *v) + (ftwo * gamma - fone);
return
expf(-r + gamma * b + fzpfive * gamma * (gamma - fone) * ( v*v) * T2) *
powf(S , gamma) *
( CBND(d_regData, -((log(S / I1) + (b + (gamma - fzpfive) * (v *v)) * t1) / (v * sqrtf(t1))),
-(log(S / h) + (b + (gamma - fzpfive) * (v *v)) * T2) / (v * sqrtf(T2)),
d_regData->rho) -
powf( (I2 / S) , d_regData->kappa) *
CBND(d_regData, -((log((I2 *I2) / (S * I1)) + (b + (gamma - fzpfive) * (v *v)) * t1) / (v * sqrtf(t1))),
-(log((I2 * I2) / (S * h)) + (b + (gamma - fzpfive) * (v *v)) * T2) / (v * sqrtf(T2)),
d_regData->rho) -
powf( (I1 / S) , d_regData->kappa) *
CBND(d_regData, -((log(S / I1) - (b + (gamma - fzpfive) * (v *v)) * t1) / (v * sqrtf(t1))),
-(log((I1 *I1) / (S * h)) + (b + (gamma - fzpfive) * (v *v)) * T2) / (v * sqrtf(T2)),
-d_regData->rho) +
powf((I1 / I2) , d_regData->kappa) *
CBND(d_regData, -((log((I2 *I2) / (S * I1)) - (b + (gamma - fzpfive) * (v *v)) * t1) / (v * sqrtf(t1))),
-(log(S * (I1 *I1 ) / (h * (I2 *I2))) + (b + (gamma - fzpfive) * (v *v)) * T2) / (v * sqrtf(T2)),
-d_regData->rho)
);
}
device inline float BSAmericanCallApprox2002PC(RegTemp* d_regData, float S,float X ,float T ,float r,float b , float v )
{
d_regData->t1 = fone / ftwo * (sqrtf(ffive) - 1) * T;
if( b >= r ) // Never optimal to exersice before maturity
return calcBSCallPrice(d_regData, S, X, T, r, b, v);
else
{
d_regData->Beta = (fone / ftwo - b / (v *v)) + sqrtf( powf((b /(v *v) - fone / ftwo), 2) + ftwo * r / (v *v));
d_regData->BInfinity = d_regData->Beta / (d_regData->Beta - 1) * X;
d_regData->m1=X;
d_regData->m2=r/(r-B)*X;
if(d_regData->m1>d_regData->m2) d_regData->B0=d_regData->m1; //d_regData->B0 = max(X, r / (r - B) * X);
else d_regData->B0=d_regData->m2;
d_regData->ht1 = -(b * d_regData->t1 + ftwo * v * sqrtf(d_regData->t1)) * (X *X) / ((d_regData->BInfinity - d_regData->B0) * d_regData->B0);
d_regData->ht2 = -(b * T + ftwo * v * sqrtf(T)) * (X *X) / ((d_regData->BInfinity - d_regData->B0) * d_regData->B0);
d_regData->I1 = d_regData->B0 + (d_regData->BInfinity - d_regData->B0) * (fone - expf(d_regData->ht1));
d_regData->I2 = d_regData->B0 + (d_regData->BInfinity - d_regData->B0) * (fone - expf(d_regData->ht2));
d_regData->alfa1 = (d_regData->I1 - X) * powf( d_regData->I1 , (-d_regData->Beta));
d_regData->alfa2 = (d_regData->I2 - X) * powf(d_regData->I2 , (-d_regData->Beta));
if(S >= d_regData->I2)
return S - X;
else
{
return ( d_regData->alfa2 * powf(S , d_regData->Beta) -
d_regData->alfa2 * phi(d_regData, S, d_regData->t1, d_regData->Beta, d_regData->I2, d_regData->I2, r, b, v) +
phi(d_regData, S, d_regData->t1, 1, d_regData->I2, d_regData->I2, r, b, v) -
phi(d_regData, S, d_regData->t1, 1, d_regData->I1, d_regData->I2, r, b, v) -
X * phi(d_regData, S, d_regData->t1, 0, d_regData->I2, d_regData->I2, r, b, v) +
X * phi(d_regData, S, d_regData->t1, 0, d_regData->I1, d_regData->I2, r, b, v) +
d_regData->alfa1 * phi(d_regData, S, d_regData->t1, d_regData->Beta, d_regData->I1, d_regData->I2, r, b, v) -
d_regData->alfa1 * ksi(d_regData, S, T, d_regData->Beta, d_regData->I1, d_regData->I2, d_regData->I1, d_regData->t1, r, b, v) +
ksi(d_regData, S, T, 1, d_regData->I1, d_regData->I2, d_regData->I1, d_regData->t1, r, b, v) -
ksi(d_regData, S, T, 1, X, d_regData->I2, d_regData->I1, d_regData->t1, r, b, v) -
X * ksi(d_regData, S, T, 0, d_regData->I1, d_regData->I2, d_regData->I1, d_regData->t1, r, b, v) +
X * ksi(d_regData, S, T, 0, X, d_regData->I2, d_regData->I1, d_regData->t1, r, b, v) );
}
}
}
// The cumulative bivariate normal distribution function
device inline void BSAmericanApprox2002(RegTemp* d_regData, float& call,float& put, float& S , float& X , float& T ,float& r ,float& b ,float& v )
{
call = BSAmericanCallApprox2002PC(d_regData, S, X, T, r, b, v);
// Use the Bjerksund and Stensland put-call transformation
put = BSAmericanCallApprox2002PC(d_regData, X, S, T, r-b, -b, v);
}
////////////////////////////////////////////////////////////////////////////////
//Process an array of optN options on GPU
////////////////////////////////////////////////////////////////////////////////
global void BSAmericanApprox2002GPU(float *d_CallResult, float *d_PutResult,
float *d_StockPrice, float *d_OptionStrike, float *d_OptionYears,
float *d_Riskfree, float *d_b, float *d_OptionVols, int optN)
{
//Thread index
const int tid = blockDim.x * blockIdx.x + threadIdx.x;
//Total number of threads in execution grid
const int THREAD_N = blockDim.x * gridDim.x;
//No matter how small is execution grid or how large OptN is,
//exactly OptN indices will be processed with perfect memory coalescing
for(int opt = tid; opt < optN; opt += THREAD_N)
{
RegTemp data = d_regData[tid];
BSAmericanApprox2002(
&data,
d_CallResult[opt],
d_PutResult[opt],
d_StockPrice[opt],
d_OptionStrike[opt],
d_OptionYears[opt],
d_Riskfree[opt],
d_b[opt],
d_OptionVols[opt]
);
}
}
#endif /* BSAMERICANAPPROX2002CD_KERNEL_CUH_ */
#include <cutil_inline.h>
#include
device constant float __XX[11][4] = {
{ 0,0,0,0 },
{ 0,-0.932469514203152,-0.981560634246719,-0.993128599185095 },
{ 0,-0.661209386466265,-0.904117256370475,-0.963971927277914 },
{ 0,-0.238619186083197,-0.769902674194305,-0.912234428251326 },
{ 0,0,-0.587317954286617,-0.839116971822219 },
{ 0,0,-0.36783149899818,-0.746331906460151 },
{ 0,0,-0.125233408511469,-0.636053680726515 },
{ 0,0,0,-0.510867001950827 },
{ 0,0,0,-0.37370608871542 },
{ 0,0,0,-0.227785851141645 },
{ 0,0,0,-0.0765265211334973 }};
device constant float __W[11][4] = {
{ 0,0,0,0 },
{ 0,0.17132449237917,0.0471753363865118,0.0176140071391521 },
{ 0,0.360761573048138,0.106939325995318,0.0406014298003869 },
{ 0,0.46791393457269,0.160078328543346,0.0626720483341091 },
{ 0,0,0.203167426723066,0.0832767415767048 },
{ 0,0,0.233492536538355,0.10193011981724 },
{ 0,0,0.249147045813403,0.118194531961518 },
{ 0,0,0,0.131688638449177 },
{ 0,0,0,0.142096109318382 },
{ 0,0,0,0.149172986472604 },
{ 0,0,0,0.152753387130726 }};
device constant float fone = 1.0;
device constant float ftwo = 2.0;
device constant float ffive = 5.0;
device constant float fzpfive = 0.5;
device constant float CND_c00 = 2.0;
device constant float CND_c01 = 7.07106781186547;
device constant float CND_c02 = 0.0352624965998911;
device constant float CND_c03 = 0.700383064443688;
device constant float CND_c04 = 6.37396220353165;
device constant float CND_c05 = 33.912866078383;
device constant float CND_c06 = 112.079291497871;
device constant float CND_c07 = 221.213596169931;
device constant float CND_c08 = 220.206867912376;
device constant float CND_c09 = 0.0883883476483184;
device constant float CND_c10 = 1.75566716318264;
device constant float CND_c11 = 16.064177579207;
device constant float CND_c12 = 86.7807322029461;
device constant float CND_c13 = 296.564248779674;
device constant float CND_c14 = 637.333633378831;
device constant float CND_c15 = 793.826512519948;
device constant float CND_c16 = 440.413735824752;
device constant float CND_c17 = 0.65;
device constant float CND_c18 = 4.0;
device constant float CND_c19 = 3.0;
device constant float CND_c20 = 2.0;
device constant float CND_c21 = 1.0;
device constant float CND_c22 = 2.506628274631;
device constant float CND_c23 = 1.0;
device constant const float PI = 3.141592653589793;
typedef struct align(16)
{
//CND
float y , Exponential , SumA, SumB ;
float cnd;
//calcBSCallPrice
float d1;
//CBND
int i , ISs ;
int LG , NG ;
float h , k , hk , hs ,BVN ,Ass , asr , sn;
float A ,b , bs , c , d ;
float xs , rs;
//ksi
float rho , kappa;
//phi
//float d;
//BSAmericanCallApprox2002PC
float BInfinity , B0 ;
float ht1 , ht2 , I1 , I2;
float alfa1 , alfa2 , Beta , t1 ;
float m1, m2;
} RegTemp;
device RegTemp d_regData[32*16];
#include “BSAmericanApprox2002CD_kernel.cuh”
extern “C”
void BSCD_Init(int algType, unsigned int len, float *S, float *X, float *T, float *r, float b, float v,
float calls,float puts)
{
int dimGrid = 32;
int dimBlock = 16;
int OPT_SZ = len * sizeof(float);
//'d_' prefix - GPU (device) memory space
float
//Results calculated by GPU
*d_CallResult,
*d_PutResult,
//GPU instance of input data
*d_StockPrice,
*d_OptionStrike,
*d_OptionYears,
*d_OptionRate,
*d_OptionB,
*d_OptionVols;
cudaSetDevice( cutGetMaxGflopsDeviceId() );
cutilSafeCall( cudaMalloc((void **)&d_CallResult, OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_PutResult, OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_StockPrice, OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_OptionStrike, OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_OptionYears, OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_OptionRate, OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_OptionB, OPT_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_OptionVols, OPT_SZ) );
//Copy options data to GPU memory for further processing
cutilSafeCall( cudaMemcpy(d_StockPrice, S, OPT_SZ, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_OptionStrike, X, OPT_SZ, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_OptionYears, T, OPT_SZ, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_OptionRate, r, OPT_SZ, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_OptionB, b, OPT_SZ, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_OptionVols, v, OPT_SZ, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaThreadSynchronize() );
for(int i = 0; i < 1; i++) //one iteration
{
BSAmericanApprox2002GPU<<<dimGrid, dimBlock>>>(
//d_regData,
d_CallResult,
d_PutResult,
d_StockPrice,
d_OptionStrike,
d_OptionYears,
d_OptionRate,
d_OptionB,
d_OptionVols,
len
);
}
cutilSafeCall( cudaThreadSynchronize() );
//Read back GPU results to compare them to CPU results
cutilSafeCall( cudaMemcpy(calls, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost) );
cutilSafeCall( cudaMemcpy(puts, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost) );
//Release memory
cutilSafeCall( cudaFree(d_PutResult) );
cutilSafeCall( cudaFree(d_CallResult) );
cutilSafeCall( cudaFree(d_StockPrice) );
cutilSafeCall( cudaFree(d_OptionStrike) );
cutilSafeCall( cudaFree(d_OptionYears) );
cutilSafeCall( cudaFree(d_OptionRate) );
cutilSafeCall( cudaFree(d_OptionB) );
cutilSafeCall( cudaFree(d_OptionVols) );
cudaThreadExit();
}