Hi,
I wanted to change the NVIDIA SDK’s BINOMIAL OPTIONS kernel so that each thread can handle one option.
Earlier, one block was calculating one option.
Can anybody help me?
[codebox]// Binomial-tree tiling constants: TIME_STEPS tree levels are collapsed per
// shared-memory pass, CACHE_DELTA is the apron consumed per pass, and
// CACHE_STEP is the useful width of each CACHE_SIZE-wide shared-memory tile.
#define TIME_STEPS 16
#define CACHE_DELTA (2 * TIME_STEPS)
#define CACHE_SIZE (256)
#define CACHE_STEP (CACHE_SIZE - CACHE_DELTA)
// NUM_STEPS (defined elsewhere in the project) must be a multiple of
// CACHE_DELTA so the tree walk divides into whole tiles.
// BUG FIX: the original #if was never closed with #endif.
#if NUM_STEPS % CACHE_DELTA
#error Bad constants
#endif
//Preprocessed input option data
// Per-option constants precomputed on the host in binomialOptionsGPU():
// everything the kernel needs to walk the binomial tree without any
// transcendental math per step.
typedef struct{
real S;       // spot price (copied from TOptionData.S)
real X;       // strike price (copied from TOptionData.X)
real vDt;     // V * sqrt(T / NUM_STEPS) — per-step log-price increment
real puByDf;  // pseudo-probability of an up-move, pre-multiplied by the discount factor
real pdByDf;  // pseudo-probability of a down-move, pre-multiplied by the discount factor
} __TOptionData;
// Device global-memory buffers, filled from the host via
// cudaMemcpyToSymbol / read back via cudaMemcpyFromSymbol.
// BUG FIX: "device" is not a CUDA qualifier — these must be declared
// __device__ or they will not compile as device symbols.
static __device__ __TOptionData d_OptionData[MAX_OPTIONS];
static __device__ float d_CallValue[MAX_OPTIONS];
// One (NUM_STEPS + 16)-element lattice frame per option; the +16 is padding
// (as in the SDK sample — presumably for alignment, confirm against the SDK).
static __device__ real d_CallBuffer[MAX_OPTIONS * (NUM_STEPS + 16)];
////////////////////////////////////////////////////////////////////////////////
// Overloaded shortcut functions for different precision modes
////////////////////////////////////////////////////////////////////////////////
#ifndef DOUBLE_PRECISION
// Call-option payoff at expiry for leaf node i of the binomial tree:
// max(S * exp(vDt * (2*i - NUM_STEPS)) - X, 0), single precision.
// BUG FIX: "device" alone is not a qualifier — must be __device__.
__device__ inline float expiryCallValue(float S, float X, float vDt, int i){
    float d = S * expf(vDt * (2.0f * i - NUM_STEPS)) - X;
    return (d > 0) ? d : 0;
}
#else
// Double-precision variant, selected when DOUBLE_PRECISION is defined.
// BUG FIX: the original #ifndef was never closed; per the SDK sample the two
// definitions are alternatives (#ifndef / #else / #endif), otherwise a
// DOUBLE_PRECISION build would have no expiryCallValue at all.
__device__ inline double expiryCallValue(double S, double X, double vDt, int i){
    double d = S * exp(vDt * (2.0 * i - NUM_STEPS)) - X;
    return (d > 0) ? d : 0;
}
#endif
////////////////////////////////////////////////////////////////////////////////
// GPU kernel
////////////////////////////////////////////////////////////////////////////////
// Prices one option per thread block (the poster's one-option-per-thread goal
// would require a different design — this keeps the SDK's block-per-option
// scheme). Launch with blockDim.x == CACHE_SIZE threads on a 2D grid whose
// block count is >= optN; excess blocks exit via the bounds check.
// BUG FIX: "static global" is not valid CUDA — the kernel must be __global__.
static __global__ void binomialOptionsKernel(const unsigned int optN){
    // Double-buffered shared-memory tiles for the tree walk.
    __shared__ real callA[CACHE_SIZE];
    __shared__ real callB[CACHE_SIZE];

    // Flatten the 2D grid into a linear option index.
    const int BlockIndex = blockIdx.y * gridDim.x + blockIdx.x;
    // BlockIndex is uniform across the block, so this early return is taken
    // (or not) by every thread together and cannot strand a __syncthreads().
    if (BlockIndex >= optN) {
        return;
    }

    // Global-memory frame holding this option's lattice values.
    real *const d_Call = &d_CallBuffer[BlockIndex * (NUM_STEPS + 16)];
    const int tid = threadIdx.x;

    const real S      = d_OptionData[BlockIndex].S;
    const real X      = d_OptionData[BlockIndex].X;
    const real vDt    = d_OptionData[BlockIndex].vDt;
    const real puByDf = d_OptionData[BlockIndex].puByDf;
    const real pdByDf = d_OptionData[BlockIndex].pdByDf;

    // Compute the NUM_STEPS + 1 payoff values at the expiry date.
    for(int i = tid; i <= NUM_STEPS; i += CACHE_SIZE)
        d_Call[i] = expiryCallValue(S, X, vDt, i);

    // Walk down the binomial tree, collapsing CACHE_DELTA levels per tile.
    // Double-buffer (callA <-> callB) and synchronize to avoid
    // read-after-write hazards on the shared-memory cache.
    for(int i = NUM_STEPS; i > 0; i -= CACHE_DELTA)
        for(int c_base = 0; c_base < i; c_base += CACHE_STEP){
            // Start and end positions within the shared-memory cache.
            int c_start = min(CACHE_SIZE - 1, i - c_base);
            int c_end   = c_start - CACHE_DELTA;

            // Read data (with apron) into shared memory.
            __syncthreads();
            if(tid <= c_start)
                callA[tid] = d_Call[c_base + tid];

            // Two tree levels per iteration: A -> B, then B -> A.
            for(int k = c_start - 1; k >= c_end;){
                // Discounted expected value into the other buffer.
                __syncthreads();
                if(tid <= k)
                    callB[tid] = puByDf * callA[tid + 1] + pdByDf * callA[tid];
                k--;
                __syncthreads();
                if(tid <= k)
                    callA[tid] = puByDf * callB[tid + 1] + pdByDf * callB[tid];
                k--;
            }

            // Flush the non-apron part of the cache back to global memory.
            __syncthreads();
            if(tid <= c_end)
                d_Call[c_base + tid] = callA[tid];
        }

    // The root of the tree (the option value) now sits in callA[0].
    if(threadIdx.x == 0) d_CallValue[BlockIndex] = (float)callA[0];
}
//CudaVisualProfiler
////////////////////////////////////////////////////////////////////////////////
// Host-side interface to GPU binomialOptions
////////////////////////////////////////////////////////////////////////////////
// Host-side driver: precomputes per-option constants in double precision,
// uploads them to the d_OptionData symbol, launches one block per option on a
// near-square 2D grid, and reads the results back from d_CallValue.
static void binomialOptionsGPU(
    float *callValue,
    TOptionData *optionData,
    int optN
){
    printf("kernel execution..........");
    fflush(stdout);

    // Staging buffer for the precomputed per-option constants.
    static __TOptionData h_OptionData[MAX_OPTIONS];
    for(int i = 0; i < optN; i++){
        const double T = optionData[i].T;
        const double R = optionData[i].R;
        const double V = optionData[i].V;

        const double dt  = T / (double)NUM_STEPS;
        const double vDt = V * sqrt(dt);
        const double rDt = R * dt;
        // Per-step interest and discount factors.
        const double If = exp(rDt);
        const double Df = exp(-rDt);
        // Values and pseudo-probabilities of upward and downward moves.
        const double u  = exp(vDt);
        const double d  = exp(-vDt);
        const double pu = (If - d) / (u - d);
        const double pd = 1.0 - pu;
        const double puByDf = pu * Df;
        const double pdByDf = pd * Df;

        h_OptionData[i].S      = (real)optionData[i].S;
        h_OptionData[i].X      = (real)optionData[i].X;
        h_OptionData[i].vDt    = (real)vDt;
        h_OptionData[i].puByDf = (real)puByDf;
        h_OptionData[i].pdByDf = (real)pdByDf;
    }

    cutilSafeCall(cudaThreadSynchronize());
    cutilSafeCall( cudaMemcpyToSymbol(d_OptionData, h_OptionData, optN * sizeof(__TOptionData)) );

    // Near-square 2D grid, one block per option: floor(sqrt(optN)) columns and
    // enough rows to cover all optN options (the kernel bounds-checks the
    // excess blocks). BUG FIX: explicit cast for sqrt on an int, and clamp to
    // 1 so optN <= 1 cannot produce a zero grid dimension / divide-by-zero.
    int rootoptN = (int)sqrt((double)optN);
    if (rootoptN < 1) rootoptN = 1;
    dim3 theGrid(rootoptN, (optN + rootoptN - 1) / rootoptN);
    // BUG FIX: the original used typographic quotes (“ ”) in the string
    // literals below, which do not compile; plain ASCII quotes are required.
    printf("xdim=%d, ydim=%d\n", rootoptN, (optN + rootoptN - 1) / rootoptN);
    fflush(stdout);

    binomialOptionsKernel<<<theGrid, CACHE_SIZE>>>(optN);
    printf("Done with kernel call\n"); fflush(stdout);
    cutilCheckMsg("binomialOptionsKernel() execution failed.\n");

    cutilSafeCall(cudaThreadSynchronize());
    cutilSafeCall( cudaMemcpyFromSymbol(callValue, d_CallValue, optN * sizeof(float)) );
}[/codebox]