Hi,

I wanted to change the NVIDIA SDK’s BINOMIAL OPTIONS kernel so that each thread can handle one option.

Earlier, one block was calculating one option.

Can anybody help me?

[codebox]//Number of binomial tree levels reduced per shared-memory pass
#define TIME_STEPS 16
//Apron width: each pass consumes CACHE_DELTA tree levels
#define CACHE_DELTA (2 * TIME_STEPS)
//Threads per block == shared-memory cache width
#define CACHE_SIZE (256)
//Usable cache stride after reserving the apron
#define CACHE_STEP (CACHE_SIZE - CACHE_DELTA)

//The tiled tree walk below requires NUM_STEPS to be a multiple of CACHE_DELTA
#if NUM_STEPS % CACHE_DELTA
#error Bad constants
#endif

//Preprocessed input option data (host precomputes per-step factors once)
typedef struct{
    real S;       //spot price
    real X;       //strike price
    real vDt;     //volatility * sqrt(dt): per-step log-price move
    real puByDf;  //up-move pseudoprobability pre-multiplied by discount factor
    real pdByDf;  //down-move pseudoprobability pre-multiplied by discount factor
} __TOptionData;

//Device-side option parameters, one entry per option
static __device__ __TOptionData d_OptionData[MAX_OPTIONS];
//Device-side results, one call value per option
static __device__ float d_CallValue[MAX_OPTIONS];
//Global scratch buffer: one (NUM_STEPS + 16)-element frame per option
//(+16 pads each frame; frames are indexed by block in the kernel)
static __device__ real d_CallBuffer[MAX_OPTIONS * (NUM_STEPS + 16)];

////////////////////////////////////////////////////////////////////////////////
// Overloaded shortcut functions for different precision modes
////////////////////////////////////////////////////////////////////////////////
#ifndef DOUBLE_PRECISION
//European call payoff at expiry for tree node i: max(S * exp(vDt*(2i - N)) - X, 0)
__device__ inline float expiryCallValue(float S, float X, float vDt, int i){
    float d = S * expf(vDt * (2.0f * i - NUM_STEPS)) - X;
    return (d > 0) ? d : 0;
}
#else
//Double-precision variant of the expiry payoff
__device__ inline double expiryCallValue(double S, double X, double vDt, int i){
    double d = S * exp(vDt * (2.0 * i - NUM_STEPS)) - X;
    return (d > 0) ? d : 0;
}
#endif

////////////////////////////////////////////////////////////////////////////////
// GPU kernel
//
// Launch layout: one thread block per option (2D grid, linearized below),
// CACHE_SIZE threads per block. Blocks whose linear index exceeds optN exit
// immediately; the guard is block-uniform, so no thread is stranded at a
// later __syncthreads().
////////////////////////////////////////////////////////////////////////////////
static __global__ void binomialOptionsKernel(const unsigned int optN){
    //Double-buffered shared-memory cache for one tile of the tree level
    __shared__ real callA[CACHE_SIZE];
    __shared__ real callB[CACHE_SIZE];

    //Linearize the 2D grid: one block handles one option
    const int BlockIndex = blockIdx.y * gridDim.x + blockIdx.x;

    //Excess blocks (grid is rounded up to a near-square) do nothing
    if (BlockIndex >= optN) {
        return;
    }

    //Global memory frame for current option (thread block)
    real *const d_Call = &d_CallBuffer[BlockIndex * (NUM_STEPS + 16)];

    const int tid = threadIdx.x;
    const real S      = d_OptionData[BlockIndex].S;
    const real X      = d_OptionData[BlockIndex].X;
    const real vDt    = d_OptionData[BlockIndex].vDt;
    const real puByDf = d_OptionData[BlockIndex].puByDf;
    const real pdByDf = d_OptionData[BlockIndex].pdByDf;

    //Compute values at expiry date (leaf level, NUM_STEPS + 1 nodes)
    for(int i = tid; i <= NUM_STEPS; i += CACHE_SIZE)
        d_Call[i] = expiryCallValue(S, X, vDt, i);

    //Walk down binomial tree, CACHE_DELTA levels per pass.
    //Each node reads its two children (tid and tid + 1), so iterations
    //would race on the same array; double-buffer between callA/callB and
    //synchronize to avoid read-after-write hazards.
    for(int i = NUM_STEPS; i > 0; i -= CACHE_DELTA)
        for(int c_base = 0; c_base < i; c_base += CACHE_STEP){
            //Start and end positions within shared memory cache
            int c_start = min(CACHE_SIZE - 1, i - c_base);
            int c_end   = c_start - CACHE_DELTA;

            //Read data (with apron) to shared memory
            __syncthreads();
            if(tid <= c_start)
                callA[tid] = d_Call[c_base + tid];

            //Calculations within shared memory: reduce CACHE_DELTA levels,
            //two levels per loop iteration (A -> B, then B -> A)
            for(int k = c_start - 1; k >= c_end;){
                //Compute discounted expected value
                __syncthreads();
                if(tid <= k)
                    callB[tid] = puByDf * callA[tid + 1] + pdByDf * callA[tid];
                k--;

                //Compute discounted expected value
                __syncthreads();
                if(tid <= k)
                    callA[tid] = puByDf * callB[tid + 1] + pdByDf * callB[tid];
                k--;
            }

            //Flush shared memory cache back to the option's global frame
            __syncthreads();
            if(tid <= c_end)
                d_Call[c_base + tid] = callA[tid];
        }

    //Write the value at the top of the tree to destination buffer
    if(threadIdx.x == 0) d_CallValue[BlockIndex] = (float)callA[0];
}

//CudaVisualProfiler

////////////////////////////////////////////////////////////////////////////////
// Host-side interface to GPU binomialOptions
//
// callValue:  out, optN call prices (host buffer)
// optionData: in,  optN raw options (S, X, T, R, V)
// optN:       number of options; must be <= MAX_OPTIONS
////////////////////////////////////////////////////////////////////////////////
static void binomialOptionsGPU(
    float *callValue,
    TOptionData *optionData,
    int optN
){
    printf("kernel execution..........");
    fflush(stdout);
    static __TOptionData h_OptionData[MAX_OPTIONS];

    //Precompute per-step factors once per option on the host
    for(int i = 0; i < optN; i++){
        const double T = optionData[i].T;
        const double R = optionData[i].R;
        const double V = optionData[i].V;

        const double dt  = T / (double)NUM_STEPS;
        const double vDt = V * sqrt(dt);
        const double rDt = R * dt;
        //Per-step interest and discount factors
        const double If = exp(rDt);
        const double Df = exp(-rDt);
        //Values and pseudoprobabilities of upward and downward moves
        const double u  = exp(vDt);
        const double d  = exp(-vDt);
        const double pu = (If - d) / (u - d);
        const double pd = 1.0 - pu;
        const double puByDf = pu * Df;
        const double pdByDf = pd * Df;

        h_OptionData[i].S      = (real)optionData[i].S;
        h_OptionData[i].X      = (real)optionData[i].X;
        h_OptionData[i].vDt    = (real)vDt;
        h_OptionData[i].puByDf = (real)puByDf;
        h_OptionData[i].pdByDf = (real)pdByDf;
    }

    cutilSafeCall(cudaThreadSynchronize());
    cutilSafeCall( cudaMemcpyToSymbol(d_OptionData, h_OptionData, optN * sizeof(__TOptionData)) );

    //Build a near-square 2D grid with at least optN blocks
    //(one block per option; excess blocks exit early in the kernel).
    //Clamp to 1 so optN == 0 cannot divide by zero.
    int rootoptN = (int)sqrt((double)optN);
    if (rootoptN < 1) rootoptN = 1;
    dim3 theGrid(rootoptN, (optN + rootoptN - 1) / rootoptN);

    printf("xdim=%d, ydim=%d\n", rootoptN, (optN + rootoptN - 1) / rootoptN);
    fflush(stdout);

    binomialOptionsKernel<<<theGrid, CACHE_SIZE>>>(optN);

    printf("Done with kernel call\n"); fflush(stdout);

    cutilCheckMsg("binomialOptionsKernel() execution failed.\n");

    cutilSafeCall(cudaThreadSynchronize());
    cutilSafeCall( cudaMemcpyFromSymbol(callValue, d_CallValue, optN * sizeof(float)) );
}[/codebox]