Hi,
Can anybody help me convert the CUDA SDK binomialOptions sample's one-dimensional kernel into a two-dimensional one, so that I can process more than 65535 options (the maximum grid size in one dimension)?
The kernel is currently launched like this:
binomialOptionsKernel<<<optN, CACHE_SIZE>>>();
where optN is the number of options and CACHE_SIZE = 256.
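To get around the 65535 limit on the first grid dimension, I was thinking of spreading the options over a 2D grid and flattening the block index back into an option index inside the kernel. This is just a rough sketch of the launch side, not tested (GRID_X = 32768 is an arbitrary choice of mine, not from the SDK, and the kernel would need optN passed in so it can skip padded blocks):

    //Sketch only: spread optN thread blocks over a 2D grid
    const int GRID_X = 32768;                            //any value <= 65535
    dim3 grid(GRID_X, (optN + GRID_X - 1) / GRID_X);     //round up so grid.x * grid.y >= optN
    binomialOptionsKernel<<<grid, CACHE_SIZE>>>(optN);   //optN needed to guard the padded blocks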
The kernel itself is currently defined like this:
static __global__ void binomialOptionsKernel(){
    __shared__ real callA[CACHE_SIZE];
    __shared__ real callB[CACHE_SIZE];
    //Global memory frame for current option (thread block)
    real *const d_Call = &d_CallBuffer[blockIdx.x * (NUM_STEPS + 16)];

    const int     tid = threadIdx.x;
    const real      S = d_OptionData[blockIdx.x].S;
    const real      X = d_OptionData[blockIdx.x].X;
    const real    vDt = d_OptionData[blockIdx.x].vDt;
    const real puByDf = d_OptionData[blockIdx.x].puByDf;
    const real pdByDf = d_OptionData[blockIdx.x].pdByDf;

    //Compute values at expiry date
    for(int i = tid; i <= NUM_STEPS; i += CACHE_SIZE)
        d_Call[i] = expiryCallValue(S, X, vDt, i);

    //Walk down binomial tree
    //So double-buffer and synchronize to avoid read-after-write hazards.
    for(int i = NUM_STEPS; i > 0; i -= CACHE_DELTA)
        for(int c_base = 0; c_base < i; c_base += CACHE_STEP){
            //Start and end positions within shared memory cache
            int c_start = min(CACHE_SIZE - 1, i - c_base);
            int c_end   = c_start - CACHE_DELTA;

            //Read data (with apron) to shared memory
            __syncthreads();
            if(tid <= c_start)
                callA[tid] = d_Call[c_base + tid];

            //Calculations within shared memory
            for(int k = c_start - 1; k >= c_end;){
                //Compute discounted expected value
                __syncthreads();
                if(tid <= k)
                    callB[tid] = puByDf * callA[tid + 1] + pdByDf * callA[tid];
                k--;

                //Compute discounted expected value
                __syncthreads();
                if(tid <= k)
                    callA[tid] = puByDf * callB[tid + 1] + pdByDf * callB[tid];
                k--;
            }

            //Flush shared memory cache
            __syncthreads();
            if(tid <= c_end)
                d_Call[c_base + tid] = callA[tid];
        }

    //Write the value at the top of the tree to destination buffer
    if(threadIdx.x == 0) d_CallValue[blockIdx.x] = (float)callA[0];
}
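Inside the kernel I guess every use of blockIdx.x would be replaced by a flattened option index, something like this (again only my guess, only the changed lines shown, not tested):

    static __global__ void binomialOptionsKernel(int optN){
        //Flatten the 2D block index back to a single option index
        const int opt = blockIdx.y * gridDim.x + blockIdx.x;
        //The last row of the grid may contain extra blocks; skip them
        //(whole blocks return, so __syncthreads() should still be safe)
        if(opt >= optN)
            return;

        //...then use opt wherever blockIdx.x was used before, e.g.
        real *const d_Call = &d_CallBuffer[opt * (NUM_STEPS + 16)];
        const real S = d_OptionData[opt].S;
        //...
        if(threadIdx.x == 0) d_CallValue[opt] = (float)callA[0];
    }

Is that the right approach, or am I missing something (for example, should optN go into constant memory instead of being a kernel argument)?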
Thanks in advance