problem related to grid monte carlo kernel

Hi,

I want to change the NVIDIA CUDA SDK's Monte Carlo kernel so that it can handle lakhs (hundreds of thousands) of options, initially using one block with a single thread, so that later I can change the number of blocks without changing the number of options.

In the Monte Carlo sample, one of the two kernel functions, MonteCarloKernel() or MonteCarloOneBlockPerOption(), is used, based on the condition doMultiBlock = (plan->pathN / plan->optionCount) >= 8192, where pathN = 1 << 18 and optionCount = 256.

[codebox]// Host-side launch of the Monte Carlo kernels.
// doMultiBlock selects between two strategies:
//   - several blocks per option: MonteCarloKernel writes per-thread partial
//     sums into plan->d_Buffer, then MonteCarloReduce is launched on that
//     same buffer with one block per option;
//   - one block per option: MonteCarloOneBlockPerOption.
// Every launch is followed by cutilCheckMsg so a failed launch is reported.
if(doMultiBlock){
    // 64 blocks per option when there are fewer than 16 options, otherwise 16
    const int blocksPerOption = (plan->optionCount < 16) ? 64 : 16;
    // Partial sums per option = total number of threads working on one option
    const int          accumN = THREAD_N * blocksPerOption;
    // grid.x: block index within one option; grid.y: option index
    // NOTE(review): gridDim.y is limited to 65535, so a single launch of this
    // form cannot cover more than 65535 options.
    const dim3 gridMain(blocksPerOption, plan->optionCount);

    // NOTE: the first argument was corrupted in the original post by a stray
    // pasted fragment ("MonteCarloOneBlockPerOption<<<1, THREAD_N>>>(plan->optionCount,")
    // in the middle of the cast; it is restored here to match the
    // MonteCarloReduce call below.
    MonteCarloKernel<<<gridMain, THREAD_N>>>(
        (__TOptionValue *)plan->d_Buffer,
        plan->d_Samples,
        plan->pathN
    );
    cutilCheckMsg("MonteCarloKernel() execution failed\n");

    MonteCarloReduce<<<plan->optionCount, THREAD_N>>>(
        (__TOptionValue *)plan->d_Buffer,
        accumN
    );
    cutilCheckMsg("MonteCarloReduce() execution failed\n");
}else{
    MonteCarloOneBlockPerOption<<<plan->optionCount, THREAD_N>>>(
        plan->d_Samples,
        plan->pathN
    );
    cutilCheckMsg("MonteCarloOneBlockPerOption() execution failed\n");
}[/codebox]

The kernel function MonteCarloKernel() is:

[codebox]// Multi-block-per-option Monte Carlo kernel.
//
// Launch layout: 2D grid, 1D blocks.
//   blockIdx.y selects the option (index into d_OptionData, defined elsewhere);
//   the gridDim.x blocks along x share the work for that option.
//
// Parameters:
//   d_Buffer  - output; each thread stores its partial sums at
//               d_Buffer[optionIndex * accumN + iSum], so the buffer must hold
//               at least gridDim.y * gridDim.x * blockDim.x elements.
//   d_Samples - input samples; elements [0, pathN) are read.
//   pathN     - number of samples (paths) to process.
//
// No shared memory is used and no reduction is done here: the per-thread
// partial sums still have to be combined afterwards.
static __global__ void MonteCarloKernel(
    __TOptionValue *d_Buffer,
    float *d_Samples,
    int pathN
){
    const int optionIndex = blockIdx.y;
    const real        S = d_OptionData[optionIndex].S;
    const real        X = d_OptionData[optionIndex].X;
    const real    MuByT = d_OptionData[optionIndex].MuByT;
    const real VBySqrtT = d_OptionData[optionIndex].VBySqrtT;

    //One thread per partial integral
    const int   iSum = blockIdx.x * blockDim.x + threadIdx.x;
    //Total number of threads working on this option (the loop stride)
    const int accumN = blockDim.x * gridDim.x;

    //Cycle through the entire samples array:
    //derive end stock price for each path
    //accumulate into intermediate global memory array
    __TOptionValue sumCall = {0, 0};
    for(int i = iSum; i < pathN; i += accumN){
        real              r = d_Samples[i];
        real      callValue = endCallValue(S, X, r, MuByT, VBySqrtT);
        //Expected accumulates the values, Confidence the squared values
        sumCall.Expected   += callValue;
        sumCall.Confidence += callValue * callValue;
    }

    //Threads that received no samples (iSum >= pathN) still store {0, 0}
    d_Buffer[optionIndex * accumN + iSum] = sumCall;
}

[/codebox]

The kernel function MonteCarloOneBlockPerOption() is:

[codebox]// One-block-per-option Monte Carlo kernel.
//
// Launch layout: 1D grid, 1D blocks.
//   blockIdx.x selects the option (index into d_OptionData and d_CallValue,
//   both defined elsewhere).
//
// Parameters:
//   d_Samples - input samples; elements [0, pathN) are read.
//   pathN     - number of samples (paths) to process.
//
// Shared memory: two static arrays of SUM_N (= THREAD_N) elements of type real.
// Result: thread 0 of each block writes the reduced sums to
// d_CallValue[optionIndex].
//
// NOTE(review): sumReduce is instantiated with THREAD_N as its block-size
// template argument; presumably the kernel must be launched with
// blockDim.x == THREAD_N for the reduction to be correct - confirm against
// sumReduce's definition before launching with fewer threads.
static __global__ void MonteCarloOneBlockPerOption(
    float *d_Samples,
    int pathN
){
    const int SUM_N = THREAD_N;
    __shared__ real s_SumCall[SUM_N];
    __shared__ real s_Sum2Call[SUM_N];

    const int optionIndex = blockIdx.x;
    const real        S = d_OptionData[optionIndex].S;
    const real        X = d_OptionData[optionIndex].X;
    const real    MuByT = d_OptionData[optionIndex].MuByT;
    const real VBySqrtT = d_OptionData[optionIndex].VBySqrtT;

    //Cycle through the entire samples array:
    //derive end stock price for each path
    //accumulate partial integrals into intermediate shared memory buffer
    //The outer loop strides by blockDim.x, so all SUM_N accumulator slots are
    //filled even if the block has fewer than SUM_N threads.
    for(int iSum = threadIdx.x; iSum < SUM_N; iSum += blockDim.x){
        __TOptionValue sumCall = {0, 0};
        for(int i = iSum; i < pathN; i += SUM_N){
            real              r = d_Samples[i];
            real      callValue = endCallValue(S, X, r, MuByT, VBySqrtT);
            sumCall.Expected   += callValue;
            sumCall.Confidence += callValue * callValue;
        }
        s_SumCall[iSum]  = sumCall.Expected;
        s_Sum2Call[iSum] = sumCall.Confidence;
    }

    //Reduce shared memory accumulators
    //and write final result to global memory
    //NOTE(review): no __syncthreads() is issued here between the shared-memory
    //writes above and the reduction; presumably sumReduce synchronizes
    //internally - confirm.
    sumReduce<real, SUM_N, THREAD_N>(s_SumCall, s_Sum2Call);
    if(threadIdx.x == 0){
        __TOptionValue t = {s_SumCall[0], s_Sum2Call[0]};
        d_CallValue[optionIndex] = t;
    }
}[/codebox]

Can anyone give me some ideas on how to make these changes?

Thank you in advance.