2-dimensional grid

Hi,
Can anybody help me convert the CUDA SDK's binomialOptions one-dimensional kernel into a two-dimensional kernel, so that I can handle more than 65535 options?

the kernel is called like this:
binomialOptionsKernel<<<optN, CACHE_SIZE>>>();
where optN = number of options, CACHE_SIZE=256

the kernel function is defined like this:
[codebox]static __global__ void binomialOptionsKernel(){
    __shared__ real callA[CACHE_SIZE];
    __shared__ real callB[CACHE_SIZE];
    //Global memory frame for current option (thread block)
    real *const d_Call = &d_CallBuffer[blockIdx.x * (NUM_STEPS + 16)];

    const int tid = threadIdx.x;
    const real      S = d_OptionData[blockIdx.x].S;
    const real      X = d_OptionData[blockIdx.x].X;
    const real    vDt = d_OptionData[blockIdx.x].vDt;
    const real puByDf = d_OptionData[blockIdx.x].puByDf;
    const real pdByDf = d_OptionData[blockIdx.x].pdByDf;

    //Compute values at expiry date
    for(int i = tid; i <= NUM_STEPS; i += CACHE_SIZE)
        d_Call[i] = expiryCallValue(S, X, vDt, i);

    //Walk down binomial tree
    //So double-buffer and synchronize to avoid read-after-write hazards.
    for(int i = NUM_STEPS; i > 0; i -= CACHE_DELTA)
        for(int c_base = 0; c_base < i; c_base += CACHE_STEP){
            //Start and end positions within shared memory cache
            int c_start = min(CACHE_SIZE - 1, i - c_base);
            int c_end   = c_start - CACHE_DELTA;

            //Read data(with apron) to shared memory
            __syncthreads();
            if(tid <= c_start)
                callA[tid] = d_Call[c_base + tid];

            //Calculations within shared memory
            for(int k = c_start - 1; k >= c_end;){
                //Compute discounted expected value
                __syncthreads();
                if(tid <= k)
                    callB[tid] = puByDf * callA[tid + 1] + pdByDf * callA[tid];
                k--;

                //Compute discounted expected value
                __syncthreads();
                if(tid <= k)
                    callA[tid] = puByDf * callB[tid + 1] + pdByDf * callB[tid];
                k--;
            }

            //Flush shared memory cache
            __syncthreads();
            if(tid <= c_end)
                d_Call[c_base + tid] = callA[tid];
        }

    //Write the value at the top of the tree to destination buffer
    if(threadIdx.x == 0) d_CallValue[blockIdx.x] = (float)callA[0];
}[/codebox]

Thanks in advance

First, replace optN with a two-dimensional grid, maybe like this:

int rootOptN = sqrt(optN);
dim3 theGrid(rootOptN, (optN + rootOptN - 1) / rootOptN);
binomialOptionsKernel<<<theGrid, CACHE_SIZE>>>();
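(Note that sqrt() returns a double here; an explicit cast such as int rootOptN = (int)sqrt((double)optN); keeps the compiler quiet, and truncating toward zero is fine because the second grid dimension is rounded up to cover the remainder.)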

Then, within the kernel, calculate the block number like so:

int blockNumber = blockIdx.y * gridDim.x + blockIdx.x;

The block number is basically a one-dimensional block ID which is not constrained to 65535. Then everywhere you see blockIdx.x, replace it with blockNumber.

Also, since you may have extra blocks, you will probably want to pass the original optN to your kernel, and then

if (blockNumber >= optN) {
    return;
}

Otherwise you may overflow your buffers and crash.
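(As a quick sanity check on the arithmetic: with, say, optN = 100000, rootOptN comes out to 316, so theGrid is 316 × 317 = 100172 blocks; the last 172 blocks fail the blockNumber < optN test and return immediately.)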

Thank you for the help, it really worked.

We were using this logic:

if (optN < 65536) {
    xdim = optN;
    ydim = 1;
} else {
    xdim = 65536;
    ydim = (optN + 65536 - 1) / 65536;
}

const dim3 dimGrid(xdim, ydim);
binomialOptionsKernel<<<dimGrid, CACHE_SIZE>>>();

This was the error I got:

Running GPU binomial tree…

cutilCheckMsg() CUTIL CUDA error: binomialOptionsKernel() execution failed.

in file <binomialOptions_kernel.cuh>, line 219 : invalid configuration argument.

Can you tell me why it is not working?

I can only go up to 161000 options. If I try more than that, it results in a segmentation fault. Can you tell me why this is happening?

I believe the limit is actually 65535 (largest allowed number), not 65536.
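That also explains the “invalid configuration argument” error: a grid x-dimension of 65536 is one past the limit, so the launch is rejected before it ever runs. If you would rather not hard-code 65535, you can query the limit at run time; roughly something like this (only a sketch, assuming device 0, that dimGrid has already been built, and that <cstdio>/<cstdlib> are available):

[codebox]cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);   // device 0 assumed

// maxGridSize[] holds the per-dimension grid limits
if (dimGrid.x > (unsigned)prop.maxGridSize[0] ||
    dimGrid.y > (unsigned)prop.maxGridSize[1]) {
    printf("Grid %u x %u exceeds device limits %d x %d\n",
           dimGrid.x, dimGrid.y, prop.maxGridSize[0], prop.maxGridSize[1]);
    exit(1);   // or clamp / reshape the grid instead of bailing out
}[/codebox]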

OK… right now I am using the code you sent, and it works fine for 161000 options.

For more than 161000 options I get a “segmentation fault” on the GPU, so could you tell me what I can do about that?

Thanks

Om

Split it into multiple kernel launches if you find yourself up against a hardware limit.
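For example, something along these lines (only a sketch: it assumes you give the kernel an extra offset argument and add it to the flattened block index before indexing d_OptionData, d_CallBuffer and d_CallValue):

[codebox]// Process the options in chunks that each fit in a one-dimensional grid.
const unsigned int CHUNK = 65535;   // max blocks per grid dimension

for (unsigned int offset = 0; offset < optN; offset += CHUNK) {
    unsigned int blocks = (optN - offset < CHUNK) ? (optN - offset) : CHUNK;
    // Hypothetical extra parameter: the kernel would use (offset + blockIdx.x)
    // as its option index instead of blockIdx.x alone.
    binomialOptionsKernel<<<blocks, CACHE_SIZE>>>(optN, offset);
}[/codebox]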

Hi, I am using a Quadro FX5800.

I am launching the kernel in this way.

[codebox]int rootOptN = sqrt(optN);
dim3 theGrid(rootOptN, (optN + rootOptN - 1) / rootOptN);
binomialOptionsKernel<<<theGrid, CACHE_SIZE>>>(optN);[/codebox]

This is my kernel function:

[codebox]static __global__ void binomialOptionsKernel(const unsigned int optN){
    __shared__ real callA[CACHE_SIZE];
    __shared__ real callB[CACHE_SIZE];

    const int BlockIndex = blockIdx.y * gridDim.x + blockIdx.x;
    //const int ThreadIndex = BlockIndex * blockDim.x + threadIdx.x;

    if (BlockIndex >= optN) {
        return;
    }

    //Global memory frame for current option (thread block)
    real *const d_Call = &d_CallBuffer[BlockIndex * (NUM_STEPS + 16)];

    const int tid = threadIdx.x;
    //const int thid = blockIdx.x * blockDim.x + threadIdx.x;
    //const int bid  = gridDim.x * blockDim.x;
    //const int bd   = blockDim.x * Grid;
    //const int bd   = blockDim.x * gridDim.x * gridDim.y;

    const real      S = d_OptionData[BlockIndex].S;
    const real      X = d_OptionData[BlockIndex].X;
    const real    vDt = d_OptionData[BlockIndex].vDt;
    const real puByDf = d_OptionData[BlockIndex].puByDf;
    const real pdByDf = d_OptionData[BlockIndex].pdByDf;

    //for(int thid = blockIdx.x * blockDim.x + threadIdx.x; thid <= bd; thid += bid){

    //Compute values at expiry date
    for(int i = tid; i <= NUM_STEPS; i += CACHE_SIZE)
        d_Call[i] = expiryCallValue(S, X, vDt, i);

    //Walk down binomial tree
    //So double-buffer and synchronize to avoid read-after-write hazards.
    for(int i = NUM_STEPS; i > 0; i -= CACHE_DELTA)
        for(int c_base = 0; c_base < i; c_base += CACHE_STEP){
            //Start and end positions within shared memory cache
            int c_start = min(CACHE_SIZE - 1, i - c_base);
            int c_end   = c_start - CACHE_DELTA;

            //Read data(with apron) to shared memory
            __syncthreads();
            if(tid <= c_start)
                callA[tid] = d_Call[c_base + tid];

            //Calculations within shared memory
            for(int k = c_start - 1; k >= c_end;){
                //Compute discounted expected value
                __syncthreads();
                if(tid <= k)
                    callB[tid] = puByDf * callA[tid + 1] + pdByDf * callA[tid];
                k--;

                //Compute discounted expected value
                __syncthreads();
                if(tid <= k)
                    callA[tid] = puByDf * callB[tid + 1] + pdByDf * callB[tid];
                k--;
            }

            //Flush shared memory cache
            __syncthreads();
            if(tid <= c_end)
                d_Call[c_base + tid] = callA[tid];
        }

    //Write the value at the top of the tree to destination buffer
    if(threadIdx.x == 0) d_CallValue[BlockIndex] = (float)callA[0];
}[/codebox]

I am not able to use a total number of blocks greater than 161000 (blocks = number of options, optN). If the total number of blocks is greater than 161000, I get a segmentation fault.

Can anybody tell me the solution for this?