The results of my CUDA code are absurds

Hello!!!

I wrote a code for computing the value of a vanilla call using Monte Carlo (you don’t have to know what it is to help me :) ). The results that I get are absurds in the sense that the prices are sometimes negatives, which is normaly impossible and sometimes very high (which is also very weird). I think that the mistake that I made is related to the sizes of the arrays. But I don’t know where it is. I have been trying to debug it for days without success. So I was hoping you could help me with it. Thanks in advance.

Here is the code:

#include <curand_kernel.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <curand.h>
#include <time.h>

#define NBLOCKS 128
#define NTHREADS 128
#define NUMSIM 1e05
#define MAX 10000000

global void inistate(unsigned int initialSeed, curandState * state)
{
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(initialSeed, tid, 0, & state[tid]);
}

device inline float getUdlValueAtMaturity(const float & S0, const float & drift, const float & diffusion, curandState & state)
{
return S0*expf(drift + diffusion * curand_normal(& state));
}

device inline float getPayoff(const float & ST, const float & K)
{
return max(ST - K, 0.0f);
}

struct Option
{
float S0;
float K;
float r;
float vol;
float T;
float premium;

enum OptionType
{
Call,
Put
};
OptionType optionType;
};

device float reduce_sum(double in)
{
shared float sdata[NTHREADS];

// - Write to shared memory
unsigned int ltid = threadIdx.x;

sdata[ltid] = in;
__syncthreads();

for (unsigned int s = blockDim.x / 2 ; s > 0 ; s >>= 1)
{
    if (ltid < s)
    {
        sdata[ltid] += sdata[ltid + s];
    }

    __syncthreads();
}

return sdata[0];

}

global void do_mc_simul(curandState * state, float * result, Option * option, const unsigned int n)
{
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int dim = gridDim.x * blockDim.x;

float drift = (option->r - 0.5f*powf(option->vol, 2.0f))*option->T;
float diffusion = option->vol * sqrtf(option->T);

curandState tidState = state[tid];

float sumPayoff = 0.0f;
float ST = 0.0f;

for(unsigned int i = tid; i < n; i += dim)
{
ST = getUdlValueAtMaturity(option->S0, drift, diffusion, tidState);
sumPayoff += getPayoff(ST, option->K);
}

sumPayoff = reduce_sum(sumPayoff);

if(threadIdx.x == 0)
{
result[dim] = sumPayoff;
}
}

int main()
{

srand(time(NULL));

dim3 block;
dim3 grid;
block.x = (unsigned int) NTHREADS;
grid.x  = (unsigned int) (NUMSIM + NTHREADS - 1) / (NTHREADS);

unsigned int initialSeed = (unsigned int) (rand() % (MAX + 1));

Option * devCallOption;
float * devResults = 0;

printf(“Seed : %d.\n”, initialSeed);

// Allocate memory on the CPU for the Call Option
Option * hostCallOption = (Option *) malloc(sizeof(Option));
float * hostResults = (float *) malloc(grid.x * sizeof(float));

curandState * devState = 0;
cudaMalloc((void **) & devState, grid.x * block.x * sizeof(curandState));
cudaMalloc((void **) & devResults, grid.x * sizeof(float));
cudaMalloc((void **) & devCallOption, sizeof(Option));

hostCallOption->S0 = 50.0f;
hostCallOption->K = 44.0f;
hostCallOption->r = 0.05f;
hostCallOption->vol = sqrtf(0.1f);
hostCallOption->T = 1.0f;
hostCallOption->premium = 0.0f;

printf(“S0 : %f. \n”, hostCallOption->S0);
printf(“K : %f. \n”, hostCallOption->K);
printf(“r : %f. \n”, hostCallOption->r);
printf(“vol : %f. \n”, hostCallOption->vol);
printf(“T : %f. \n”, hostCallOption->T);
printf(“premium : %f. \n”, hostCallOption->premium);

cudaMemcpy(devCallOption, hostCallOption, sizeof(Option), cudaMemcpyHostToDevice);

inistate<<<grid, block>>>(initialSeed, devState);

do_mc_simul<<<grid, block, block.x * sizeof(float)>>>(devState, devResults, devCallOption, NUMSIM);

cudaMemcpy(hostResults, devResults, grid.x * sizeof(float), cudaMemcpyDeviceToHost);

printf(“value of gridx : %d\n”, grid.x);

for(unsigned int i = 0; i < grid.x; ++i)
{
printf(“host result %d : %f\n”, i, hostResults[i]);
hostCallOption->premium += hostResults[i];
}

hostCallOption->premium /= NUMSIM;

hostCallOption->premium *= expf(-hostCallOption->r * hostCallOption->T);

printf(“The price of the call is %f.\n\n”, hostCallOption->premium);

cudaFree(devCallOption);
cudaFree(devResults);
cudaFree(devState);

free(hostCallOption);
free(hostResults);
}

Check all return codes from CUDA functions for errors.

Could you be more specific? I don’t understand what you mean. Thanks.

cudaError_t err = cudaMemcpy(hostResults, devResults, grid.x * sizeof(float), cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
    fprintf(stderr, "cudaMemcpy returned %s\n", cudaGetErrorString(err));

and do that for every CUDA call.

It’s easy to do with the help of a little macro

#define CHECK_CUDA_ERROR(x) {cudaError_t err__ = (x); if (err__!=cudaSuccess) fprintf(stderr, #x " returned \"%s\"\n", cudaGetErrorString(err__));}

and wrapping all CUDA calls into it.

Compile your code with extra flags -g -G to add debug information and then run the cuda-memcheck and cuda-gdb programs to debug them. In my experience when I was accessing out of bonds I was often getting some huge negative numbers.