I am simulating stock price paths using Geometric Brownian Motion on my GPU NVIDIA GeForce 840M, and my code successfully does this for a relatively small number of paths (150). However, when I try to increase the number of paths beyond this point, I get a stack overflow error. I have my error message below (and the code breaks at the first line of the main method so there is not much help from there) and I suspect the problem is one of the following:

- I am somehow creating an excessive number of threads or overloading 1 block with too many threads
- My array of doubles is too large to be on the stack

Does anyone know which of these might be the case based on the code below?

The problem is likely in the main method (when the num_blocks is declared) or create_paths_kernel method.

Error Message:

Unhandled exception at 0x00007FF790625A88 in Test.exe: 0xC00000FD: Stack overflow (parameters: 0x0000000000000001, 0x00000070F34F3000).

Code

#include <cuda_runtime.h>

#include “device_launch_parameters.h”

#include <stdio.h>

#include

#include

#include

#include<stdlib.h>

#include<assert.h>

#include <curand.h>

#include <cusolverDn.h>

#include <cublas_v2.h>

#include <cuda_runtime_api.h>

const int BLOCK_SIZE = 1024;

void createRandoms(double* numbers, const int N_PATHS, const int N_STEPS,const int N_NORMALS,double mu_normal,double sqrt_dt)

{

//Instantiate Random Number generator

curandGenerator_t curandGenerator;

//Create generated that use Mersenne Prime Twister

curandCreateGenerator(&curandGenerator, CURAND_RNG_PSEUDO_MTGP32);

//Long unsigned int will be used as a seed

curandSetPseudoRandomGeneratorSeed(curandGenerator, 1234ULL);

//Generate numbers

curandGenerateNormalDouble(curandGenerator,numbers, N_NORMALS,mu_normal, sqrt_dt);

}

**global** void create_paths_kernel(int N_STEPS,

int N_PATHS,

double dt,

double S0,

double r,

double sigma,

double mu,

double* rands,

double* paths)

{

// The path generated by this thread.

//blockDim.x = threads per block

int path = blockIdx.x*blockDim.x + threadIdx.x;

```
// Early exit.
if (path >= N_PATHS)
return;
// Intatiate price at t=0 as S0
double S = S0;
// Offset is used to ensure that each iteration in the for loop is on the current path
int offset = path;
// Generate several timesteps per thread
// Generate Asset Prices using Geometric Brownian Motion, i.e. dS_t = S_t(u*dt + sigma*dW_t)
// Discretized at each time step
for (int timestep = 0; timestep < N_STEPS; ++timestep,offset+=N_PATHS)
{
S = S + mu*S*dt + sigma*S*rands[offset];
paths[offset] = S;
}
__syncthreads();
```

}

int main()

{

```
//Market Parameters
const int N_PATHS = 1000; //Number of Paths
const int N_STEPS = 365; // Number of Steps (Days)
const int N_NORMALS = N_PATHS*N_STEPS;
double T = 1.0;
double K = 100.0;
double S0 = 100.0;
double sigma = 0.2;
double mu = 0.1;
double r = 0.05;
double dt = T / double(N_STEPS);
double sqrt_dt = sqrt(dt);
//double mu_normal = 0.0; (Brownian drift)
double e_neg_rt = exp(-r*dt); //Discount
int degree = 2;
double* rand_norms_d;
cudaMalloc(&rand_norms_d, N_NORMALS * sizeof(double));
double* paths_d;
cudaMalloc(&paths_d, N_NORMALS * sizeof(double));
double* op_values_d;
cudaMalloc(&op_values_d, N_NORMALS * sizeof(double));
createRandoms(rand_norms_d,N_PATHS,N_STEPS,N_NORMALS,0,sqrt_dt);
cudaFree(rand_norms_d);
//Get the total number of blocks required to have one thread per path
int numblocks = (int)ceil((double)N_PATHS/(double)BLOCK_SIZE);
//create asset price paths
create_paths_kernel<<<numblocks,BLOCK_SIZE>>>(N_STEPS,N_PATHS,dt,S0,r,sigma,mu,rand_norms_d,paths_d);
//get intrinsic values
//assign one thread to each element in the vector
double paths[N_NORMALS];
//Copies paths back to host
cudaMemcpy(paths, paths_d, sizeof(double)*N_NORMALS, cudaMemcpyDeviceToHost);
cudaDeviceReset();
return 0;
```

}