Hello,

I am currently implementing a linear interpolation in CUDA with the following code:

**global** void linearInterpolation(float* data, float* x_in, int lx_data) {

```
int j = threadIdx.x + blockDim.x * blockIdx.x;
x_in[j] = singleInterp(data, x_in[j], lx_data);
```

}

**device** float singleInterp(float* data, float x, int lx_data) {

```
int i =0;
float res = data[0];
while(data[i] <= x) {
i = i+1;
}
if (data[i] > x && i>0 && i<lx_data ) {
float p = __fdividef(data[i + lx_data] - data[i+lx_data-1],(data[i] - data[i-1]));
res = __fmaf_rn(p,x-data[i-1],data[i+lx_data-1]) ;
return res;
}
if (i == lx_data)
res = data[lx_data];
return res;
```

}

I am using the function feval from matlab to do the computation. I would like to optimize this code but I have not a lot of ideas. Could someone please help me on this issue? I could beat the function interp1 from Matlab only for some very huge numbers of interpolations.

I have another question regarding the way the threads are used in CUDA. Let’s say the following code which computes a Monte Carlo Simulation:

**device** float priceValue(float dt, float drift, float volatility,int j) {

```
unsigned int seed = (unsigned int) j;
curandState_t state;
curand_init(seed,0,0, &state);
float randomWalk = curand_normal(&state);
while (abs(randomWalk)> 1) {
randomWalk = curand_normal(&state);
}
return exp((drift - volatility*volatility*0.5)*dt + randomWalk*volatility*sqrt(dt));
```

}

**global** void Simulation(float* matrice, int nbreSimulation, int nPaths, float S0, float T, float drift, float volatility) {

int lx = (int)sqrt((float)sizeof(volatility)/sizeof(float));

float dt = T/nPaths;

int i = threadIdx.x + blockDim.x * blockIdx.x;

int j = threadIdx.y + blockDim.y * blockIdx.y;

int index = i + j*nbreSimulation;

if (index >= nbreSimulation)

matrice[index] = matrice[index - nbreSimulation]*priceValue(dt, drift,volatility , index);

}

And the following execution code :

spmd

kernelSimulation = parallel.gpu.CUDAKernel(‘Cuda_functions.ptx’, ‘Cuda_functions.cu’, ‘Simulation’);

kernelSimulation.ThreadBlockSize = 512;

nbreSimulation = 5;

nPaths = 1000;

S0 = 100;

T = 1.5;

drift = 0.05;

volatility = 0.25; % abs(gpuArray.randn(10,2,‘single’))/10;

matrice = S0*gpuArray.ones(nbreSimulation,nPaths, ‘single’);

[A] = feval(kernelSimulation,matrice, nbreSimulation, nPaths, S0,T, drift, volatility);

traj = gather(A);

end

The result is that the computations have not been done on the whole matrix (only the 511 st values of the matrix have been changed ) because of the number of threads that I have defined. How should I proceed to use for example only 512 threads on a 5x1000 matrix?

Many thanks in advance for your help