Hello,
I am currently implementing a linear interpolation in CUDA with the following code:
// data layout assumed by the kernel: data[0 .. lx_data-1] holds the x samples,
// data[lx_data .. 2*lx_data-1] holds the corresponding y samples.
__device__ float singleInterp(float* data, float x, int lx_data)
{
    int i = 0;
    float res = data[0];
    // find the first sample x_i strictly greater than x (bounded to stay in range)
    while (i < lx_data && data[i] <= x) {
        i = i + 1;
    }
    if (i > 0 && i < lx_data && data[i] > x) {
        // slope between (x_{i-1}, y_{i-1}) and (x_i, y_i)
        float p = __fdividef(data[i + lx_data] - data[i + lx_data - 1],
                             data[i] - data[i - 1]);
        // y_{i-1} + p * (x - x_{i-1})
        res = __fmaf_rn(p, x - data[i - 1], data[i + lx_data - 1]);
        return res;
    }
    if (i == lx_data)
        res = data[lx_data];
    return res;
}

// one thread per query point; x_in is overwritten with the interpolated values
__global__ void linearInterpolation(float* data, float* x_in, int lx_data)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    x_in[j] = singleInterp(data, x_in[j], lx_data);
}
I launch the kernel from MATLAB with feval. I would like to optimize this code, but I do not have many ideas. Could someone please help me with this? So far I only beat MATLAB's interp1 for very large numbers of interpolation points.
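For reference, this is roughly how I call the kernel with feval (x_samples, y_samples and query_points are placeholders for my real inputs, and the block/grid sizes are just an example):

% Sketch of the MATLAB-side launch; x_samples, y_samples and query_points
% are placeholders for my real inputs.
kernelInterp = parallel.gpu.CUDAKernel('Cuda_functions.ptx', 'Cuda_functions.cu', 'linearInterpolation');

data = gpuArray(single([x_samples(:); y_samples(:)]));   % x values followed by y values
x_in = gpuArray(single(query_points(:)));

kernelInterp.ThreadBlockSize = 512;
kernelInterp.GridSize = ceil(numel(x_in) / 512);          % one thread per query point

[data_out, x_out] = feval(kernelInterp, data, x_in, numel(x_samples));
result = gather(x_out);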
I have another question regarding the way threads are used in CUDA. Consider the following code, which runs a Monte Carlo simulation:
#include <curand_kernel.h>

// One multiplicative step of a geometric Brownian motion; draws with |z| > 1
// are rejected, so the increment uses a truncated normal sample.
__device__ float priceValue(float dt, float drift, float volatility, int j)
{
    unsigned int seed = (unsigned int) j;
    curandState_t state;
    curand_init(seed, 0, 0, &state);
    float randomWalk = curand_normal(&state);
    while (fabsf(randomWalk) > 1.0f) {
        randomWalk = curand_normal(&state);
    }
    return expf((drift - volatility * volatility * 0.5f) * dt
                + randomWalk * volatility * sqrtf(dt));
}

// matrice is nbreSimulation x nPaths, stored column-major as MATLAB passes it;
// every element outside the first column is the element one column to its left
// multiplied by one step of the process.
__global__ void Simulation(float* matrice, int nbreSimulation, int nPaths,
                           float S0, float T, float drift, float volatility)
{
    float dt = T / nPaths;
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    int j = threadIdx.y + blockDim.y * blockIdx.y;
    int index = i + j * nbreSimulation;
    if (index >= nbreSimulation && index < nbreSimulation * nPaths)
        matrice[index] = matrice[index - nbreSimulation]
                         * priceValue(dt, drift, volatility, index);
}
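For context, each call to priceValue is meant to apply one step of a geometric Brownian motion, S(t+dt) = S(t) * exp((drift - volatility^2/2)*dt + volatility*sqrt(dt)*Z), where Z is the truncated normal draw from curand_normal.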
And the following execution code:
spmd
    kernelSimulation = parallel.gpu.CUDAKernel('Cuda_functions.ptx', 'Cuda_functions.cu', 'Simulation');
    kernelSimulation.ThreadBlockSize = 512;
    nbreSimulation = 5;
    nPaths = 1000;
    S0 = 100;
    T = 1.5;
    drift = 0.05;
    volatility = 0.25; % abs(gpuArray.randn(10,2,'single'))/10;
    matrice = S0*gpuArray.ones(nbreSimulation, nPaths, 'single');
    [A] = feval(kernelSimulation, matrice, nbreSimulation, nPaths, S0, T, drift, volatility);
    traj = gather(A);
end
The result is that the computation is not done on the whole matrix (only the first 511 values of the matrix are changed) because of the number of threads I have defined. How should I proceed to cover, for example, a 5x1000 matrix with only 512 threads?
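To make the question concrete: as far as I understand, GridSize stays at its default of [1 1 1], so only ThreadBlockSize = 512 threads are launched in total. I could probably launch more blocks, for example like this, but that uses one thread per element rather than only 512 threads:

% Guess: launch enough blocks to give every one of the 5*1000 elements its own thread.
kernelSimulation.ThreadBlockSize = 512;
kernelSimulation.GridSize = ceil(nbreSimulation*nPaths / 512);   % = ceil(5000/512) = 10 blocks

What I would really like to know is whether I can instead keep, say, 512 threads in total and have each thread loop over several elements of the matrix inside the kernel.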
Many thanks in advance for your help