Thank you for the fast reply.

Here the code, anyway I don’t have a programming background so the code may look ugly.

Well… without any explanation the code could be difficult to understand.

Let’s say that the polynomial that I solve is monotonic with the increase of time, so it

increase at each timestep.

“Prob_of_t” also increase depending on the solution of the aforementioned polynomial, so

the condition to end the loop should be verified.

In my computer it works until timestep=(0.5e-12)/vel, but it fail when timestep=(0.1e-12)/vel.

```
#include <cuda.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <cmath>
#include <curand_kernel.h>
using namespace std;
__global__ void setup_kernel(curandState *state, int nummod)
{
const int id=blockIdx.x*nummod;
curand_init(id, id, 0, &state[id]);
}
// our function is a third grade equation U^3 + bU^2 + cU + d = f(x)
__device__ float f(float b, float c, float d, float x) {
return d + x*(c + x*(b + x) );
}
// first derivative of the function 3U^2 + 2bU + c
__device__ float fprime(float b, float c, float x) {
return c + x*(2*b + 3*x);
}
// it calculate the solutions for the polinomial
__device__ bool nr_pol3(float b, float c, float d, float x0, float epsilon, float& x) {
const int maxIteration = 500;
x = x0;
float fx = f(b,c,d,x);
int numIterations = 0;
while (fabs(fx) > epsilon) {
float fpx = fprime(b,c,x);
++numIterations;
if (numIterations>maxIteration || fabs(fpx)<epsilon) {
return false;
}
x -= fx / fpx;
fx = f(b,c,d,x);
}
//cout<< "Number of iterations:" << numIterations <<endl;
return true;
}
// here we simply calculate the coefficient a,b,c of the equation U^3 +aU^2 +bU +c =0
//from the parameters
__device__ void coeff(float v,float time,float temp,float kc,float cl,float pl,float *sol)
{
float kbol, s, vt, SL, den, svt, b, c, d;
kbol=1.3806e-23;
s=kc*pl/(kbol*temp);
vt=v*time;
SL=s*cl;
den=-4*SL-4;
svt=s*vt;
b=(9.0+4.0*svt+8.0*SL)/den;
c=(-6.0-8.0*svt-4.0*SL)/den;
d=4.0*svt/den;
sol[0]=b;
sol[1]=c;
sol[2]=d;
}
__device__ float getFt(float a,float v,float time,float cl,float kc)
{return (v*time-a*cl)*kc;}
__device__ float prob_of_t(float koff,float deltax,float temp,float timestep,float mod,float F){
float kbol=1.3806e-23;
return mod*timestep*koff*exp(F*deltax/(kbol*temp));
}
__global__ void sim_unf(float vel, float kel, float modclength, float plength, float deltax, float koff, float temp, const int nummod, float *dev_a, curandState *state)
{
const int Idx=blockIdx.x*nummod;
//here we assigne the inizial contour length given by the length of the FOLDED molecule
float firstclength, clength, timestep, sim_time, phys_sol_U, abc[3];// unfolding_max_forces[nummod];
firstclength=0.5*(modclength);
clength=firstclength;
timestep=(0.5e-11)/vel;
sim_time=0.0;
phys_sol_U=0.0;
curandState localState = state[Idx];
float last_ft;
int numbmod=nummod;
while (numbmod>0)
{
//find the coefficients a,b,c of the equation u^3 +aU^2+bU+c=0 that are function of the parameters
coeff(vel,sim_time,temp,kel,clength,plength,abc);
//find the solutions with Newton solver
nr_pol3(abc[0],abc[1],abc[2],phys_sol_U,1e-6,phys_sol_U);
last_ft= getFt(phys_sol_U,vel,sim_time,clength,kel);
if ( curand_uniform(&localState) <=prob_of_t(koff,deltax,temp,timestep,numbmod,last_ft))
{
clength+=modclength;
numbmod-=1;
dev_a[Idx+numbmod]=last_ft;
}
sim_time+=timestep;
}
//no idea why ppl do that...
state[Idx]=localState;
}
int main(){
clock_t t1, t2;
t1=clock();
const int num_modules=8, num_curves=512;
const int numtotal=num_modules*num_curves;
float *unf_max_forces = (float*)malloc(numtotal*sizeof(float));
float *dev_a;
float vel=0.3e-6; //it is in m/s
float kel=0.05; //it is in N/m
float modclength=28e-9; //it is in m
float plength=0.3e-9; //it is in m
float temp=300.0; //it is in K
float dxinit=1.5e-10; //it is in m
float koff=0.37; //it is in s^-1
curandState *devStates;
cudaMalloc( (void **)&devStates, num_curves*sizeof(curandState) );
cudaMalloc ( (void **)&dev_a, numtotal*sizeof(float));
/* Setup prng states */
setup_kernel<<<num_curves, 1>>>(devStates,num_modules);
sim_unf<<<num_curves,1>>>(vel, kel, modclength, plength, dxinit, koff, temp, num_modules, dev_a, devStates);
cudaMemcpy(unf_max_forces,dev_a,numtotal*sizeof(float),cudaMemcpyDeviceToHost);
t2=clock();
float diff = ((float)t2 - (float)t1)/(float)CLOCKS_PER_SEC;
cout<<"Execution time for: "<<num_curves<<" curves: "<<diff<<" seconds"<<endl;
ofstream file;
file.open("dataout.txt");
for (int i=0; i< num_curves*num_modules;++i){
int num_semicol=i % num_modules;
for (int k=0;k<num_semicol;++k)
file<<";";
file<<1e12*unf_max_forces[i]<<endl;
}
file.close();
}
```