I have written an ode solver based on RKDP45 using PyCuda (python wrapper), but I am getting cuLaunchGrid error
on the last line of the code when I copy the calculation results back to the original memory. If I comment out the last line,
//copy the fifth order results to stv
for (ii=0;ii<3;ii++){stvl[ii] = v5[ii];}
everything works fine. If I don’t comment out the last line, but take out the entire calculation steps, that seems to work fine also.
So, it truly seems like a resource problem. (BTW, {{symbol}} in fun_dfdt are for python template engines for basic metaprogramming)
I have several questions:
- if I declared something like “int idx = threadIdx.x”, where is the variable idx stored to? My understanding is that it goes to
“Register” space, and if that got filled, then it goes to “Local” Memory space.
- What is the resource running out here?
Thanks.
__device__ void fun_dfdt(float *, float *);
__device__ float* add(float *, float , float *, float *);
__global__ void rkdp45(float *stv, float *h, float *myval)
{
//int idx = threadIdx.x;
//global error tolerance
float gbet = 0.0001; //in V of the membrane potential
//the fourth order and fifth order results
float v4[3]; float v5[3];
//vector index
int ii;
//shared memory management
extern __shared__ int sharedm[];
float *cstv = (float*)sharedm;
for (ii=0;ii<3;ii++){cstv[ii] = stv[ii];}
{
float temp[3];
float k1[3]; fun_dfdt(cstv, k1);
for (ii=0;ii<3;ii++){temp[ii] = cstv[ii]+1/5.*h[0]*k1[ii];}
float k2[3]; fun_dfdt(temp, k2);
for (ii=0;ii<3;ii++){temp[ii] = cstv[ii]+3/40.*h[0]*k1[ii]+9/40.*h[0]*k2[ii];}
float k3[3]; fun_dfdt(temp, k3);
for (ii=0;ii<3;ii++){temp[ii] = cstv[ii]+44/45.*h[0]*k1[ii]-56/15.*h[0]*k2[ii]+32/9.*h[0]*k3[ii];}
float k4[3]; fun_dfdt(temp, k4);
for (ii=0;ii<3;ii++){temp[ii] = cstv[ii]+19372/6561.*h[0]*k1[ii]-25360/2187.*h[0]*k2[ii]+64448/6561.*h[0]*k3[ii]-212/729.*h[0]*k4[ii];}
float k5[3]; fun_dfdt(temp, k5);
for (ii=0;ii<3;ii++){temp[ii] = cstv[ii]+9017/3168.*h[0]*k1[ii]-355/33.*h[0]*k2[ii]+46732/5247.*h[0]*k3[ii]+49/176.*h[0]*k4[ii]-5103/18656.*h[0]*k5[ii];}
float k6[3]; fun_dfdt(temp, k6);
for (ii=0;ii<3;ii++){temp[ii] = cstv[ii]+35/384.*h[0]*k1[ii]+500/1113.*h[0]*k3[ii]+125/192.*h[0]*k4[ii]-2187/6784.*h[0]*k5[ii]+11/84*h[0]*k6[ii];}
float k7[3]; fun_dfdt(temp, k7);
for (ii=0;ii<3;ii++){v4[ii]=5179/57600.*k1[ii]+7571/16695.*k3[ii]+393/640.*k4[ii]-92097/339200.*k5[ii]+187/2100.*k6[ii]+1/40.*k7[ii];}
for (ii=0;ii<3;ii++){v5[ii]=35/384.*k1[ii]+500/1113.*k3[ii]+125/192.*k4[ii]-2187/6784.*k5[ii]+11/84*k6[ii];}
}
//give the next time step
h[0] = __powf((gbet/(v5[0]-v4[0])),1/4)*h[0];
//copy the fifth order results to stv
for (ii=0;ii<3;ii++){stvl[ii] = v5[ii];}
}
__device__ void fun_dfdt(float *stv, float *dfdt)
{
dfdt[0] = (stv[1]-stv[2]-(stv[0]-1*{{cEl}}))/{{ctaum}};
dfdt[1] = -stv[1]/{{ctaue}};
dfdt[2] = -stv[2]/{{ctaui}};
}