Hello, using single precision floating point algorithm accumulates big error after ~3000 iterations, but using double precision the same algorithm gives good results in cuda.

But using single precision same algorithm implemented in glsl/vulkan gives good results.

Here is full code: https://drive.google.com/open?id=1Q72ERuCLypzvRjAzSZ3a3bLD9k2JEGuW

```
__global__ void kernel_1(double *n_, double *m_, double *h_, double *I, double *V_YX, uint i, uint i_reg, double dt_sim, uint cellCountX, uint cellCountY) {
uint x = threadIdx.x + blockIdx.x * blockDim.x;
uint y = threadIdx.y + blockIdx.y * blockDim.y;
const double g_Na_max = 4, g_K_add = 0.002, g_L = 0, E_Na = 40, E_L = -60, Area = 7.6e-3;// mm2 3e-8*1e6,
double alpha_n = 0, beta_n = 0, alpha_m = 0, beta_m = 0, alpha_h = 0, beta_h = 0, g_Na = 0, g_K1 = 0, g_K2 = 0, i_Na = 0, i_K = 0, i_Leak = 0, i_ion = 0;
if (x < cellCountX && y < cellCountY) {
uint adr = y*cellCountX+x;
double V = V_YX[adr];
double n = n_[adr];
double m = m_[adr];
double h = h_[adr];
i_Leak = g_L*(V - E_L);
//------------potassium
alpha_n = .0003*(-V - 50) / (exp((-V - 50) / 10) - 1);
beta_n = .002*exp((-V - 90) / 80);
n = n + (alpha_n*(1 - n) - beta_n*n)*dt_sim;
g_K1 = .012*exp((-V - 90) / 50) + .00015*exp((V + 90) / 60);
g_K2 = .012*n * n * n * n;
i_K = (g_K1 + g_K2 + g_K_add)*(V + 100);
//------------ - sodium
alpha_h = .166*exp((-V - 90) / 20);
beta_h = 1 / (1 + exp((-V - 42) / 10));
h = h + (alpha_h*(1 - h) - beta_h*h)*dt_sim;
alpha_m = .1*(-V - 48) / (exp((-V - 48) / 15) - 1);
beta_m = .12*(V + 8) / (exp((V + 8) / 5) - 1);
m = m + (alpha_m*(1 - m) - beta_m*m)*dt_sim;
g_Na = m * m * m * h * g_Na_max;
i_Na = (g_Na + .0014)*(V - E_Na);
i_ion = (i_Na + i_K + i_Leak);
n_[adr] = n;
m_[adr] = m;
h_[adr] = h;
if (x > 0 && x < cellCountX-1 && y > 0 && y < cellCountY-1 ) {
double dx1 = V_YX[cellCountX*y + x - 1] - V;
double dx2 = V_YX[cellCountX*y + x + 1] - V;
double dy1 = V_YX[cellCountX*(y-1) + x] - V;
double dy2 = V_YX[cellCountX*(y+1) + x] - V;
double gjX = 125e-9f;
double gjY = 125e-9f/4.f;
I[adr] = -Area*i_ion + 1000 * ( gjX*dx1 + gjX*dx2 + gjY*dy1 + gjY * dy2 );
}
}
}
__global__ void kernel_2(double *I, double *V_YX, double *V_tYX_reg, uint i, uint i_reg, double dt_sim, double dt_reg, uint cellCountX, uint cellCountY) {
uint x = threadIdx.x + blockIdx.x * blockDim.x;
uint y = threadIdx.y + blockIdx.y * blockDim.y;
const double Cm = .12, Area = 7.6e-3;
if (x < cellCountX && y < cellCountY) {
uint adr = cellCountX*y + x;
V_tYX_reg[i_reg*cellCountX*cellCountY+adr] = V_YX[adr];
double V = V_YX[adr];
double up = 100 + 60;
double I_ext =( (i_reg > 100 && i_reg < 100 + 2) // I_ext Protokolas
|| (i_reg > up && i_reg < up + 2) )
&& (y > 32 - 2 && y < 32 + 2
&& x > 64 - 2 && x < 64 + 2 ) ? 0.08 : 0;
V_YX[adr] = V + dt_sim*(I_ext + I[adr]) / (Cm*Area);
}
}
```