Mistake in parallelizing nested loops

I am trying to parallelize this CPU code:

/**
 * Updates one element of E and its mirrored partner in the same row.
 *
 * The element E[i*Nt + j] is multiplied by exp(phase) whenever the quantity
 * (w/(C*T0))^2 - (k_r/A0)^2 is non-negative; otherwise it is left as is.
 * In both cases the complex conjugate of E[i*Nt + j] is then stored in
 * E[i*Nt + Nt - j].
 *
 * @param E          flat array addressed as E[i*Nt + j]
 * @param bess_roots table read at index i and divided by Rmax
 * @param i          row index (also selects the entry of bess_roots)
 * @param j          column index; also enters w = -dw*Nt/2 + dw*j
 */
void dz_step(complex<double>* E, double* bess_roots, int i, int j)
{
	const complex<double> unit_i(0., 1.);

	// The element updated in place, and the slot that receives its conjugate.
	complex<double>& sample = E[i*Nt + j];
	complex<double>& mirrored = E[i*Nt + Nt - j];

	const double radial_k = bess_roots[i] / Rmax;
	const double omega = -dw*Nt/2 + dw*j; // earlier variant: -2*W0 + 4*W0*j/(Nt - 1)

	// Reference expression kept from the original source:
	// kz(ir,it)=(j*dsqrt((w(it)*(1.d0+refr4(it)))**2-(kr(ir)/a0*c*t0)**2)-j*w(it)*vg)/(c*t0)*k0*a0**2
	const double kz_squared = pow(omega/(C*T0), 2) - pow(radial_k/A0, 2);

	// Only samples with a non-negative value under the square root are multiplied.
	if (kz_squared >= 0.)
	{
		const complex<double> phase = -unit_i*K0*A0*A0*sqrt(kz_squared)*dz + unit_i*K0*A0*A0*dz*omega/(T0*C);
		sample *= exp(phase);
	}

	mirrored = conj(sample);
}


// (some other code that I did not change)

		for(int i = 0; i < N-1; ++i)
			for(int h=Nt/2+1; h < Nt; ++h)
			{
				dz_step(&E_kw[0][0], bess_roots, i, h);
			}

GPU version of the same code:

/**
 * Kernel form of the per-element update: each thread handles one (row, col) pair.
 *
 * threadIdx.x selects the row (and the entry of bess_roots);
 * blockIdx.x + Nt/2 + 1 selects the column. The element E[row*Nt + col] is
 * multiplied by exp(phase) when (w/(C*T0))^2 - (k_r/A0)^2 is non-negative,
 * and its complex conjugate is then written to E[row*Nt + Nt - col].
 *
 * There is no bounds check inside the kernel, so the launch configuration
 * must not produce indices outside the arrays.
 *
 * @param E          flat device array addressed as E[row*Nt + col]
 * @param bess_roots device table read at index row and divided by Rmax
 */
__global__ void dz_step(thrust::complex<double>* E, const double* bess_roots)
{
	const int row = threadIdx.x;
	const int col = blockIdx.x+Nt/2+1;

	const thrust::complex<double> unit_i(0., 1.);

	// The element updated in place, and the slot that receives its conjugate.
	thrust::complex<double>& sample = E[row*Nt + col];
	thrust::complex<double>& mirrored = E[row*Nt + Nt - col];

	const double radial_k = bess_roots[row] / Rmax;
	const double omega = -dw*Nt/2 + dw*col; // earlier variant: -2*W0 + 4*W0*j/(Nt - 1)

	// Reference expression kept from the original source:
	// kz(ir,it)=(j*dsqrt((w(it)*(1.d0+refr4(it)))**2-(kr(ir)/a0*c*t0)**2)-j*w(it)*vg)/(c*t0)*k0*a0**2
	const double kz_squared = pow(omega/(C*T0), 2) - pow(radial_k/A0, 2);

	// Only samples with a non-negative value under the square root are multiplied.
	if (kz_squared >= 0.)
	{
		const thrust::complex<double> phase = -unit_i*K0*A0*A0*sqrt(kz_squared)*dz + unit_i*K0*A0*A0*dz*omega/(T0*C);
		sample *= exp(phase);
	}

	mirrored = conj(sample);
}

// (some other code that I did not change)

{
	// Grid of Nt/2-1 blocks, each with N-1 threads: the kernel derives the
	// column from blockIdx.x and the row from threadIdx.x.
	const dim3 blocks_per_grid(Nt/2-1, 1);
	const dim3 threads_per_block(N-1, 1);
	dz_step<<<blocks_per_grid, threads_per_block>>>(dev_E_kw, dev_roots);
}

I am not showing all of the code because it is long; I did not forget to copy the memory. Where did I make a mistake?

Thanks everyone

ps: sorry for my English

I have replaced the line

E[i*Nt+j] *= exp(	-I*K0*A0*A0*sqrt(	tmp	)	*dz + I*K0*A0*A0*dz*w/(T0*C));

by

E[i*Nt+j] = -I*K0*A0*A0*sqrt(	tmp	)	*dz + I*K0*A0*A0*dz*w/(T0*C);

in both the GPU and the CPU code, and I got different results. I used the following code to print the results:

for(int m = 0; m < Nt; ++m)
		{
			fprintf(fd, "%.13le\t%.13le\t%.13le\n", -dw*Nt/2 + dw*m, real(E_kw[0][m]), imag(E_kw[0][m]));
		}

Then I plotted column 3 against column 1.
http://pavlov.96.lt/devtalk_895517_cpu.bmp
http://pavlov.96.lt/devtalk_895517_gpu.bmp
It’s very strange. When I wrote

E[i*Nt+j] = sqrt(tmp	);

the result on the GPU was the same as on the CPU.

I suppose that the mistake is in

-I*K0*A0*A0*sqrt(	tmp	)	*dz + I*K0*A0*A0*dz*w/(T0*C)

Where

thrust::complex<double> I(0., 1.);

double w = -dw*Nt/2 + dw*j;

// Constants used by the expressions quoted in this post. Units are not stated.
// NOTE(review): T0 = 27e-15 and C = 3e10 look like seconds and cm/s — confirm.
#define T0 (27.e-15)
#define A0 1.
#define Zmax 1.
#define nz 1
// Step used in the exponent: Zmax divided by nz (1./1 with the values above).
#define dz (Zmax/nz)
#define C (3e10)
// NOTE(review): PI is written with 12 significant digits only, fewer than a
// double can hold; consider a full-precision value if accuracy matters.
#define PI 3.14159265359
// Step of w in w = -dw*Nt/2 + dw*j. Tmax is not defined in this excerpt.
#define dw (2*PI/Tmax)

I have no idea why the arithmetic operations work differently on the GPU.

I also tried

E[i*Nt+j] *= exp(	-I*thrust::complex<double>(K0*A0*A0*sqrt(	tmp	)	*dz, 0.) + I*thrust::complex<double>(K0*A0*A0*dz*w/(T0*C), 0.));

but it did not help.

I set the -arch flag in nvcc, and that solved the problem.