I am trying to parallelize this CPU code:
void dz_step(complex<double>* E, double* bess_roots, int i, int j)
{
complex<double> I(0., 1.);
double k_r = bess_roots[i] / Rmax;
double w = -dw*Nt/2 + dw*j;//-2*W0 + 4*W0*j/(Nt - 1);
double tmp = pow(w/(C*T0), 2) - pow(k_r/A0, 2);
//kz(ir,it)=(j*dsqrt((w(it)*(1.d0+refr4(it)))**2-(kr(ir)/a0*c*t0)**2)-j*w(it)*vg)/(c*t0)*k0*a0**2
if(tmp>=0.)
E[i*Nt+j] *= exp( -I*K0*A0*A0*sqrt( tmp ) *dz + I*K0*A0*A0*dz*w/(T0*C));
E[i*Nt + Nt - j] = conj(E[i*Nt+j]);
}
//some that I don't touch
for(int i = 0; i < N-1; ++i)
for(int h=Nt/2+1; h < Nt; ++h)
{
dz_step(&E_kw[0][0], bess_roots, i, h);
}
GPU version of the same code:
__global__ void dz_step(thrust::complex<double>* E, const double* bess_roots)
{
int i = threadIdx.x;
int j = blockIdx.x+Nt/2+1;
thrust::complex<double> I(0., 1.);
double k_r = bess_roots[i] / Rmax;
double w = -dw*Nt/2 + dw*j;//-2*W0 + 4*W0*j/(Nt - 1);
double tmp = pow(w/(C*T0), 2) - pow(k_r/A0, 2);
//kz(ir,it)=(j*dsqrt((w(it)*(1.d0+refr4(it)))**2-(kr(ir)/a0*c*t0)**2)-j*w(it)*vg)/(c*t0)*k0*a0**2
if(tmp>=0.)
E[i*Nt+j] *= exp( -I*K0*A0*A0*sqrt( tmp ) *dz + I*K0*A0*A0*dz*w/(T0*C));
E[i*Nt + Nt - j] = conj(E[i*Nt+j]);
}
//some that I don't touch
dz_step<<<dim3(Nt/2-1, 1), dim3(N-1, 1)>>>(dev_E_kw, dev_roots);
I do not show all code because it is long, I don’t fogget copy memory. Where I made a mistake?
Thanks everyone
ps: sorry for my English