I am converting my original C code to CUDA for better performance, but the results of the C and CUDA versions do not match: the CUDA version produces incorrect values, even though every CUDA call passes my error checking without reporting a problem.
After the kernel launch, the values copied back to the host differ from what the original C code computes.
My code:
//************************** kernel call start **************************
for (int it = ishot * nt + 0; it < nt * (ishot + 1); it++)
{
    gpuErrchk(cudaMemcpy(d_vx,  vx[0],  size * sizeof(real_sim), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_vz,  vz[0],  size * sizeof(real_sim), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_sxx, sxx[0], size * sizeof(real_sim), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_szx, szx[0], size * sizeof(real_sim), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_szz, szz[0], size * sizeof(real_sim), cudaMemcpyHostToDevice));
    // ...and so on, the memcpy continues for the rest of the arrays...

    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid(nz2 / 16 + 1, nx2 / 16 + 1);
    kernel<<<blocksPerGrid, threadsPerBlock>>>(......);

    // similarly after the kernel launch
    gpuErrchk(cudaMemcpy(vx[0],  d_vx,  size * sizeof(real_sim), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(vz[0],  d_vz,  size * sizeof(real_sim), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(sxx[0], d_sxx, size * sizeof(real_sim), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(szx[0], d_szx, size * sizeof(real_sim), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(szz[0], d_szz, size * sizeof(real_sim), cudaMemcpyDeviceToHost));
    // ...and so on, the memcpy continues for the rest of the arrays...
} // end of time-step loop
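For reference, here is a sketch of the same loop with the transfers hoisted outside it. This is only a sketch under two assumptions that may not hold for my full code: the host arrays are not needed between time steps, and blockIdx.x should span the ix range (up to nx2) while blockIdx.y spans the iz range (up to nz2), matching the guard in the kernel. The names d_vx, vx, size, real_sim are reused from above.

// Sketch only: copy to the device once, run all time steps, copy back once.
// Assumes nothing on the host reads vx/vz/sxx/szx/szz between time steps.
gpuErrchk(cudaMemcpy(d_vx, vx[0], size * sizeof(real_sim), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_vz, vz[0], size * sizeof(real_sim), cudaMemcpyHostToDevice));
// ...remaining host-to-device copies...

dim3 threadsPerBlock(16, 16);
// Assumption: x dimension covers ix < nx2, y dimension covers iz < nz2.
dim3 blocksPerGrid((nx2 + 15) / 16, (nz2 + 15) / 16);

for (int it = ishot * nt; it < nt * (ishot + 1); it++)
{
    kernel<<<blocksPerGrid, threadsPerBlock>>>(......);
    gpuErrchk(cudaPeekAtLastError());
}
gpuErrchk(cudaDeviceSynchronize());

gpuErrchk(cudaMemcpy(vx[0], d_vx, size * sizeof(real_sim), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(vz[0], d_vz, size * sizeof(real_sim), cudaMemcpyDeviceToHost));
// ...remaining device-to-host copies...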
//**************************** KERNEL starts ****************************
int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iz = blockIdx.y * blockDim.y + threadIdx.y;
if (ix < nx2 && ix >= nx1 && iz >= nz1 && iz < nz2)
{
    vx_x = dxi * hc[1] * (vx[iz * nz2 + ix] - vx[iz * nz2 + (ix - 1)]);
    vz_x = dxi * hc[1] * (vz[iz * nz2 + (ix + 1)] - vz[iz * nz2 + ix]);
    vx_z = dzi * hc[1] * vx[(iz + 1) * nz2 + ix] - vx[iz * nz2 + ix];
    vz_z = dzi * hc[1] * (vz[iz * nz2 + ix] - vz[(iz - 1) * nz2 + ix]);

    szx[iz * nz2 + ix] += dt * mu_zx[iz * nz2 + ix] * (vz_x + vx_z) + 1;
    sxx[iz * nz2 + ix] += dt * (lam[iz * nz2 + ix] * (vx_x + vz_z) + (2.0 * mu[iz * nz2 + ix] * vx_x)) + 1;
    szz[iz * nz2 + ix] += dt * (lam[iz * nz2 + ix] * (vx_x + vz_z) + (2.0 * mu[iz * nz2 + ix] * vz_z)) + 1;
}
//**************************** KERNEL ends ****************************
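For clarity, this is the index mapping I am assuming between the 2-D host arrays and the flat device arrays. The helper flat_idx and the name row_width are hypothetical, not part of my actual code; row_width is whatever the per-row element count of the original 2-D allocation is.

// Hypothetical helper spelling out the flattening: a[iz][ix] on the host
// should correspond to a[iz * row_width + ix] on the device, where
// row_width is the number of elements per row of the 2-D array.
__host__ __device__ inline int flat_idx(int iz, int ix, int row_width)
{
    return iz * row_width + ix;
}

In the kernel above I used nz2 as the row width, which is only correct if the 2-D arrays were allocated with nz2 elements per row.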
//*********** original C++ code which I replaced with the CUDA kernel ***********
for (int iz = nz1; iz < nz2; iz++) {
    // std::cout << std::endl << "PML indices: " << std::endl;
    for (int ix = nx1; ix < nx2; ix++) {
        vx_x = dxi * hc[1] * (vx[iz][ix] - vx[iz][ix - 1]);
        vz_x = dxi * hc[1] * (vz[iz][ix + 1] - vz[iz][ix]);
        vx_z = dzi * hc[1] * (vx[iz + 1][ix] - vx[iz][ix]);
        vz_z = dzi * hc[1] * (vz[iz][ix] - vz[iz - 1][ix]);
        // updating stresses
        szx[iz][ix] += dt * mu_zx[iz][ix] * (vz_x + vx_z);
        sxx[iz][ix] += dt * (lam[iz][ix] * (vx_x + vz_z) + (2.0 * mu[iz][ix] * vx_x));
        szz[iz][ix] += dt * (lam[iz][ix] * (vx_x + vz_z) + (2.0 * mu[iz][ix] * vz_z));
    }
}
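To compare the two versions element by element, here is a flattened host-side sketch of the same stress update. It assumes each 2-D array is one contiguous block with row_width elements per row and that vx[0], vz[0], etc. point at the first element of that block (the same assumption my cudaMemcpy calls above rely on); the pointer names pvx, pvz, ... and row_width are hypothetical and only for this sketch.

// Sketch: the same update written against flat 1-D views of the arrays,
// so the result can be compared index by index with the CUDA kernel output.
// Assumes each 2-D array is one contiguous block, row_width elements per row.
real_sim *pvx = vx[0], *pvz = vz[0];
real_sim *psxx = sxx[0], *pszx = szx[0], *pszz = szz[0];
real_sim *plam = lam[0], *pmu = mu[0], *pmu_zx = mu_zx[0];
for (int iz = nz1; iz < nz2; iz++) {
    for (int ix = nx1; ix < nx2; ix++) {
        int i = iz * row_width + ix;                  // row_width is an assumption
        real_sim vx_x = dxi * hc[1] * (pvx[i] - pvx[i - 1]);
        real_sim vz_x = dxi * hc[1] * (pvz[i + 1] - pvz[i]);
        real_sim vx_z = dzi * hc[1] * (pvx[i + row_width] - pvx[i]);
        real_sim vz_z = dzi * hc[1] * (pvz[i] - pvz[i - row_width]);
        // updating stresses, exactly as in the 2-D loop above
        pszx[i] += dt * pmu_zx[i] * (vz_x + vx_z);
        psxx[i] += dt * (plam[i] * (vx_x + vz_z) + (2.0 * pmu[i] * vx_x));
        pszz[i] += dt * (plam[i] * (vx_x + vz_z) + (2.0 * pmu[i] * vz_z));
    }
}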