Cuda giving wrong result

I am converting my original C code to Cuda for performance enhancement but the values of both C and Cuda code are not matching. Cuda is giving an incorrect result. There are no errors anywhere with regard to Cuda error checking.
When I am launching the kernel the values are not matching with the original C code.
My Code:-

        //**************************kernel call start********************************
              
            for (int it = ishot * nt + 0; it < nt * (ishot + 1); it++)
            {
                gpuErrchk(cudaMemcpy(d_vx, vx[0], size * sizeof(real_sim), cudaMemcpyHostToDevice));
                gpuErrchk(cudaMemcpy(d_vz, vz[0], size * sizeof(real_sim), cudaMemcpyHostToDevice));
                gpuErrchk(cudaMemcpy(d_sxx, sxx[0], size * sizeof(real_sim), cudaMemcpyHostToDevice));
                gpuErrchk(cudaMemcpy(d_szx, szx[0], size * sizeof(real_sim), cudaMemcpyHostToDevice));
                gpuErrchk(cudaMemcpy(d_szz, szz[0], size * sizeof(real_sim), cudaMemcpyHostToDevice));
                  //so on the memcpy continues for the rest of arrays............
             dim3 threadsPerBlock( 16,16);
             dim3 blocksPerGrid(nz2/16+1, nx2/16+1);
                kernel << < blocksPerGrid, threadsPerBlock >> > (......);
                    
           //similarly after the kernel launch
                  
                gpuErrchk(cudaMemcpy(vx[0], d_vx, size * sizeof(real_sim), cudaMemcpyDeviceToHost));
                gpuErrchk(cudaMemcpy(vz[0], d_vz, size * sizeof(real_sim), cudaMemcpyDeviceToHost));
                  gpuErrchk(cudaMemcpy(sxx[0], d_sxx,  size * sizeof(real_sim), cudaMemcpyDeviceToHost));
                  gpuErrchk(cudaMemcpy(szx[0], d_szx,  size * sizeof(real_sim), cudaMemcpyDeviceToHost));
                  gpuErrchk(cudaMemcpy(szz[0], d_szz,  size * sizeof(real_sim), cudaMemcpyDeviceToHost));
                   //so on the memcpy continues for the rest of arrays............
            }//end of loop

        //****************************************KERNEL starts/*************
  
        int ix= blockIdx.x * blockDim.x + threadIdx.x;
         int iz = blockIdx.y * blockDim.y + threadIdx.y;
            if (ix < nx2 && ix >= nx1 && iz >= nz1 && iz < nz2)
            {
             
            vx_x = dxi * hc[1] * (vx[iz * nz2+ix] - vx[iz * nz2+(ix - 1)]);
            vz_x = dxi * hc[1] * (vz[iz * nz2+(ix + 1)] - vz[iz * nz2+ix]);
            vx_z = dzi * hc[1] * vx[(iz + 1) * nz2+ix] - vx[iz * nz2+ix];
            vz_z = dzi * hc[1] * (vz[iz * nz2+ix] - vz[(iz - 1 )* nz2+ix]);
          
   
            szx[iz*nz2+ix] += dt * mu_zx[iz * nz2 + ix] * (vz_x + vx_z)+1;
            sxx[iz * nz2 + ix] += dt * (lam[iz * nz2 + ix] * (vx_x + vz_z) + (2.0 * mu[iz * nz2 + ix] * vx_x))+1;
            szz[iz * nz2 + ix] += dt * (lam[iz * nz2 + ix] * (vx_x + vz_z) + (2.0 * mu[iz * nz2 + ix] * vz_z))+1;

                   }

        //************KERNEL  ENDS************************** 

        //***********original C++ code which I replaced with CUDA kernrl*********************************
          for (int iz = nz1; iz < nz2; iz++) {
                        //std::cout << std::endl << "PML indices: " << std::endl;
                        for (int ix = nx1; ix < nx2; ix++) {
                          
                            vx_x = dxi * hc[1] * (vx[iz][ix] - vx[iz][ix - 1]);
                            vz_x = dxi * hc[1] * (vz[iz][ix + 1] - vz[iz][ix]);
                            vx_z = dzi * hc[1] * (vx[iz + 1][ix] - vx[iz][ix]);
                            vz_z = dzi * hc[1] * (vz[iz][ix] - vz[iz - 1][ix]);
                          
                                
                            // updating stresses
                          szx[iz][ix] += dt * mu_zx[iz][ix] * (vz_x + vx_z);
                          sxx[iz][ix] += dt * (lam[iz][ix] * (vx_x + vz_z) + (2.0 * mu[iz][ix] * vx_x));
                            szz[iz][ix] += dt * (lam[iz][ix] * (vx_x + vz_z) + (2.0 * mu[iz][ix] * vz_z));
                       
                        }
                    }

>>There are no errors anywhere with regard to Cuda error checking.
You need to have explicit error handling checks in the host. Every cuda routine invoked should return cudaError_t error, a simple check “error != cudaSuccess” and print statements should help understand/raise the device errors at host end.

>>cudaMemcpy(d_vx, vx[0], size * sizeof(real_sim), cudaMemcpyHostToDevice)
cudaMemcpy funtion call expects dst and src pointers, but above the src element is passed instead of pointer. Please update it to either &vx[0] or vx.

For further debug, you can add prints in Kernel as below,
if( threadIdx.x ==0 && blockIdx.x ==0)
printf(“Debug Params: . . .”)

>>similary after the kernel launch
cudaDeviceSynchronize() should be invoked before D2H data transfer. I hope you have this in the complete code.

I have allocated my memory like this.

pDouble = new real_sim * [dim1];
pDouble[0] = new real_sim[dim1 * dim2];
for (int i = 1; i < dim1; i++) pDouble[i] = pDouble[i - 1] + dim2;

That is why I have used
>>cudaMemcpy(d_vx, vx[0], size * sizeof(real_sim), cudaMemcpyHostToDevice)
cudaDeviceSynchronize() is there in my complete code
Please look at my kernel code and the actual C code down below and let me know if you find anything…