Need help in accessing 4D array in CUDA kernel

I am new and trying to access 4D array in CUDA kernel but Cuda kernel output vary for each iteration.
Please suggest, where I am doing wrong… Below is my source code.

module m
contains
attributes(global) subroutine mykernel(a, d, si_x, si_z, d_map_dz, d_map_dx )
implicit none
integer,device::a(20,20,20,20)
integer,device::d(20,20,20,20)
real,device:: si_x(20,20,20,20,20)
real,device:: si_z(20,20,20,20,20)
integer,device::d_map_dz(2)
integer,device::d_map_dx(2)
integer::mz,mx,jx,ig,iz,ix,kx,iix
ix = blockDim%x*(blockIdx%x-1) + threadIdx%x
iz = blockDim%y*(blockIdx%y-1) + threadIdx%y
ig = blockDim%z*(blockIdx%z-1) + threadIdx%z
kx = 1
mz = d_map_dz(ig)
mx = d_map_dx(ig)
do jx = 1, 8
iix = ix+jx+kx
d(iix,1,iz,ig) = 5
end do
a(ix,1,iz,mx) = a(ix,1,iz,mx) + 1
a(ix,1,iz,mz) = a(ix,1,iz,mz) + 2
si_x(ix,1,iz,mx,3) = si_x(ix,1,iz,mx,3) + 1
si_z(ix,1,iz,mz,4) = si_z(ix,1,iz,mz,4) + 1
end subroutine
end module
program main
use m
use cudafor
integer,device::b(20,20,20,20)
integer,device::d(20,20,20,20)
real,device:: si_x(20,20,20,20,20)
real,device:: si_z(20,20,20,20,20)
integer::c(20,20,20,20)
integer::e(20,20,20,20)
real:: h_si_x(20,20,20,20,20)
real:: h_si_z(20,20,20,20,20)

integer, parameter :: map_dx(2) = (/2,1/)
integer, parameter :: map_dz(2) = (/1,2/)
integer,device::d_map_dz(2)
integer,device::d_map_dx(2)
type(dim3) :: grid
type(dim3) :: block
grid = dim3(2,4,4)
block = dim3(1,5,5)
c = 0
e = 0
h_psi_x = 0
h_psi_z = 0
b = c
d = e
si_x = h_si_x
si_z = h_si_z
d_map_dz = map_dz
d_map_dx = map_dx
call mykernel<<<grid,block>>>(b,d,si_x,si_z,d_map_dz,d_map_dx)
c = b
e = d
h_si_x = si_x
h_si_z = si_z
print*,c
print*,e
print*,h_si_x
print*,h_si_z
end program
Looking for solution…