hello …
i’ll be appreciate if you tell me where is my problem …
i read your article with this title of ‘’ CUDA Fortran acceleration for the finite-difference time-domain method ‘’ and implemented your algorithm but i get wrong answer … the differents between my code and yours is that i shifted phi0 matrix in 4 aspect …
There is part of my simple code that I use shared memory to make process time faster but I get wrong answer.
It seems that phi0 doesn’t shift in array(phi0(is,js+1) is equal to phi0(is,js) for my code). I used block threads in dimension of (32,32).
when i using global memory my answer is going to be correct and if i transfer phi0 to shared memory my answer is going to be wrong …
!!!
module simpleOps_m
use cudafor
contains
attributes(global) subroutine inc(phip1,phim1,phi0,coef,rho0, ds,N_z,N_rho)
implicit none
! matrix size in (N_z=2016,N_rho=6016)
real8 , device :: phim1(:,:),phi0(:,:),coef(:,:),rho0(:,:)
real8 , intent(out) :: phip1(:,:)
real*8 , shared :: phi0_s(0:33,0:33)
real*8, value :: ds
integer , value :: N_z,N_rho
integer :: i, j , is , js , k , m
is = threadIdx%x
js = threadIdx%y
i = (blockIdx%x-1)*blockDim%x + is
j = (blockIdx%y-1)*blockDim%y + js
if(i>1 .AND. i<N_z .AND. j>1 .AND. j<N_rho) then
phi0_s(is,js)=phi0(i,j)
call syncthreads()
if (is .eq. 1 ) then
phi0_s(is-1,js)=phi0(i-1,j)
else if (is .eq. 32)then
phi0_s(is+1,js)=phi0(i+1,j)
end if
if (js .eq. 1 )then
phi0_s(is,js-1)=phi0(i,j-1)
else if (js .eq. 32) then
phi0_s(is,js+1)=phi0(i,j+1)
end if
end if
call syncthreads()
if (1<i .and. i<N_z .and. 1<j .and. j<N_rho) then
phip1(i,j)=-phim1(i,j)+coef(i,j)*((-4.0d0+2.0d0/coef(i,j))phi0_s(is,js) &
+phi0_s(is-1,js) & !..%down to center
+phi0_s(is+1,js) & !..%up to center
+(1.0d0-ds/(2.0d0rho0(i,j)))phi0_s(is,js-1) & !..%left to center
+(1.0d0+ds/(2.0d0rho0(i,j)))*phi0_s(is,js+1))
end if
end subroutine inc
end module simpleOps_m
!!!
my grid and block size is :
grid = dim3(ceiling(real(nx)/32), ceiling(real(ny)/32), 1)
tBlock = dim3(32,32,1)
and i invoked function inc by this code :
call inc<<<grid,tBlock>>>(phip1_d,phim1_d,phi0_d,coef_d,rho0_d, ds,N_z,N_rho)
best regard