hello …
there is part of my simple code that i use shared memory to make process time faster but i get wrong answer … it seems that phi0 don’t shift in array … for simplicity i used one block threads in dimension of (64:64) …
i’ll be appreciate if you tell me that where is my problem ?
!!!
module simpleOps_m
use cudafor
contains
attributes(global) subroutine inc(phip1,phim1,phi0,coef,rho0, ds,N_z,N_rho)
implicit none
!my dimension is (64:64)
real*8 :: phim1(64,64),phi0(64,64),coef(64,64),rho0(64,64)
real*8 , shared :: phim1_s(64,64) , coef_s(64,64) , rho0_s(64,64),&
phip1_s(64,64),phi0_s(64,64)
real8 :: phip1(64,64)
real8, value :: ds
integer , value :: N_z,N_rho
integer :: is , js , i , j
!!! local shared array
is=threadidx%x
js=threadidx%y
phim1_s(is,js) = phim1(is,js)
coef_s(is,js) = coef(is,js)
rho0_s(is,js) = rho0(is,js)
phip1_s(is,js) = phip1(is,js)
phi0_s(is,js) = phi0(is,js)
call syncthreads()
if(is>1 .AND. is<64 .AND. js>1 .AND. js<64) then
phip1_s(is,js)=-phim1_s(is,js)+coef_s(is,js)*((-4.0d0+2.0d0/coef_s(is,js))phi0_s(is,js) &
+phi0_s(is-1,js) & !..%down to center
+phi0_s(is+1,js) & !..%up to center
+(1.0d0-ds/(2.0d0rho0_s(is,js)))phi0_s(is,js-1) & !..%left to center
+(1.0d0+ds/(2.0d0rho0_s(is,js)))*phi0_s(is,js+1))
end if
call syncthreads()
phip1(is,js) = phip1_s(is,js)
end subroutine inc
end module simpleOps_m
!!!
i import my variables and then transfer them to shared memory and then used them but i get wrong answer because phi0 don’t shift …