hello …
this is my code that i bring it below … my problem is that when i transfer phi0 matrix from global to shared memory i’ll get correct answer but the time of processing is going to increased that is not what i’m expect because as we know the time for accessing for shared memory is less than global memory …
i’ll be appreciate that if you tell me why and where is my problem …
this is my module code :
module simpleOps_m
use cudafor
contains
attributes(global) subroutine inc(phip1,phim1,phi0,coef,rho0, ds,N_z,N_rho)
implicit none
! matrix size in (N_z=2001,N_rho=6001)
real*8 :: phim1(N_z,N_rho),phi0(N_z,N_rho),coef(N_z,N_rho),rho0(N_z,N_rho)
real*8 :: phip1(N_z,N_rho)
real*8 , shared :: phi0_s(0:33,0:33)
real*8, value :: ds
integer , value :: N_z,N_rho
integer :: i, j , is , js
is = threadIdx%x
js = threadIdx%y
i = (blockIdx%x-1)*blockDim%x + is
j = (blockIdx%y-1)*blockDim%y + js
if(i.ge.1 .AND. i.le.N_z .AND. j.ge.1 .AND. j.le.N_rho) then
phi0_s(is,js)=phi0(i,j)
endif
if (is .eq.1 .and. i.gt.1) then
phi0_s(is-1,js)=phi0(i-1,j)
else if (is .eq. 32 .and. i.lt.N_z) then
phi0_s(is+1,js)=phi0(i+1,j)
end if
if (js .eq. 1 .and. j.gt.1)then
phi0_s(is,js-1)=phi0(i,j-1)
else if (js .eq. 32 .and. j.lt.N_rho) then
phi0_s(is,js+1)=phi0(i,j+1)
end if
if (1<i .and. i<N_z .and. 1<j .and. j<N_rho) then
phip1(i,j)=-phim1(i,j)+coef(i,j)*((-4.0d0+2.0d0/coef(i,j))*phi0_s(is,js) &
+phi0_s(is-1,js) & !...%down to center
+phi0_s(is+1,js) & !...%up to center
+(1.0d0-ds/(2.0d0*rho0(i,j)))*phi0_s(is,js-1) & !...%left to center
+(1.0d0+ds/(2.0d0*rho0(i,j)))*phi0_s(is,js+1))
end if
return
end subroutine inc
end module simpleOps_m
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
and i invoked it by this code :
call inc<<<grid,tBlock>>>(phip1_d,phim1_d,phi0_d,coef_d,rho0_d, ds,N_z,N_rho)
and my grid and block size is :
grid = dim3(ceiling(real(N_z)/32), ceiling(real(N_rho)/32), 1)
tBlock = dim3(32,32,1)
best regard @@MIRI@@