Hi, I’m playing around with CUDA for the first time and I’m trying to replace the main loop in a subroutine with a CUDA kernel. I’m not getting correct numbers, so I’m wondering if someone could check for glaring CUDA mistakes. I also know it’s probably written poorly, but for now I’m just trying to get it working. Also, when I compile with -fast I sometimes get:
0: copyout Memcpy (host=0xe3e148, dev=0x200100000, size=4356) FAILED: 4(unspecified launch failure)
Code below. It looks a bit long but a lot of it is calculations you can probably ignore. Thanks for any help!
The host code:
c grid: accumulate the source density den on the mesh using the GPU
c kernel gridloop, then fold the boundary rows/columns and scale.
c Right now only ion quantities (n_i) are calculated.
c Fixed-form source: statements start in column 7, continuation
c marks sit in column 6.
      subroutine grid
      use slabcuda
      use cudafor
      include 'slab.h'
c
      integer i,j
c status returned by the CUDA runtime calls below
      integer cuerr
c device-resident copies of the arrays the kernel reads and writes
      real, device :: den_d(0:im,0:jm),w3_d(mm),mu_d(mm)
      real, device :: x3_d(mm),rwx_d(4),y3_d(mm),rwy_d(4)
c launch configuration: a 1D grid of 1D blocks, one thread per
c index 1..mm
      type(dim3) :: bGrid, tBlock
      tBlock = dim3(256,1,1) ! 256 threads per block
c round up so a partial last block still covers index mm; the
c kernel itself masks the surplus threads
      bGrid = dim3(ceiling(real(mm)/tBlock%x),1,1)
c
c clear the host density so the device copy starts from zero
      do j=0,jm
         do i=0,im
            den(i,j)=0.
         enddo
      enddo
c
      dv=(dx*dy)
c transfer arrays from host to device
      den_d = den
      w3_d = w3
      mu_d = mu
      x3_d = x3
      rwx_d = rwx
      y3_d = y3
      rwy_d = rwy
c launch kernel
      call gridloop<<<bGrid, tBlock>>>(den_d,w3_d,mu_d,
     &     x3_d,rwx_d,y3_d,rwy_d,
     &     mm,dv,lr,
     &     mims,
     &     lx,ly,dx,dy)
c a kernel launch returns no status of its own: ask the runtime
c whether the launch was accepted ...
      cuerr = cudaGetLastError()
      if (cuerr .ne. cudaSuccess) then
         write(*,*) 'grid: gridloop launch failed: ',
     &        cudaGetErrorString(cuerr)
         stop
      endif
c ... and wait for the kernel to finish so a fault inside it is
c reported here instead of by the copy-back that follows
      cuerr = cudaDeviceSynchronize()
      if (cuerr .ne. cudaSuccess) then
         write(*,*) 'grid: gridloop execution failed: ',
     &        cudaGetErrorString(cuerr)
         stop
      endif
c transfer den from device back to host
      den = den_d
c
c add row im into row 0 and make both rows equal; this must run
c before the column fold below because the two share corner cells
      do j=0,jm
         den(0,j)=( den(0,j)+den(im,j) )
         den(im,j)=den(0,j)
      enddo
c
c same fold for columns 0 and jm
      do i=0,im
         den(i,0)=(den(i,0)+den(i,jm))
         den(i,jm)=den(i,0)
      enddo
c
c scale every cell by q/n0
      do j=0,jm
         do i=0,im
            den(i,j)=q*den(i,j)/n0
         enddo
      enddo
c
      return
      end
The device code (module):
c slabcuda: device kernels for the density deposit (CUDA Fortran).
c Fixed-form source: statements start in column 7, continuation
c marks sit in column 6.
      module slabcuda
      contains
c
c gridloop: each thread handles one index m in 1..mm.  For each of
c the lr offsets (rwx(l),rwy(l)) scaled by rhog it forms a point
c (xt,yt), finds the cell (i,j) containing it and adds the weight
c wght, split bilinearly, to the four corners of that cell.
c
c   den        accumulated in place; indexed from 0 in both dims
c   w3,mu      per-index inputs to wght and rhog
c   x3,y3      per-index base coordinates
c   rwx,rwy    per-offset displacements, read for l = 1..lr
c   mm         number of indices; threads with m > mm do nothing
c   dv,lr      wght = w3(m)/dv/lr
c   mims       rhog = sqrt(mu(m))/mims
c   lx,ly      domain extents used to fold xt and yt back inside
c   dx,dy      cell sizes used to turn (xt,yt) into (i,j)
      attributes(global) subroutine gridloop(den,w3,mu,x3,rwx,y3,
     &     rwy,
     &     mm,dv,lr,
     &     mims,
     &     lx,ly,dx,dy)
      implicit none
c An assumed-shape dummy written den(:,:) has lower bounds of 1
c whatever the actual argument's bounds are.  The caller's array
c is declared (0:im,0:jm) and i,j below start at 0, so the lower
c bounds must be restated here; without them every deposit is
c shifted by one cell and index 0 falls outside the array.
      real, intent(inout) :: den(0:,0:)
      real, intent(in) :: w3(:),mu(:),x3(:),rwx(:),y3(:),rwy(:)
      real, value :: lx,ly,dx,dy
      real, value :: dv,mims
      integer, value :: mm,lr
c locals (the value attribute is only valid on dummy arguments)
      integer :: m,l,i,j
      real :: wx0,wx1,wy0,wy1,wght,xt,yt,rhog
c atomicadd returns the previous cell value, which is not needed
      real :: old
c
c global 1-based thread index
      m = blockDim%x * (blockIdx%x - 1) + threadIdx%x
c the last block may hold more threads than there are indices
      if (m<=mm) then
         wght=w3(m)/dv/float(lr)
         rhog=sqrt(mu(m))/mims
         do l=1,lr
            xt=x3(m)+rwx(l)*rhog
            yt=y3(m)+rwy(l)*rhog
c
c x: reflect at 0 and at lx, then nudge an exact lx inside
            if(xt.lt.0.) xt=-xt
            if(xt.gt.lx) xt=2.*lx-xt
            if(xt.eq.lx) xt=0.99999*lx
c y: shift by ly to wrap around, then nudge an exact ly inside
            if(yt.ge.ly) yt=yt-ly
            if(yt.le.0.) yt=yt+ly
            if(yt.eq.ly) yt=0.99999*ly
c 0-based index of the cell containing (xt,yt)
            i=int(xt/dx)
            j=int(yt/dy)
c
c linear weights: wx0,wy0 for node (i,j), wx1,wy1 for the next
            wx0=float(i+1)-xt/dx
            wx1=1.-wx0
            wy0=float(j+1)-yt/dy
            wy1=1.-wy0
c
c atomic adds, since several threads can update the same node
            old = atomicadd(den(i,j), wght*wx0*wy0)
            old = atomicadd(den(i+1,j), wght*wx1*wy0)
            old = atomicadd(den(i,j+1), wght*wx0*wy1)
            old = atomicadd(den(i+1,j+1), wght*wx1*wy1)
         enddo
      endif
      end subroutine gridloop
      end module slabcuda