CUDA Fortran Program Execution Time Issue

When I run the following program it takes more time than that of simple CPU version and I can not take nsamp more than 200. Please suggest for any modification in the code so that it can run very fast on GPU.

  1. Main program (psologoXT.cuf)
    program psologoGPU
    use cudafor
    use precision_m
    use nswarm_modXT
    !use randommodGPU
    implicit none
    integer:: tpb=32,istat
    integer, parameter::nsteps=500
    integer, value ::d_nsamp
    real(kind=dp), dimension(nsamp,2):: wk0,wk,wci,wcc,wcs,x0tf
    !real(kind=dp), dimension(nsamp,2), device:: d_wk0,d_wk,d_x0tf
    real(kind=dp), dimension(nsamp,2)::xk0tf, xk0tfin
    real(kind=dp), dimension(nsamp,2),device::d_x0tf, d_xk0tfin
    real(kind=dp), dimension (2)::a,b,d,gbest,c1,c2
    real(kind=dp), dimension (2), device::d_a,d_b,d_d,d_gbest
    real(kind=dp)::gbestval
    integer::l,gbestloc,k,i1,i11
    real(kind=dp), dimension(nsamp,2),device::d_wci,d_wcc,d_wcs
    real(kind=dp), dimension(nsamp), device::d_j
    real(kind=dp), dimension(nsamp)::j,ran1,ran2,ran3,ran4,ran5,ran6
    real(kind=dp), dimension(nsamp)::jpbest,jopbest
    real(kind=dp),dimension(nsamp,2)::pbest,opbest
    real(kind=dp), device,allocatable::d_jpbest(:),d_jopbest(:)
    real(kind=dp),device,allocatable::d_pbest(:,:)
    real::temp
    real(kind=dp), dimension(nsamp,2)::ak,bk, dk
    real(kind=dp), dimension(nsamp,3)::xf
    real(kind=dp), dimension(nsamp,3), Device::d_xf
    real(kind=dp), dimension(nsamp),device::d_ak1, d_dk1
    integer:: seed,seed1
    real:: tsec
    seed1=7654321
    tsec=0.0
    !call random_seed()
    do i1=1, nsamp
    seed=seed1+2int(i1secnds(tsec))
    ran1(i1)=ran(seed)
    seed=seed1+2int(3i1secnds(tsec))
    ran2(i1)=ran(seed)
    end do
    a(1)=0.7500d0
    a(2)=2
    b(1)=0.836893d0
    b(2)=5
    d=b-a
    allocate(d_pbest(nsamp,2),d_jpbest(nsamp),d_jopbest(nsamp))
    call subintervals(a,b,nsamp,ak,bk)
    dk=bk-ak
    do l=1,nsamp
    !call random_number(ran1)
    xk0tf(l,1)=ak(l,1)+ran1(l)dk(l,1)
    pbest(l,1)=xk0tf(l,1)
    !call random_seed()
    !call random_number(ran1)
    xk0tf(l,2)=a(2)+ran2(l)d(2)
    pbest(l,2)=xk0tf(l,2)
    !write(
    ,
    ) xk0, tf0
    end do
    !$OMP End Do nowait
    !write(
    ,) xk0(l), tf0(l)
    wk=0.0
    wk0=0.0
    xk0tfin=xk0tf
    opbest=pbest
    steps:Do k=1,nsteps
    d_x0tf=pbest
    !istat=cudaMemcpy(d_x0tf, pbest, nsamp, cudaMemcpyHostToDevice)
    call swarm<<<ceiling(real(nsamp)/tpb),tpb>>>&
    (d_x0tf,d_xf,d_jpbest)
    istat=cudaMemcpy(jpbest, d_jpbest, nsamp,cudaMemcpyDeviceToHost)
    !call jupdate<<<ceiling(real(nsamp)/tpb),tpb>>>(d_xk0tf,d_xf,d_jpbest,d_nsamp)
    jopbest=jpbest
    !write(
    ,) jopbest
    call minandpos(jpbest,gbestval,gbestloc,nsamp)
    gbest(1)=pbest(gbestloc,1)
    gbest(2)=pbest(gbestloc,2)
    write(
    ,),k,gbest,gbestval
    seed=seed1+2
    int(secnds(tsec))
    !$OMP Do
    seed=seed1i1+2int(secnds(tsec))
    !call random_seed()
    do i1=1, nsamp
    seed=seed1+2int(i1secnds(tsec))
    ran1(i1)=ran(seed)
    seed=seed1+2int(2i1secnds(tsec))
    ran2(i1)=ran(seed)
    seed=seed1+2
    int(3i1secnds(tsec))
    ran3(i1)=ran(seed)
    seed=seed1+2int(4i1secnds(tsec))
    ran4(i1)=ran(seed)
    seed=seed1+2
    int(5i1secnds(tsec))
    ran5(i1)=ran(seed)
    seed=seed1+2int(6i1*secnds(tsec))
    ran6(i1)=ran(seed)
    end do

Do i1=1,nsamp
!write(,),i1, ran1(i1)
wci(i1,1)=(1.d0+ran1(i1))/2.
!write(,),i1, ran2(i1)
wci(i1,2)=(1.d0+ran2(i1))/2.
!write(,),i1, ran3(i1)
wcc(i1,1)=1.49445d0ran3(i1)
!write(
,),i1, ran5(i1)
wcc(i1,2)=1.49445d0
ran4(i1)
!write(,),i1, ran4(i1)
wcs(i1,1)=1.49445d0ran5(i1)
!write(
,),i1, ran6(i1)
wcs(i1,2)=1.49445d0
ran6(i1)

wk(i1,1)=wci(i1,1)wk0(i1,1)+wcc(i1,1)(pbest(i1,1)-xk0tfin(i1,1))&
+wcs(i1,1)*(gbest(1)-xk0tfin(i1,1))

wk(i1,2)=wci(i1,2)wk0(i1,2)+wcc(i1,2)(pbest(i1,2)-xk0tfin(i1,2))&
+wcs(i1,2)*(gbest(2)-xk0tfin(i1,2))

if (wk(i1,1)<-d(1)) then
wk(i1,1)=-d(1)
end if
if (wk(i1,1)>d(1)) then
wk(i1,1)=d(1)
end if

xk0tf(i1,1)=xk0tfin(i1,1)+wk(i1,1)

if (xk0tf(i1,1)<a(1)) then
xk0tf(i1,1)=a(1)
wk(i1,1)=0.0
end if
if (xk0tf(i1,1)>b(1)) then
xk0tf(i1,1)=b(1)
wk(i1,1)=0.0
end if
if (wk(i1,2)<-d(2)) then
wk(i1,2)=-d(2)
end if
if (wk(i1,2)>d(2)) then
wk(i1,2)=d(2)
end if
xk0tf(i1,2)=xk0tfin(i1,2)+wk(i1,2)
if (xk0tf(i1,2)<a(2)) then
xk0tf(i1,2)=a(2)
wk(i1,2)=0.0
end if
if (xk0tf(i1,2)>b(2)) then
xk0tf(i1,2)=b(2)
wk(i1,2)=0.0
end if
end do
!write(,)xk0tf
d_x0tf=xk0tf

!istat=cudaMemcpy(d_x0tf, xk0tf, nsamp)
call swarm<<<ceiling(real(nsamp)/tpb),tpb>>>(d_x0tf,d_xf,d_jpbest)
istat=cudaMemcpy(jpbest, d_jpbest, nsamp,cudaMemcpyDeviceToHost)
!jpbest=d_jpbest
!write(,) xk0tf
Do i1=1,nsamp
if(jopbest(i1) .lt. jpbest(i1)) then
pbest(i1,1)=opbest(i1,1)
pbest(i1,2)=opbest(i1,2)
opbest(i1,1)=pbest(i1,1)
opbest(i1,2)=pbest(i1,2)
else
pbest(i1,1)=xk0tf(i1,1)
pbest(i1,2)=xk0tf(i1,2)
opbest(i1,1)=pbest(i1,1)
opbest(i1,2)=pbest(i1,2)
end if
end DO
wk0=wk
xk0tfin=xk0tf
opbest=pbest
end do steps
deallocate(tfinal,d_pbest,d_jpbest,d_jopbest)
end program psologoGPU

subroutine subintervals(a1,b1,n1,ak1,bk1)
use precision_m
use nswarm_modXT
implicit none
integer, intent(in)::n1
real(kind=dp), intent(in),dimension(2)::a1,b1
real(kind=dp), intent(out),dimension(n1,2)::ak1,bk1
integer::i
real(kind=dp), dimension(2)::h
h(1)=real((b1(1)-a1(1))/n1)
h(2)=real((b1(2)-a1(2))/n1)
ak1(1,1)=a1(1)
ak1(1,2)=a1(2)
bk1(n1,1)=b1(1)
bk1(n1,2)=b1(2)
Do i=1,n1-1
ak1(i+1,1)=ak1(i,1)+h(1)
ak1(i+1,2)=ak1(i,2)+h(2)
bk1(i,1)=ak1(i+1,1)
bk1(i,2)=ak1(i+1,2)
end do
end subroutine subintervals


subroutine minandpos(tab,mini,mpos,dim1)
implicit none
INTEGER, PARAMETER :: dp = SELECTED_REAL_KIND(15)
integer, intent(in)::dim1
real(kind=dp), intent(out)::mini
integer, intent(out)::mpos
real(kind=dp), intent(in), dimension(dim1)::tab
integer::m
mini=tab(1)
Do m=2,dim1
if(mini>tab(m)) then
mini=tab(m)
end if
end do
Do m=1,dim1
if(mini==tab(m)) then
mpos=m
end if
end do
end subroutine minandpos

  1. supporting modules:

(a) nswarm_modXT:

module nswarm_modXT
use precision_m
use curand_device
implicit none
!INTEGER, PARAMETER :: dp = SELECTED_REAL_KIND(10)
integer, parameter::nsamp=32
integer, parameter:: nvar1=3,neq1=3
real(kind=dp), parameter,value::jacabiC=3.0d0! Jacobi constant for L1
integer,parameter, value::p=5,pcap=4
real,device,allocatable::tfinal(:)
!real(kind=dp),dimension(7)::alpha=(/0.0, 1.0/5.0,3.0/10.0,4.0/5.0,8.0/9.0,1.0,1.0 /)
contains

attributes(global) subroutine swarm(dx0tf,dxf,djpbest)
implicit none
real(kind=dp), dimension(3):: x,x0,xcap,xk0
real(kind=dp), intent(in),dimension(nsamp,2)::dx0tf
real(kind=dp), intent(out),dimension(nsamp)::djpbest
real(kind=dp), value::dt,t0,t,tmp,etol,&
hopt,hnew,hmin,hmax,facmax,facmin,fac,err2
!4.334959
real(kind=dp):: pi=3.141592653589793238462d0
real(kind=dp), intent(out),dimension(nsamp,3):: dxf
real(kind=dp), dimension(3):: atol, rtol
integer:: i,i1,q
allocate(tfinal(nsamp))
atol(1:3)=1.0E-1610.**1/3.
rtol(1:3)= atol
10.1/3.
etol=1.0E-16
dxf(nsamp,3)=0
q=min(p,pcap)
i1=(blockIdx%x-1)*blockDim%x+threadIdx%x
if(i1<=nsamp) then
!err2=0.0
!jdo:Do i1=1,nsamp1
dt=0.00001D0
t0=0
t=0
facmax=3.d0
facmin=1.6d0
fac=.84
!fac(i1)=(0.25D0)
(1.0d0/(real(q)+1.0D0))
x0(1)=dx0tf(i1,1)
!0.768714
x0(2)=0.0d0
x0(3)=pi/2.
tfinal(i1)=dx0tf(i1,2)
!tfinal=4.334959d0
rk45do:Do
if(t .eq. 0) then
call hctrl(x0,x,xcap,atol,rtol,t,dt,err2)
call klmn(x0,x,xcap,dt)
t=t+dt
hmin=dt/64.D0
hmax=64.d0dt
!write(
,*) t, x
x0(1)=x(1)
x0(2)=x(2)
x0(3)=x(3)

!d_xf=xfdv
else
call hctrl(x0,x,xcap,atol,rtol,t,dt,err2)
!write(,) err2
hnew=dtmin(hmax,max(hmin,&
fac
(etol/err2)**(1.0/(real(q)+1.0))))
if(err2 .le. etol .and. facmax .le. 5.0) then
!write(,) 'Accepted steps ',dt
call klmn(x0,x,xcap,dt)
t=t+dt
tmp=t
x0(1)=x(1)
x0(2)=x(2)
x0(3)=x(3)
dxf(i1,1)=x0(1)
dxf(i1,2)=x0(2)
dxf(i1,3)=x0(3)
!write(,) t, x
facmax=facmax+.2D0
if (err2 .eq. 0.0) then
hopt=0.0
else
hopt=hnew
end if
if (hopt.lt. 0.75d0) then
dt=max(hopt4.d0,64.d0hmin)
else if (hopt .gt. 1.5d0) then
dt=min(4.d0hopt,1.6d0)
end if
if (dt .gt. hmax) then
dt=dt/2.
else if(tmp .gt. tfinal(i1)) then
dt=tfinal(i1)-dt
if(dt>hmax) exit
end if
else
call klmn(x0,x,xcap,dt)
dt=hnew
facmax=1.d0
!write(
,*) 'rejected steps ',dt
if(t>tfinal(i1)) exit
end if
end if

end do rk45do
call syncthreads()
!write(,) ‘Aceepted steps=’,nac, 'rejected steps= ',nrej
!end Do jdo
djpbest(i1)=abs(dxf(i1,1)-dx0tf(i1,1))+abs(dxf(i1,2))&
+abs(min(mod(dxf(i1,3)-pi/2.0,2.0pi),mod(-(dxf(i1,3)-pi/2.0),2.0pi)))
end if
!call syncthreads()
!call jupdate(d_x0tf,d_xf,d_jpbest)
call syncthreads()
end subroutine swarm



attributes(device) subroutine jupdate(dx0tf,dxf,djpbest)
implicit none
real(kind=dp), intent(out), dimension(nsamp)::djpbest
real(kind=dp),intent(in), dimension(nsamp,3)::dxf
real(kind=dp),intent(in), dimension(nsamp,2):: dx0tf
real(kind=dp):: pi=3.141592653589793238462d0
integer::i1
i1=(blockIdx%x-1)blockDim%x+threadIdx%x
if(i1<=nsamp) then
djpbest(i1)=abs(dxf(i1,1)-dx0tf(i1,1))+abs(dxf(i1,2))&
+abs(min(mod(dxf(i1,3)-pi/2.0,2.0
pi),mod(-(dxf(i1,3)-pi/2.0),2.0pi)))
!write(
,*) xf
end if
call syncthreads()
end subroutine jupdate

attributes(device) subroutine &
hctrl(x01,x1,x1cap,atol1,rtol1,t01,dt0,err1)
implicit none
real(kind=dp), intent(in), dimension(nvar1):: atol1,rtol1
real(kind=dp):: d0,d1,d2,h0,h1,dsum=0.0,d1sum=0.0,t01
real(kind=dp),intent(inout), dimension(nvar1)::x01,x1,x1cap
real(kind=dp), dimension(nvar1)::f=0.0,f0, f1,sc
real(kind=dp),intent(out)::dt0,err1
!fac=0.8,0.9, (0.25)(1.0/(q+1.0)), or (0.38)(1.0/(q+1.0))
integer::q,i
q=min(p,pcap)
call eqm(x01,f)
if (t01 .eq. 0.0) then
f0=f
Do i=1,nvar1
sc(i)=atol1(i)+abs(x01(i))rtol1(i)
dsum=dsum+(x01(i)/sc(i))**2
d1sum=d1sum+(f0(i)/sc(i))*2
end do
d0=sqrt(dsum/nvar1)
d1=sqrt(d1sum/nvar1)
dsum=0.0
d1sum=0.0
if(d1 .lt. 1.0E-6 .or. d0 .lt. 1.0E-6) then
h0=1.0E-6
else
h0=0.01
(d0/d1)
end if
x01=x01+h0
f0
call eqm(x01,f)
f1=f
Do i=1,nvar1
sc(i)=atol1(i)+max(abs(f1(i)),abs(f0(i)))rtol1(i)
dsum=dsum+((f1(i)-f0(i))/sc(i))**2
end do
d2=sqrt(dsum/nvar1)/h0
if(max(d1,d2).le. 1.0E-6 ) then
h1=max(1.0d-6,h0
1.0d-3)
else
h1=(0.01d0/max(d1,d2))**(1.0/(real(q)+1.0))
end if
dt0=min(100.0d0h0,h1)
!write(
,) dt0
!call klmn(x01,x1,x1cap,para1,dt0)
else
err1=0.0D0
!call klmn(x01,x1,x1cap,para1,dt0)
Do i=1,nvar1
sc(i)=atol1(i)+max(abs(x01(i)), abs(x1cap(i)))rtol1(i)
err1=err1+((x1(i)-x1cap(i))/sc(i))**2
end do
err1=sqrt(err1/nvar1)
!write(
,
) err1
end if
end subroutine hctrl

attributes(device) subroutine klmn(y0,y1,y1cap,dtt)
implicit none
real(kind=dp), intent(in)::dtt
real(kind=dp),intent(in),dimension(3)::y0
real(kind=dp),intent(out),dimension(3)::y1,y1cap
real(kind=dp), dimension(3)::f1,sum1,sum2
real(kind=dp),dimension(7,3)::k
!real(kind=dp),dimension(3),shared::k1,k2,k3,k4,k5,k6,k7
real(kind=dp), dimension(7)::ccap=(/ 5179.0/57600.0,0.0,7571.0/16695.0,&
393.0/640.0,-92097.0/339200.0,187.0/2100.0,1.0/40.0 /)
real(kind=dp),dimension(7)::crk=(/35.0/384.0,0.0,500.0/1113.0,&
125.0/192.0,-2187.0/6784.0,11.0/84.0, 0.0/)
real(kind=dp), dimension(7,7)::b
integer::i,j
!allocate(b(7,7))
b(1,1:7)=0.0
b(2,2:7)=0.0
b(3,3:7)=0.0
b(4,4:7)=0.0
b(5,5:7)=0.0
b(6,6:7)=0.0
b(7,7)=0.0
b(2,1)=1.0/5.0
b(3,1)=3.0/40.0
b(4,1)=44.0/45.0
b(5,1)=19372.0/6561.0
b(6,1)=9017.0/3168.0
b(7,1)=35.0/384.0
b(3,2)=9.0/40.0
b(4,2)=-56.0/15.0
b(5,2)=-25360.0/2187.0
b(6,2)=-355.0/33.0
b(7,2)=0.0
b(4,3)=32.0/9.0
b(5,3)=64448.0/6561.0
b(6,3)=46732.0/5247.0
b(7,3)=500.0/1113.0
b(5,4)=-212.0/729.0
b(6,4)=49.0/176.0
b(7,4)=125.0/192.0
b(6,5)=-5103.0/18656.0
b(7,5)=-2187.0/6784.0
b(7,6)=11.0/84.0

y1=y0
!write(,) beta(2,2)
Do i=2,7
sum1=0
call eqm(y1,f1)
k(i-1,1)=f1(1)
k(i-1,2)=f1(2)
k(i-1,3)=f1(3)
Do,j=1,i-1
sum1(1)=sum1(1)+b(i,j)*k(j,1)
sum1(2)=sum1(2)+b(i,j)*k(j,2)
sum1(3)=sum1(3)+b(i,j)k(j,3)
End Do
y1=y0+dtt
sum1
End Do

call eqm(y0,f1)
k(7,:)=f1
Do,i=1,7
sum1=sum1+crk(i)k(i,:)
sum2=sum2+ccap(i)k(i,:)
End Do
y1=y0+dtt
sum1
y1cap=y0+dtt
sum2
!write(,) y1-y1cap
end subroutine klmn

attributes(device) subroutine eqm(var,f)
implicit none
real(kind=dp), intent(in),dimension(3), device ::var
real(kind=dp),intent(out),dimension(3), device ::f
real(kind=dp), shared:: mu=0.01215510d0
real(kind=dp), device ::r1,r2,omx,omy,om
r1=sqrt((var(1)+mu)2+var(2)2)
r2=sqrt((var(1)+mu-1.d0)2+var(2)2)
omx=var(1)-((1.d0-mu)*(var(1)+mu))/r1
3.-((mu)*(var(1)+mu-1.d0))/r2
3.
omy=var(2)-(1.d0-mu)*var(2)/r1
3-(mu)*var(2)/r2
3
om=(var(1)**2+var(2)**2)/2.d0+(1.d0-mu)/r1+mu/r2
f(1)=cos(var(3))sqrt(2.d0om-jacabiC)
f(2)=sin(var(3))sqrt(2.d0om-jacabiC)
f(3)=(omycos(var(3))-omxsin(var(3)))/sqrt(2.d0*om-jacabiC)-2.d0
end subroutine eqm
end module nswarm_modXT

2(b) Module for precision_m:

module precision_m
integer, parameter::singlePrecision=kind(0.0)
integer, parameter::doublePrecision=kind(0.0d0)
!integer, parameter::dp=singlePrecision
integer, parameter::dp=doublePrecision
end module precision_m