When I run the following program it takes more time than that of simple CPU version and I can not take nsamp more than 200. Please suggest for any modification in the code so that it can run very fast on GPU.
- Main program (psologoXT.cuf)
program psologoGPU
use cudafor
use precision_m
use nswarm_modXT
!use randommodGPU
implicit none
integer:: tpb=32,istat
integer, parameter::nsteps=500
integer, value ::d_nsamp
real(kind=dp), dimension(nsamp,2):: wk0,wk,wci,wcc,wcs,x0tf
!real(kind=dp), dimension(nsamp,2), device:: d_wk0,d_wk,d_x0tf
real(kind=dp), dimension(nsamp,2)::xk0tf, xk0tfin
real(kind=dp), dimension(nsamp,2),device::d_x0tf, d_xk0tfin
real(kind=dp), dimension (2)::a,b,d,gbest,c1,c2
real(kind=dp), dimension (2), device::d_a,d_b,d_d,d_gbest
real(kind=dp)::gbestval
integer::l,gbestloc,k,i1,i11
real(kind=dp), dimension(nsamp,2),device::d_wci,d_wcc,d_wcs
real(kind=dp), dimension(nsamp), device::d_j
real(kind=dp), dimension(nsamp)::j,ran1,ran2,ran3,ran4,ran5,ran6
real(kind=dp), dimension(nsamp)::jpbest,jopbest
real(kind=dp),dimension(nsamp,2)::pbest,opbest
real(kind=dp), device,allocatable::d_jpbest(:),d_jopbest(:)
real(kind=dp),device,allocatable::d_pbest(:,:)
real::temp
real(kind=dp), dimension(nsamp,2)::ak,bk, dk
real(kind=dp), dimension(nsamp,3)::xf
real(kind=dp), dimension(nsamp,3), Device::d_xf
real(kind=dp), dimension(nsamp),device::d_ak1, d_dk1
integer:: seed,seed1
real:: tsec
seed1=7654321
tsec=0.0
!call random_seed()
do i1=1, nsamp
seed=seed1+2int(i1secnds(tsec))
ran1(i1)=ran(seed)
seed=seed1+2int(3i1secnds(tsec))
ran2(i1)=ran(seed)
end do
a(1)=0.7500d0
a(2)=2
b(1)=0.836893d0
b(2)=5
d=b-a
allocate(d_pbest(nsamp,2),d_jpbest(nsamp),d_jopbest(nsamp))
call subintervals(a,b,nsamp,ak,bk)
dk=bk-ak
do l=1,nsamp
!call random_number(ran1)
xk0tf(l,1)=ak(l,1)+ran1(l)dk(l,1)
pbest(l,1)=xk0tf(l,1)
!call random_seed()
!call random_number(ran1)
xk0tf(l,2)=a(2)+ran2(l)d(2)
pbest(l,2)=xk0tf(l,2)
!write(,) xk0, tf0
end do
!$OMP End Do nowait
!write(,) xk0(l), tf0(l)
wk=0.0
wk0=0.0
xk0tfin=xk0tf
opbest=pbest
steps:Do k=1,nsteps
d_x0tf=pbest
!istat=cudaMemcpy(d_x0tf, pbest, nsamp, cudaMemcpyHostToDevice)
call swarm<<<ceiling(real(nsamp)/tpb),tpb>>>&
(d_x0tf,d_xf,d_jpbest)
istat=cudaMemcpy(jpbest, d_jpbest, nsamp,cudaMemcpyDeviceToHost)
!call jupdate<<<ceiling(real(nsamp)/tpb),tpb>>>(d_xk0tf,d_xf,d_jpbest,d_nsamp)
jopbest=jpbest
!write(,) jopbest
call minandpos(jpbest,gbestval,gbestloc,nsamp)
gbest(1)=pbest(gbestloc,1)
gbest(2)=pbest(gbestloc,2)
write(,),k,gbest,gbestval
seed=seed1+2int(secnds(tsec))
!$OMP Do
seed=seed1i1+2int(secnds(tsec))
!call random_seed()
do i1=1, nsamp
seed=seed1+2int(i1secnds(tsec))
ran1(i1)=ran(seed)
seed=seed1+2int(2i1secnds(tsec))
ran2(i1)=ran(seed)
seed=seed1+2int(3i1secnds(tsec))
ran3(i1)=ran(seed)
seed=seed1+2int(4i1secnds(tsec))
ran4(i1)=ran(seed)
seed=seed1+2int(5i1secnds(tsec))
ran5(i1)=ran(seed)
seed=seed1+2int(6i1*secnds(tsec))
ran6(i1)=ran(seed)
end do
Do i1=1,nsamp
!write(,),i1, ran1(i1)
wci(i1,1)=(1.d0+ran1(i1))/2.
!write(,),i1, ran2(i1)
wci(i1,2)=(1.d0+ran2(i1))/2.
!write(,),i1, ran3(i1)
wcc(i1,1)=1.49445d0ran3(i1)
!write(,),i1, ran5(i1)
wcc(i1,2)=1.49445d0ran4(i1)
!write(,),i1, ran4(i1)
wcs(i1,1)=1.49445d0ran5(i1)
!write(,),i1, ran6(i1)
wcs(i1,2)=1.49445d0ran6(i1)
wk(i1,1)=wci(i1,1)wk0(i1,1)+wcc(i1,1)(pbest(i1,1)-xk0tfin(i1,1))&
+wcs(i1,1)*(gbest(1)-xk0tfin(i1,1))
wk(i1,2)=wci(i1,2)wk0(i1,2)+wcc(i1,2)(pbest(i1,2)-xk0tfin(i1,2))&
+wcs(i1,2)*(gbest(2)-xk0tfin(i1,2))
if (wk(i1,1)<-d(1)) then
wk(i1,1)=-d(1)
end if
if (wk(i1,1)>d(1)) then
wk(i1,1)=d(1)
end if
xk0tf(i1,1)=xk0tfin(i1,1)+wk(i1,1)
if (xk0tf(i1,1)<a(1)) then
xk0tf(i1,1)=a(1)
wk(i1,1)=0.0
end if
if (xk0tf(i1,1)>b(1)) then
xk0tf(i1,1)=b(1)
wk(i1,1)=0.0
end if
if (wk(i1,2)<-d(2)) then
wk(i1,2)=-d(2)
end if
if (wk(i1,2)>d(2)) then
wk(i1,2)=d(2)
end if
xk0tf(i1,2)=xk0tfin(i1,2)+wk(i1,2)
if (xk0tf(i1,2)<a(2)) then
xk0tf(i1,2)=a(2)
wk(i1,2)=0.0
end if
if (xk0tf(i1,2)>b(2)) then
xk0tf(i1,2)=b(2)
wk(i1,2)=0.0
end if
end do
!write(,)xk0tf
d_x0tf=xk0tf
!istat=cudaMemcpy(d_x0tf, xk0tf, nsamp)
call swarm<<<ceiling(real(nsamp)/tpb),tpb>>>(d_x0tf,d_xf,d_jpbest)
istat=cudaMemcpy(jpbest, d_jpbest, nsamp,cudaMemcpyDeviceToHost)
!jpbest=d_jpbest
!write(,) xk0tf
Do i1=1,nsamp
if(jopbest(i1) .lt. jpbest(i1)) then
pbest(i1,1)=opbest(i1,1)
pbest(i1,2)=opbest(i1,2)
opbest(i1,1)=pbest(i1,1)
opbest(i1,2)=pbest(i1,2)
else
pbest(i1,1)=xk0tf(i1,1)
pbest(i1,2)=xk0tf(i1,2)
opbest(i1,1)=pbest(i1,1)
opbest(i1,2)=pbest(i1,2)
end if
end DO
wk0=wk
xk0tfin=xk0tf
opbest=pbest
end do steps
deallocate(tfinal,d_pbest,d_jpbest,d_jopbest)
end program psologoGPU
subroutine subintervals(a1,b1,n1,ak1,bk1)
use precision_m
use nswarm_modXT
implicit none
integer, intent(in)::n1
real(kind=dp), intent(in),dimension(2)::a1,b1
real(kind=dp), intent(out),dimension(n1,2)::ak1,bk1
integer::i
real(kind=dp), dimension(2)::h
h(1)=real((b1(1)-a1(1))/n1)
h(2)=real((b1(2)-a1(2))/n1)
ak1(1,1)=a1(1)
ak1(1,2)=a1(2)
bk1(n1,1)=b1(1)
bk1(n1,2)=b1(2)
Do i=1,n1-1
ak1(i+1,1)=ak1(i,1)+h(1)
ak1(i+1,2)=ak1(i,2)+h(2)
bk1(i,1)=ak1(i+1,1)
bk1(i,2)=ak1(i+1,2)
end do
end subroutine subintervals
subroutine minandpos(tab,mini,mpos,dim1)
implicit none
INTEGER, PARAMETER :: dp = SELECTED_REAL_KIND(15)
integer, intent(in)::dim1
real(kind=dp), intent(out)::mini
integer, intent(out)::mpos
real(kind=dp), intent(in), dimension(dim1)::tab
integer::m
mini=tab(1)
Do m=2,dim1
if(mini>tab(m)) then
mini=tab(m)
end if
end do
Do m=1,dim1
if(mini==tab(m)) then
mpos=m
end if
end do
end subroutine minandpos
- supporting modules:
(a) nswarm_modXT:
module nswarm_modXT
use precision_m
use curand_device
implicit none
!INTEGER, PARAMETER :: dp = SELECTED_REAL_KIND(10)
integer, parameter::nsamp=32
integer, parameter:: nvar1=3,neq1=3
real(kind=dp), parameter,value::jacabiC=3.0d0! Jacobi constant for L1
integer,parameter, value::p=5,pcap=4
real,device,allocatable::tfinal(:)
!real(kind=dp),dimension(7)::alpha=(/0.0, 1.0/5.0,3.0/10.0,4.0/5.0,8.0/9.0,1.0,1.0 /)
contains
attributes(global) subroutine swarm(dx0tf,dxf,djpbest)
implicit none
real(kind=dp), dimension(3):: x,x0,xcap,xk0
real(kind=dp), intent(in),dimension(nsamp,2)::dx0tf
real(kind=dp), intent(out),dimension(nsamp)::djpbest
real(kind=dp), value::dt,t0,t,tmp,etol,&
hopt,hnew,hmin,hmax,facmax,facmin,fac,err2
!4.334959
real(kind=dp):: pi=3.141592653589793238462d0
real(kind=dp), intent(out),dimension(nsamp,3):: dxf
real(kind=dp), dimension(3):: atol, rtol
integer:: i,i1,q
allocate(tfinal(nsamp))
atol(1:3)=1.0E-1610.**1/3.
rtol(1:3)= atol10.1/3.
etol=1.0E-16
dxf(nsamp,3)=0
q=min(p,pcap)
i1=(blockIdx%x-1)*blockDim%x+threadIdx%x
if(i1<=nsamp) then
!err2=0.0
!jdo:Do i1=1,nsamp1
dt=0.00001D0
t0=0
t=0
facmax=3.d0
facmin=1.6d0
fac=.84
!fac(i1)=(0.25D0)(1.0d0/(real(q)+1.0D0))
x0(1)=dx0tf(i1,1)
!0.768714
x0(2)=0.0d0
x0(3)=pi/2.
tfinal(i1)=dx0tf(i1,2)
!tfinal=4.334959d0
rk45do:Do
if(t .eq. 0) then
call hctrl(x0,x,xcap,atol,rtol,t,dt,err2)
call klmn(x0,x,xcap,dt)
t=t+dt
hmin=dt/64.D0
hmax=64.d0dt
!write(,*) t, x
x0(1)=x(1)
x0(2)=x(2)
x0(3)=x(3)
!d_xf=xfdv
else
call hctrl(x0,x,xcap,atol,rtol,t,dt,err2)
!write(,) err2
hnew=dtmin(hmax,max(hmin,&
fac(etol/err2)**(1.0/(real(q)+1.0))))
if(err2 .le. etol .and. facmax .le. 5.0) then
!write(,) 'Accepted steps ',dt
call klmn(x0,x,xcap,dt)
t=t+dt
tmp=t
x0(1)=x(1)
x0(2)=x(2)
x0(3)=x(3)
dxf(i1,1)=x0(1)
dxf(i1,2)=x0(2)
dxf(i1,3)=x0(3)
!write(,) t, x
facmax=facmax+.2D0
if (err2 .eq. 0.0) then
hopt=0.0
else
hopt=hnew
end if
if (hopt.lt. 0.75d0) then
dt=max(hopt4.d0,64.d0hmin)
else if (hopt .gt. 1.5d0) then
dt=min(4.d0hopt,1.6d0)
end if
if (dt .gt. hmax) then
dt=dt/2.
else if(tmp .gt. tfinal(i1)) then
dt=tfinal(i1)-dt
if(dt>hmax) exit
end if
else
call klmn(x0,x,xcap,dt)
dt=hnew
facmax=1.d0
!write(,*) 'rejected steps ',dt
if(t>tfinal(i1)) exit
end if
end if
end do rk45do
call syncthreads()
!write(,) ‘Aceepted steps=’,nac, 'rejected steps= ',nrej
!end Do jdo
djpbest(i1)=abs(dxf(i1,1)-dx0tf(i1,1))+abs(dxf(i1,2))&
+abs(min(mod(dxf(i1,3)-pi/2.0,2.0pi),mod(-(dxf(i1,3)-pi/2.0),2.0pi)))
end if
!call syncthreads()
!call jupdate(d_x0tf,d_xf,d_jpbest)
call syncthreads()
end subroutine swarm
attributes(device) subroutine jupdate(dx0tf,dxf,djpbest)
implicit none
real(kind=dp), intent(out), dimension(nsamp)::djpbest
real(kind=dp),intent(in), dimension(nsamp,3)::dxf
real(kind=dp),intent(in), dimension(nsamp,2):: dx0tf
real(kind=dp):: pi=3.141592653589793238462d0
integer::i1
i1=(blockIdx%x-1)blockDim%x+threadIdx%x
if(i1<=nsamp) then
djpbest(i1)=abs(dxf(i1,1)-dx0tf(i1,1))+abs(dxf(i1,2))&
+abs(min(mod(dxf(i1,3)-pi/2.0,2.0pi),mod(-(dxf(i1,3)-pi/2.0),2.0pi)))
!write(,*) xf
end if
call syncthreads()
end subroutine jupdate
attributes(device) subroutine &
hctrl(x01,x1,x1cap,atol1,rtol1,t01,dt0,err1)
implicit none
real(kind=dp), intent(in), dimension(nvar1):: atol1,rtol1
real(kind=dp):: d0,d1,d2,h0,h1,dsum=0.0,d1sum=0.0,t01
real(kind=dp),intent(inout), dimension(nvar1)::x01,x1,x1cap
real(kind=dp), dimension(nvar1)::f=0.0,f0, f1,sc
real(kind=dp),intent(out)::dt0,err1
!fac=0.8,0.9, (0.25)(1.0/(q+1.0)), or (0.38)(1.0/(q+1.0))
integer::q,i
q=min(p,pcap)
call eqm(x01,f)
if (t01 .eq. 0.0) then
f0=f
Do i=1,nvar1
sc(i)=atol1(i)+abs(x01(i))rtol1(i)
dsum=dsum+(x01(i)/sc(i))**2
d1sum=d1sum+(f0(i)/sc(i))*2
end do
d0=sqrt(dsum/nvar1)
d1=sqrt(d1sum/nvar1)
dsum=0.0
d1sum=0.0
if(d1 .lt. 1.0E-6 .or. d0 .lt. 1.0E-6) then
h0=1.0E-6
else
h0=0.01(d0/d1)
end if
x01=x01+h0f0
call eqm(x01,f)
f1=f
Do i=1,nvar1
sc(i)=atol1(i)+max(abs(f1(i)),abs(f0(i)))rtol1(i)
dsum=dsum+((f1(i)-f0(i))/sc(i))**2
end do
d2=sqrt(dsum/nvar1)/h0
if(max(d1,d2).le. 1.0E-6 ) then
h1=max(1.0d-6,h01.0d-3)
else
h1=(0.01d0/max(d1,d2))**(1.0/(real(q)+1.0))
end if
dt0=min(100.0d0h0,h1)
!write(,) dt0
!call klmn(x01,x1,x1cap,para1,dt0)
else
err1=0.0D0
!call klmn(x01,x1,x1cap,para1,dt0)
Do i=1,nvar1
sc(i)=atol1(i)+max(abs(x01(i)), abs(x1cap(i)))rtol1(i)
err1=err1+((x1(i)-x1cap(i))/sc(i))**2
end do
err1=sqrt(err1/nvar1)
!write(,) err1
end if
end subroutine hctrl
attributes(device) subroutine klmn(y0,y1,y1cap,dtt)
implicit none
real(kind=dp), intent(in)::dtt
real(kind=dp),intent(in),dimension(3)::y0
real(kind=dp),intent(out),dimension(3)::y1,y1cap
real(kind=dp), dimension(3)::f1,sum1,sum2
real(kind=dp),dimension(7,3)::k
!real(kind=dp),dimension(3),shared::k1,k2,k3,k4,k5,k6,k7
real(kind=dp), dimension(7)::ccap=(/ 5179.0/57600.0,0.0,7571.0/16695.0,&
393.0/640.0,-92097.0/339200.0,187.0/2100.0,1.0/40.0 /)
real(kind=dp),dimension(7)::crk=(/35.0/384.0,0.0,500.0/1113.0,&
125.0/192.0,-2187.0/6784.0,11.0/84.0, 0.0/)
real(kind=dp), dimension(7,7)::b
integer::i,j
!allocate(b(7,7))
b(1,1:7)=0.0
b(2,2:7)=0.0
b(3,3:7)=0.0
b(4,4:7)=0.0
b(5,5:7)=0.0
b(6,6:7)=0.0
b(7,7)=0.0
b(2,1)=1.0/5.0
b(3,1)=3.0/40.0
b(4,1)=44.0/45.0
b(5,1)=19372.0/6561.0
b(6,1)=9017.0/3168.0
b(7,1)=35.0/384.0
b(3,2)=9.0/40.0
b(4,2)=-56.0/15.0
b(5,2)=-25360.0/2187.0
b(6,2)=-355.0/33.0
b(7,2)=0.0
b(4,3)=32.0/9.0
b(5,3)=64448.0/6561.0
b(6,3)=46732.0/5247.0
b(7,3)=500.0/1113.0
b(5,4)=-212.0/729.0
b(6,4)=49.0/176.0
b(7,4)=125.0/192.0
b(6,5)=-5103.0/18656.0
b(7,5)=-2187.0/6784.0
b(7,6)=11.0/84.0
y1=y0
!write(,) beta(2,2)
Do i=2,7
sum1=0
call eqm(y1,f1)
k(i-1,1)=f1(1)
k(i-1,2)=f1(2)
k(i-1,3)=f1(3)
Do,j=1,i-1
sum1(1)=sum1(1)+b(i,j)*k(j,1)
sum1(2)=sum1(2)+b(i,j)*k(j,2)
sum1(3)=sum1(3)+b(i,j)k(j,3)
End Do
y1=y0+dttsum1
End Do
call eqm(y0,f1)
k(7,:)=f1
Do,i=1,7
sum1=sum1+crk(i)k(i,:)
sum2=sum2+ccap(i)k(i,:)
End Do
y1=y0+dttsum1
y1cap=y0+dttsum2
!write(,) y1-y1cap
end subroutine klmn
attributes(device) subroutine eqm(var,f)
implicit none
real(kind=dp), intent(in),dimension(3), device ::var
real(kind=dp),intent(out),dimension(3), device ::f
real(kind=dp), shared:: mu=0.01215510d0
real(kind=dp), device ::r1,r2,omx,omy,om
r1=sqrt((var(1)+mu)2+var(2)2)
r2=sqrt((var(1)+mu-1.d0)2+var(2)2)
omx=var(1)-((1.d0-mu)*(var(1)+mu))/r13.-((mu)*(var(1)+mu-1.d0))/r23.
omy=var(2)-(1.d0-mu)*var(2)/r13-(mu)*var(2)/r23
om=(var(1)**2+var(2)**2)/2.d0+(1.d0-mu)/r1+mu/r2
f(1)=cos(var(3))sqrt(2.d0om-jacabiC)
f(2)=sin(var(3))sqrt(2.d0om-jacabiC)
f(3)=(omycos(var(3))-omxsin(var(3)))/sqrt(2.d0*om-jacabiC)-2.d0
end subroutine eqm
end module nswarm_modXT
2(b) Module for precision_m:
module precision_m
integer, parameter::singlePrecision=kind(0.0)
integer, parameter::doublePrecision=kind(0.0d0)
!integer, parameter::dp=singlePrecision
integer, parameter::dp=doublePrecision
end module precision_m