Openmp compile by using PGI Fortran and Intel Fortran

I used the Intel and PGI Fortran to compile the same code as following. However, the results calculated by using the PGI Fortran are wrong. Is anyone can’t help me to find out the problem? Or it’s the limitation of PGI Fortan? Thanks a lot.

!------------------------------------------------------------
module parameters
  real(kind=8):: a1,a2,a3,a4,a5,a6,b1,b2,b3,b4,b5,b6
  real(kind=8):: rpm,u,vis,diam,b,pi,c,err1,err2,p_norm,norm,omega,delta
  real(kind=8):: h2,h3,dx,d2x,dy,d2y,dx2,dy2,lr,br
  real(kind=8):: pa,prr
  integer:: x1,x2,x3,x4,x5,x6,x7,x8,y1,y2    ! recess coord.
  integer:: n_x,n_y,n_x1,n_x2                  ! recess coord. parameters
  real(kind=8)::alpha,dxr,dyr                   ! recess coord. parameters
  integer::nx,ny,ip,im,jp,jm,iter_in
end module parameters
!------------------------------------------------------------
Program pso_abrg      ! Particle Swarm Algorithm
implicit none
!A program for hydrodynmic lubrication recessed slider bearing optimization by using  PSO 
! Nv: No. of variables
! Np: Population size (number of particles)
! v: pseudovelocity
! c1: Cognitive trust parameter
! c2: Social trust parameter
! w0: Inertia
! wd:Inertia Reduction parameter
! k: Bound on velocity fraction
! vd: Velocity reduction parameter
! d: Dynamic inertia/velocity reduction delay (function evaluations)
! r1,r2: Random factors in the [0,1] interval
! f: Function value
! ik: iteration times
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
integer::i,j,Nv,Np,d,ik,ikmax,it,ic,n,icmax,tpso1,tpso2,tpsor,thn
real(kind=8)::c1,c2,w0,wd,k,vd,g_f,g_f_old
real(kind=8), allocatable::x_UB(:),x_LB(:),v_x_max(:),g_x(:),g_x_new(:),r1(:),r2(:)
real(kind=8), allocatable::v_x(:,:),x(:,:),x_best(:,:),f(:),f_best(:),p_x(:,:)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!$ integer:: omp_get_num_threads
!$ integer:: omp_get_num_procs
!$ thn=omp_get_num_procs()
!$ write(*,*) "The number of available processors/threads in the system: ",thn
thn=1    ! when OpenMP is not used
!$ write(*,*) "Enter the number of threads"
!$ read(*,*) thn
!$ call omp_set_num_threads(thn)             ! set the number of threads
!write(*,*)'Np=  (e.g. 100)'
!read(*,*)Np
!write(*,*)'icmax=  (e.g. 20)'
!read(*,*)icmax
Np=60
icmax=60
Nv=3

c1=1.0d0  !2.0d0
c2=1.0d0  !2.0d0
w0=0.5d0
wd=0.05d0
k=0.2d0    !*******************
vd=0.05d0
d=5
ikmax=20 !****************************

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
allocate(x_UB(1:Nv),x_LB(1:Nv),g_x(1:Nv),g_x_new(1:Nv),v_x(1:Nv,1:Np),v_x_max(1:Nv))
allocate(x(1:Nv,1:Np),x_best(1:Nv,1:Np),p_x(1:Nv,1:Np))
allocate(f(1:Np),f_best(1:Np),r1(1:Np),r2(1:Np))
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


g_f=0.0d0

x_UB(1)=60.0d-6
x_LB(1)=40.0d-6
x_UB(2)=30.0d-3
x_LB(2)=10.0d-3
x_UB(3)=60.0d-3
x_LB(3)=40.0d-3
do i=1,Nv
   v_x_max(i)=k*(x_UB(i)-x_LB(i))
end do
ik=0
it=0
ic=0
do i=1,5
call random_number(x)
call random_number(v_x)
end do
g_f=1000000
do i=1,Nv
    x(i,:)=x(i,:)*(x_UB(i)-x_LB(i))+x_LB(i)
end do

do i=1,Nv
     v_x(i,:)=v_x(i,:)*v_x_max(i)
end do
call obfun(Nv,Np,x,f,thn)
f_best=f
x_best=x
p_x=x_best

do j=1,Np
     if (f_best(j)<=g_f) then
         g_x(:)=x_best(:,j)
         g_f=f_best(j)
     end if
end do

open(1,file='track_x.dat')
open(2,file='d_f.dat')
call system_clock(tpso1,tpsor)
do n=1,ikmax
     g_f_old=g_f
     write(2,'(1000(1x,f20.4))') g_x,-g_f
     ik=ik+1
     do i=1,Nv
          call random_number(r1)
          call random_number(r2)
          v_x(i,:)=w0*v_x(i,:)+c1*r1(:)*(p_x(i,:)-x(i,:))+c2*r2(:)*(g_x(i)-x(i,:))
     end do
     do i=1,Nv
         do j=1,Np
              if (v_x(i,j)>v_x_max(i)) then
                  v_x(i,j)=v_x_max(i)
              end if    
              x(i,j)=x(i,j)+v_x(i,j)
               if (x(i,j)<=x_LB(i)) then; x(i,j)=x_LB(i); end if
               if (x(i,j)>=x_UB(i)) then; x(i,j)=x_UB(i); end if
         end do
     end do
     write(1,'(1000(1x,f10.4))') x
     call obfun(Nv,Np,x,f,thn)
     do j=1,Np
          if (f(j)<=f_best(j)) then
              f_best(j)=f(j)
              x_best(:,j)=x(:,j)
              p_x(:,j)=x_best(:,j)
          end if
     end do
     do j=1,Np
          if (f_best(j)<=g_f) then
              g_f=f_best(j)
              g_x_new(:)=x_best(:,j)
          end if
     end do
     if (abs((g_f-g_f_old)/g_f)>1d-5) then
         it=0
     end if
     it=it+1
     g_x=g_x_new
         !write(*,*) 'g_x: ',g_x
     if (it==d) then
         w0=w0*(1.0d0-wd)
         do i=1,Nv
             v_x_max(i)=v_x_max(i)*(1.0d0-vd)
         end do
         it=0
         ic=ic+1
     end if
     if (ic==icmax) exit
     write(*,*) 'ik/g_x/g_f: ',  ik, g_x, g_f
end do
call system_clock(tpso2,tpsor)
close(1)
close(2)
write(*,*)'elapsed_time (s)=',real(tpso2-tpso1)/real(tpsor)
write(*,*)'g_x, g_f=',g_x, g_f
write(*,*)'ik=',ik

stop
end program pso_abrg

!------------------------------------------------------
subroutine obfun(Nv,Np,x,f,thn)
use parameters
implicit none
integer,intent(in)::Nv,Np,thn
real(kind=8), intent(in)::x(1:Nv,1:Np)
real(kind=8), intent(out)::f(1:Np)
integer:: psoi
real(kind=8)::op,xv(Nv,1)

!$omp parallel do 
    do psoi=1,Np
        xv(:,1)=x(:,psoi)
        call airbear(Nv,xv,op)
        f(psoi)=-op
        write(*,*) 'pi,x,f:  ', psoi,x(:,psoi),f(psoi)
    end do
!$omp end parallel do     
return
end subroutine obfun
!------------------------------------------------------
subroutine airbear(Nv,xv,op)
use parameters
implicit none
integer,intent(in)::Nv
real(kind=8),intent(in)::xv(Nv,1)
real(kind=8), intent(out)::op
real(kind=8),allocatable::p(:,:)
real(kind=8)::eps,load,phi
        c=xv(1,1)
        lr=xv(2,1)
        diam=xv(3,1)
        call constant
        call recess_coord                   ! define the recess coordinates
        allocate(p(0:nx,0:ny))
        p=pa
        eps=0.7d0
        call sor(p,eps)
        call load_brg(p,load,phi)
        !pave=load/(diam*b)
        op=load
end subroutine airbear
!------------------------------------------------------
subroutine constant
use parameters
implicit none

pi=4.0d0*atan(1.0d0)
nx=256
ny=nint(nx/pi)
if (mod(ny,2)/=0) then; ny=ny+1; end if

vis=179.0d-7                    ! air viscosity (ns/m^2)
!c=50.0d-6                       ! clearance (m)
!diam=50.0d-3                 ! bearing diameter
b=50.0d-3                       ! bearing  width (m)
err1=1.0d-5                      ! allowable errors (Inner iteration)
err2=1.0d-4                     ! allowable errors (Outer iteration)
pa=101347.0d0                ! atmosphere pressure (N/m^2)
rpm=6.0d4                     ! rotation speed (rpm)
u=rpm*diam*pi/60.d0  ! bearing velocity
omega=1.6                      ! relaxation no.
alpha=50.0d0                 ! bearing orientation angle (0<alpha<90, deg)
dxr=pi*diam/real(nx)    ! x-grid size
dyr=b/real(ny)               ! y-grid size
!lr=10d-3                          ! recess length (m)
br=40.0d-3                     ! recess width (m)
prr=4.5                           ! recess pressure (atm)
return;end subroutine constant
!------------------------------------------------------
subroutine sor(p,eps)
use parameters
implicit none

real(kind=8), intent(in)::eps
real(kind=8), intent(inout)::p(0:nx,0:ny)
real(kind=8)::de(0:nx,0:ny),de_old(0:nx,0:ny),p_old(0:nx,0:ny),h(0:nx,0:ny)
real(kind=8)::h0,rjac,theta
integer::i,j,iter1,iter2,nt,thread,i1,j1,k

dx=(diam*pi/real(nx))
dy=(b/real(ny))
rjac=(cos(pi/nx)+(dx/dy)**2*cos(pi/ny))/(1.0d0+(dx/dy)**2)
!-------------- film thickness -------------------------------
do i=0,nx ; theta=360.0/real(nx)*real(i)
   do j=0,ny
      h(i,j)=c*(1.0d0+eps*cos(pi/180.0d0*theta))
   end do
end do
!---------------------------------------------------------------
h0=minval(h)
!-----------normalization-----------
dx=dx/b
dy=dy/b
h=h/h0
p=p*(h0**2)/(vis*b*u)   ! ambient pressure, kg/m**2
dx2=dx**2; dy2=dy**2; d2x=2.0d0*dx; d2y=2.0d0*dy
!-------------------------------------
iter_in=0
iter2=0
p_norm=1.0d0
do while(p_norm > err2 .and. iter2<200)       ! outer-loop stopping criteria (pressure)
   iter2=iter2+1
   de=0.0d0
   norm=1.0d0
   p_old=p
   iter1=0
   do i=1, nx
        do j=1,ny
            if (j>=y1 .and. j<=y2) then
                if (x1>x2) then
                    if (i>=x1) p(i,j)=0
                        if (i<=x2) p(i,j)=0
                    else
                        if (i>=x1 .and. i<=x2) p(i,j)=0
                    end if
                    if (i>=x3 .and. i<=x4) p(i,j)=0
                    if (i>=x5 .and. i<=x6) p(i,j)=0
                    if (x8<x7) then
                        if (i>=x7) p(i,j)=0
                        if (i<=x8) p(i,j)=0
                    else
                    if (i>=x7 .and. i<=x8) p(i,j)=0
                end if
            end if
        end do
   end do

   do while (norm> err1 .and. iter1<5000)        ! inner-loop stopping criteria  (delta)
      de_old=de
      do i1=1,2
         do j1=1,2
            do i=i1,nx,2; ip=i+1; im=i-1
              if (i==nx) then; ip=1; end if
                 do j=j1,ny-1,2; jp=j+1; jm=j-1
                    h2=h(i,j)**2; h3=h(i,j)**3
                    b1=(h(ip,j)-h(im,j))/d2x      ! dh/dx
                    b2=(h(i,jp)-h(i,jm))/d2y      ! dh/dy
                    b3=(p(ip,j)-p(im,j))/d2x      ! dp/dx
                    b4=(p(i,jp)-p(i,jm))/d2y      ! dp/dy
                    b5=(p(ip,j)+p(im,j)-2.d0*p(i,j))/dx2      ! dp2/dx2
                    b6=(p(i,jp)+p(i,jm)-2.d0*p(i,j))/dy2      ! dp2/dy2
                    a1=(p(i,j)*h3)/(dx2)+(2.0d0*h3*b3+3.0d0*h2*p(i,j)*b1)/d2x-(6.0d0*h(i,j))/d2x
                    a2=(p(i,j)*h3)/(dx2)-(2.0d0*h3*b3+3.0d0*h2*p(i,j)*b1)/d2x+(6.0d0*h(i,j))/d2x
                    a3=(p(i,j)*h3)/(dy2)+(2.0d0*h3*b4+3.0d0*h2*p(i,j)*b2)/d2y
                    a4=(p(i,j)*h3)/(dy2)-(2.0d0*h3*b4+3.0d0*h2*p(i,j)*b2)/d2y
                    a5=(-2.0d0*p(i,j)*h3)/dx2+h3*b5+3.0d0*h2*b1*b3-(2.0d0*p(i,j)*h3)/dy2+h3*b6+3.0d0*h2*b4*b2-6.0d0*b1
                    a6=p(i,j)*h3*(b5+b6)+h3*(b3**2+b4**2)+3.0d0*h2*p(i,j)*(b1*b3+b2*b4)-6.0d0*h(i,j)*b3-6.0d0*p(i,j)*b1
                    delta=-(a1*de(ip,j)+a2*de(im,j)+a3*de(i,jp)+a4*de(i,jm)+a6)/a5-de(i,j)
                    de(i,j)=de(i,j)+omega*delta
                    if (j>=y1 .and. j<=y2) then
                        if (x1>x2) then
                           if (i>=x1) de(i,j)=prr*pa*(h0**2)/(vis*b*u)
                           if (i<=x2) de(i,j)=prr*pa*(h0**2)/(vis*b*u)
                        else
                           if (i>=x1 .and. i<=x2) de(i,j)=prr*pa*(h0**2)/(vis*b*u)
                        end if
                        if (i>=x3 .and. i<=x4) de(i,j)=prr*pa*(h0**2)/(vis*b*u)
                        if (i>=x5 .and. i<=x6) de(i,j)=prr*pa*(h0**2)/(vis*b*u)
                        if (x8<x7) then
                           if (i>=x7) de(i,j)=prr*pa*(h0**2)/(vis*b*u)
                           if (i<=x8) de(i,j)=prr*pa*(h0**2)/(vis*b*u)
                        else
                           if (i>=x7 .and. i<=x8) de(i,j)=prr*pa*(h0**2)/(vis*b*u)
                        end if
                    end if
                 end do
              end do
         end do
      end do
   iter1=iter1+1
   norm=sqrt(dx*dy*sum((de-de_old)**2))
   end do
   iter_in=iter_in+iter1
   p=p+de
   p(0,:)=p(nx,:)
   p_norm=sqrt(dx*dy*sum((p-p_old)**2))
end do

p=p*(vis*b*u)/h0**2
dx=dx*b
dy=dy*b

return;end subroutine sor
!---------------------------
subroutine load_brg(p,load,phi)  ! get the horizontal/vertical forces (fx,fy)
use parameters
implicit none
real(kind=8), intent(in):: p(0:nx,0:ny)
real(kind=8), intent(out):: load,phi
real(kind=8):: theta,px(0:nx,0:ny),py(0:nx,0:ny),fx,fy
integer:: i,j

do j=0,ny
   do i=0,nx
      theta=real(i)*360.0d0/real(nx)
      px(i,j)=p(i,j)*sin(pi/180.0d0*theta)
      py(i,j)=-p(i,j)*cos(pi/180.0d0*theta)
   end do
end do

call simps(px,fx)
call simps(py,fy)

load=sqrt(fx**2+fy**2)
phi=atan(fx/fy)*180.0d0/pi

return; end subroutine load_brg
!----------------------
subroutine simps(pn,fn) ! Simpson's integration formula
use parameters
implicit none
real(kind=8), intent(in):: pn(0:nx,0:ny)
real(kind=8), intent(out):: fn
real(kind=8):: f1,f2,s1(0:ny),sx,sy
integer:: i,j

sx=dx/3.0d0
sy=dy/3.0d0

do j=0,ny
   f1=0.0d0;f2=0.0d0

   do i=1,nx-1,2; f1=f1+pn(i,j); end do
   do i=2,nx-1,2; f2=f2+pn(i,j); end do

   s1(j)=sx*(pn(0,j)+pn(nx,j)+4.0d0*f1+2.0d0*f2)
end do
f1=0.0d0
f2=0.0d0
do j=1,ny-1,2; f1=f1+s1(j); end do
do j=2,ny-1,2; f2=f2+s1(j); end do
fn=sy*(s1(0)+s1(ny)+4.0d0*f1+2.0d0*f2)
return; end subroutine simps
!-----------------------------------------------
subroutine recess_coord

! calculate the locations of the four recesses
! alpha: the orientation angle of the hybrid journal bearing

use parameters
implicit none

n_x=nint(lr/2.0d0/dxr)
n_x1=nint(alpha*real(nx)/360.0d0)
n_x2=nint(90.0d0*real(nx)/360.0d0)

x1=n_x1-n_x
x3=x1+n_x2

if (x1 < 0) then
   x1=n_x1-n_x+nx
   x3=x1+n_x2-nx
end if

x2=n_x1+n_x
x4=x2+n_x2
x5=x3+n_x2
x6=x4+n_x2
x7=x5+n_x2
x8=x6+n_x2

if (x8 > nx) then
   x8=x6+n_x2-nx
end if

n_y=nint(br/2.0d0/dyr)
y1=ny/2-n_y
y2=ny/2+n_y

return; end subroutine recess_coord

Hi Chia-WenChan52986,

If I’m reading your code correctly, x and v_x are initialized to random values. Since you are not setting the random seed, the same random set of initial values are used. However, when moving compilers, a different set of values will be created since a different default seed and possibly a different random number generating algorithm is used. Both the PGI and Intel results are correct based on the initial values given.

If you wish to have portable results, you will need to either use a fixed set of initial values or use your own random generator to produce the same set of values.

Hope this helps,
Mat

Hi Mat,
Thanks for your reply. I forgot to write down that if I run the code compiled by PGI Fortran by using only one processor-core, the results are correct. However, the results will be wrong by using multi-core. Of course, the results are all correct by using single- and multi-core while the code is compiled by the Intel compiler. Do you have any idea about this? Thanks a lot.

Hey Mat,
I don’t think the random no. will affect the results. If you run the code by using the compile commend pgfortran -mp=allcores filename, and using 8 cores. The results will be: (where x is the variables, and f is the result)





However, if the compile commend is ifort /Qopenmp filename. The results will be like this:





I think it’s compiler’s problem.

[/img]

Hi Chia-WenChan52986,

Thanks for the further explanation. To be clear, you expect different answer from each build given you’re using random numbers to generate your initial values.

For the OpenMP “NaN” issue, this is being caused by a race condition in your code. You have each thread set the values of the parameter module. However, the default for module variables is “shared”, hence the threads are overwriting each others values causing the bad answers. To fix, you need to add the “threadprivate” clause in your module. Also, you need to privatize xv and op since all variables (except for loop index) are shared by default.

Why Intel works is not clear. It may be that how they schedule the threads happens to avoid the race condition?

  • Mat
!------------------------------------------------------------ 
 module parameters 
   real(kind=8):: a1,a2,a3,a4,a5,a6,b1,b2,b3,b4,b5,b6 
   real(kind=8)::rpm,u,vis,diam,b,pi,c,err1,err2,p_norm,norm,omega,delta
   real(kind=8):: h2,h3,dx,d2x,dy,d2y,dx2,dy2,lr,br 
   real(kind=8):: pa,prr 
   integer:: x1,x2,x3,x4,x5,x6,x7,x8,y1,y2    ! recess coord. 
   integer:: n_x,n_y,n_x1,n_x2                  ! recess coord.
   real(kind=8)::alpha,dxr,dyr                   ! recess coord.
   integer::nx,ny,ip,im,jp,jm,iter_in 
 !$omp  threadprivate(a1,a2,a3,a4,a5,a6,b1,b2,b3,b4,b5,b6) 
 !$omp  threadprivate(rpm,u,vis,diam,b,pi,c,err1,err2,p_norm,norm,omega,delta) 
 !$omp  threadprivate(h2,h3,dx,d2x,dy,d2y,dx2,dy2,lr,br,pa,prr) 
 !$omp  threadprivate(x1,x2,x3,x4,x5,x6,x7,x8,y1,y2) 
 !$omp  threadprivate(n_x,n_y,n_x1,n_x2) 
 !$omp  threadprivate(alpha,dxr,dyr) 
 !$omp  threadprivate(nx,ny,ip,im,jp,jm,iter_in) 
 end module parameters 
 !------------------------------------------------------------ 
 Program pso_abrg      ! Particle Swarm Algorithm 
 use omp_lib
 implicit none 
 !A program for hydrodynmic lubrication recessed slider bearing
 !optimization by using  PSO 
 ! Nv: No. of variables 
 ! Np: Population size (number of particles) 
 ! v: pseudovelocity 
 ! c1: Cognitive trust parameter 
 ! c2: Social trust parameter 
 ! w0: Inertia 
 ! wd:Inertia Reduction parameter 
 ! k: Bound on velocity fraction 
 ! vd: Velocity reduction parameter 
 ! d: Dynamic inertia/velocity reduction delay (function evaluations) 
 ! r1,r2: Random factors in the [0,1] interval 
 ! f: Function value 
 ! ik: iteration times 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
 integer::i,j,Nv,Np,d,ik,ikmax,it,ic,n,icmax,tpso1,tpso2,tpsor,thn 
 real(kind=8)::c1,c2,w0,wd,k,vd,g_f,g_f_old 
 real(kind=8),allocatable::x_UB(:),x_LB(:),v_x_max(:),g_x(:),g_x_new(:),r1(:),r2(:)
 real(kind=8),allocatable::v_x(:,:),x(:,:),x_best(:,:),f(:),f_best(:),p_x(:,:)
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
 !integer:: omp_get_num_threads 
 !integer:: omp_get_num_procs 
 ! thn=omp_get_num_procs() 
 ! write(*,*) "The number of available processors/threads in the
 !system: ",thn 
 thn=1    ! when OpenMP is not used 
 write(*,*) "Enter the number of threads" 
 read(*,*) thn 
 call omp_set_num_threads(thn)             ! set the number of
 !threads 
 !write(*,*)'Np=  (e.g. 100)' 
 !read(*,*)Np 
 !write(*,*)'icmax=  (e.g. 20)' 
 !read(*,*)icmax 
 Np=60 
 icmax=60 
 Nv=3 

 c1=1.0d0  !2.0d0 
 c2=1.0d0  !2.0d0 
 w0=0.5d0 
 wd=0.05d0 
 k=0.2d0    !******************* 
 vd=0.05d0 
 d=5 
 ikmax=20 !**************************** 

 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
 allocate(x_UB(1:Nv),x_LB(1:Nv),g_x(1:Nv),g_x_new(1:Nv),v_x(1:Nv,1:Np),v_x_max(1:Nv)) 
 allocate(x(1:Nv,1:Np),x_best(1:Nv,1:Np),p_x(1:Nv,1:Np)) 
 allocate(f(1:Np),f_best(1:Np),r1(1:Np),r2(1:Np)) 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 


 g_f=0.0d0 

 x_UB(1)=60.0d-6 
 x_LB(1)=40.0d-6 
 x_UB(2)=30.0d-3 
 x_LB(2)=10.0d-3 
 x_UB(3)=60.0d-3 
 x_LB(3)=40.0d-3 
 do i=1,Nv 
    v_x_max(i)=k*(x_UB(i)-x_LB(i)) 
 end do 
 ik=0 
 it=0 
 ic=0
 do i=1,5 
 call random_number(x) 
 call random_number(v_x) 
 end do
 g_f=1000000 
 do i=1,Nv 
     x(i,:)=x(i,:)*(x_UB(i)-x_LB(i))+x_LB(i) 
 end do 

 do i=1,Nv 
      v_x(i,:)=v_x(i,:)*v_x_max(i) 
 end do 
 call obfun(Nv,Np,x,f,thn) 
 f_best=f 
 x_best=x 
 p_x=x_best 

 do j=1,Np 
      if (f_best(j)<=g_f) then 
          g_x(:)=x_best(:,j) 
          g_f=f_best(j) 
      end if 
 end do 

 open(1,file='track_x.dat') 
 open(2,file='d_f.dat') 
 call system_clock(tpso1,tpsor) 
 do n=1,ikmax 
      g_f_old=g_f 
      write(2,'(1000(1x,f20.4))') g_x,-g_f 
      ik=ik+1 
      do i=1,Nv 
           call random_number(r1) 
           call random_number(r2) 
           v_x(i,:)=w0*v_x(i,:)+c1*r1(:)*(p_x(i,:)-x(i,:))+c2*r2(:)*(g_x(i)-x(i,:)) 
      end do 
      do i=1,Nv 
          do j=1,Np 
               if (v_x(i,j)>v_x_max(i)) then 
                   v_x(i,j)=v_x_max(i) 
               end if    
               x(i,j)=x(i,j)+v_x(i,j) 
                if (x(i,j)<=x_LB(i)) then; x(i,j)=x_LB(i); end if 
                if (x(i,j)>=x_UB(i)) then; x(i,j)=x_UB(i); end if 
          end do 
      end do 
      write(1,'(1000(1x,f10.4))') x 
      call obfun(Nv,Np,x,f,thn) 
      do j=1,Np 
           if (f(j)<=f_best(j)) then 
               f_best(j)=f(j) 
               x_best(:,j)=x(:,j) 
               p_x(:,j)=x_best(:,j) 
           end if 
      end do 
      do j=1,Np 
           if (f_best(j)<=g_f) then 
               g_f=f_best(j) 
               g_x_new(:)=x_best(:,j) 
           end if 
      end do 
      if (abs((g_f-g_f_old)/g_f)>1d-5) then 
          it=0 
      end if 
      it=it+1 
      g_x=g_x_new 
          !write(*,*) 'g_x: ',g_x 
      if (it==d) then 
          w0=w0*(1.0d0-wd) 
          do i=1,Nv 
              v_x_max(i)=v_x_max(i)*(1.0d0-vd) 
          end do 
          it=0 
          ic=ic+1 
      end if 
      if (ic==icmax) exit 
      write(*,*) 'ik/g_x/g_f: ',  ik, g_x, g_f 
 end do 
 call system_clock(tpso2,tpsor) 
 close(1) 
 close(2) 
 write(*,*)'elapsed_time (s)=',real(tpso2-tpso1)/real(tpsor) 
 write(*,*)'g_x, g_f=',g_x, g_f 
 write(*,*)'ik=',ik 

 stop 
 end program pso_abrg 

 !------------------------------------------------------ 
 subroutine obfun(Nv,Np,x,f,thn) 
 use parameters 
 implicit none 
 integer,intent(in)::Nv,Np,thn 
 real(kind=8), intent(in)::x(1:Nv,1:Np) 
 real(kind=8), intent(out)::f(1:Np) 
 integer:: psoi 
 real(kind=8)::op,xv(Nv,1) 

 !$omp parallel do &
 !$omp  private (xv,op) 
     do psoi=1,Np 
         op = 1.0
         xv(:,1)=x(:,psoi) 
         call airbear(Nv,xv,op) 
         f(psoi)=-op 
         write(*,*) 'pi,x,f:  ', psoi,x(:,psoi),f(psoi) 
     end do 
 !$omp end parallel do      
 return 
 end subroutine obfun 
 !------------------------------------------------------ 
 subroutine airbear(Nv,xv,op) 
 use parameters 
 implicit none 
 integer,intent(in)::Nv 
 real(kind=8),intent(in)::xv(Nv,1) 
 real(kind=8), intent(out)::op 
 real(kind=8),allocatable::p(:,:) 
 real(kind=8)::eps,load,phi 
         c=xv(1,1) 
         lr=xv(2,1) 
         diam=xv(3,1) 
         call constant 
         call recess_coord                   ! define the recess
         allocate(p(0:nx,0:ny)) 
         p=pa 
         eps=0.7d0 
         call sor(p,eps) 
         call load_brg(p,load,phi) 
         !pave=load/(diam*b) 
         op=load 
 end subroutine airbear 
 !------------------------------------------------------ 
 subroutine constant 
 use parameters 
 implicit none 

 pi=4.0d0*atan(1.0d0) 
 nx=256 
 ny=nint(nx/pi) 
 if (mod(ny,2)/=0) then; ny=ny+1; end if 

 vis=179.0d-7                    ! air viscosity (ns/m^2) 
 !c=50.0d-6                       ! clearance (m) 
 !diam=50.0d-3                 ! bearing diameter 
 b=50.0d-3                       ! bearing  width (m) 
 err1=1.0d-5                      ! allowable errors (Inner iteration) 
 err2=1.0d-4                     ! allowable errors (Outer iteration) 
 pa=101347.0d0                ! atmosphere pressure (N/m^2) 
 rpm=6.0d4                     ! rotation speed (rpm) 
 u=rpm*diam*pi/60.d0  ! bearing velocity 
 omega=1.6                      ! relaxation no. 
 alpha=50.0d0                 ! bearing orientation angle (0<alpha<90,
 dxr=pi*diam/real(nx)    ! x-grid size 
 dyr=b/real(ny)               ! y-grid size 
 !lr=10d-3                          ! recess length (m) 
 br=40.0d-3                     ! recess width (m) 
 prr=4.5                           ! recess pressure (atm) 
 return;end subroutine constant 
 !------------------------------------------------------ 
 subroutine sor(p,eps) 
 use parameters 
 implicit none 

 real(kind=8), intent(in)::eps 
 real(kind=8), intent(inout)::p(0:nx,0:ny) 
 real(kind=8)::de(0:nx,0:ny),de_old(0:nx,0:ny),p_old(0:nx,0:ny),h(0:nx,0:ny) 
 real(kind=8)::h0,rjac,theta 
 integer::i,j,iter1,iter2,nt,thread,i1,j1,k 

 dx=(diam*pi/real(nx)) 
 dy=(b/real(ny)) 
 rjac=(cos(pi/nx)+(dx/dy)**2*cos(pi/ny))/(1.0d0+(dx/dy)**2) 
 !-------------- film thickness ------------------------------- 
 do i=0,nx ; theta=360.0/real(nx)*real(i) 
    do j=0,ny 
       h(i,j)=c*(1.0d0+eps*cos(pi/180.0d0*theta)) 
    end do 
 end do 
 !--------------------------------------------------------------- 
 h0=minval(h) 
 !-----------normalization----------- 
 dx=dx/b 
 dy=dy/b 
 h=h/h0 
 p=p*(h0**2)/(vis*b*u)   ! ambient pressure, kg/m**2 
 dx2=dx**2; dy2=dy**2; d2x=2.0d0*dx; d2y=2.0d0*dy 
 !------------------------------------- 
 iter_in=0 
 iter2=0 
 p_norm=1.0d0 
 do while(p_norm > err2 .and. iter2<200)       ! outer-loop stopping
    iter2=iter2+1 
    de=0.0d0 
    norm=1.0d0 
    p_old=p 
    iter1=0 
    do i=1, nx 
         do j=1,ny 
             if (j>=y1 .and. j<=y2) then 
                 if (x1>x2) then 
                     if (i>=x1) p(i,j)=0 
                         if (i<=x2) p(i,j)=0 
                     else 
                         if (i>=x1 .and. i<=x2) p(i,j)=0 
                     end if 
                     if (i>=x3 .and. i<=x4) p(i,j)=0 
                     if (i>=x5 .and. i<=x6) p(i,j)=0 
                     if (x8<x7) then 
                         if (i>=x7) p(i,j)=0 
                         if (i<=x8) p(i,j)=0 
                     else 
                     if (i>=x7 .and. i<=x8) p(i,j)=0 
                 end if 
             end if 
         end do 
    end do 

    do while (norm> err1 .and. iter1<5000)        ! inner-loop stopping
       de_old=de 
       do i1=1,2 
          do j1=1,2 
             do i=i1,nx,2; ip=i+1; im=i-1 
               if (i==nx) then; ip=1; end if 
                  do j=j1,ny-1,2; jp=j+1; jm=j-1 
                     h2=h(i,j)**2; h3=h(i,j)**3 
                     b1=(h(ip,j)-h(im,j))/d2x      ! dh/dx 
                     b2=(h(i,jp)-h(i,jm))/d2y      ! dh/dy 
                     b3=(p(ip,j)-p(im,j))/d2x      ! dp/dx 
                     b4=(p(i,jp)-p(i,jm))/d2y      ! dp/dy 
                     b5=(p(ip,j)+p(im,j)-2.d0*p(i,j))/dx2      ! dp2/dx2 
                     b6=(p(i,jp)+p(i,jm)-2.d0*p(i,j))/dy2      ! dp2/dy2 
                     a1=(p(i,j)*h3)/(dx2)+(2.0d0*h3*b3+3.0d0*h2*p(i,j)*b1)/d2x-(6.0d0*h(i,j))/d2x 
                     a2=(p(i,j)*h3)/(dx2)-(2.0d0*h3*b3+3.0d0*h2*p(i,j)*b1)/d2x+(6.0d0*h(i,j))/d2x 
                     a3=(p(i,j)*h3)/(dy2)+(2.0d0*h3*b4+3.0d0*h2*p(i,j)*b2)/d2y 
                     a4=(p(i,j)*h3)/(dy2)-(2.0d0*h3*b4+3.0d0*h2*p(i,j)*b2)/d2y 
                     a5=(-2.0d0*p(i,j)*h3)/dx2+h3*b5+3.0d0*h2*b1*b3-(2.0d0*p(i,j)*h3)/dy2+h3*b6+3.0d0*h2*b4*b2-6.0d0*b1 
                     a6=p(i,j)*h3*(b5+b6)+h3*(b3**2+b4**2)+3.0d0*h2*p(i,j)*(b1*b3+b2*b4)-6.0d0*h(i,j)*b3-6.0d0*p(i,j)*b1 
                     delta=-(a1*de(ip,j)+a2*de(im,j)+a3*de(i,jp)+a4*de(i,jm)+a6)/a5-de(i,j) 
                     de(i,j)=de(i,j)+omega*delta 
                     if (j>=y1 .and. j<=y2) then 
                         if (x1>x2) then 
                            if (i>=x1) de(i,j)=prr*pa*(h0**2)/(vis*b*u) 
                            if (i<=x2) de(i,j)=prr*pa*(h0**2)/(vis*b*u) 
                         else 
                 if (i>=x1 .and. i<=x2) de(i,j)=prr*pa*(h0**2)/(vis*b*u)
                         end if 
                 if (i>=x3 .and. i<=x4) de(i,j)=prr*pa*(h0**2)/(vis*b*u)
                 if (i>=x5 .and. i<=x6) de(i,j)=prr*pa*(h0**2)/(vis*b*u)
                         if (x8<x7) then 
                            if (i>=x7) de(i,j)=prr*pa*(h0**2)/(vis*b*u) 
                            if (i<=x8) de(i,j)=prr*pa*(h0**2)/(vis*b*u) 
                         else 
                 if (i>=x7 .and. i<=x8) de(i,j)=prr*pa*(h0**2)/(vis*b*u)
                         end if 
                     end if 
                  end do 
               end do 
          end do 
       end do 
    iter1=iter1+1 
    norm=sqrt(dx*dy*sum((de-de_old)**2)) 
    end do 
    iter_in=iter_in+iter1 
    p=p+de 
    p(0,:)=p(nx,:) 
    p_norm=sqrt(dx*dy*sum((p-p_old)**2)) 
 end do 

 p=p*(vis*b*u)/h0**2 
 dx=dx*b 
 dy=dy*b 

 return;end subroutine sor 
 !--------------------------- 
 subroutine load_brg(p,load,phi)  ! get the horizontal/vertical forces
 use parameters 
 implicit none 
 real(kind=8), intent(in):: p(0:nx,0:ny) 
 real(kind=8), intent(out):: load,phi 
 real(kind=8):: theta,px(0:nx,0:ny),py(0:nx,0:ny),fx,fy 
 integer:: i,j 

 do j=0,ny 
    do i=0,nx 
       theta=real(i)*360.0d0/real(nx) 
       px(i,j)=p(i,j)*sin(pi/180.0d0*theta) 
       py(i,j)=-p(i,j)*cos(pi/180.0d0*theta) 
    end do 
 end do 

 call simps(px,fx) 
 call simps(py,fy) 

 load=sqrt(fx**2+fy**2) 
 phi=atan(fx/fy)*180.0d0/pi 

 return; end subroutine load_brg 
 !---------------------- 
 subroutine simps(pn,fn) ! Simpson's integration formula 
 use parameters 
 implicit none 
 real(kind=8), intent(in):: pn(0:nx,0:ny) 
 real(kind=8), intent(out):: fn 
 real(kind=8):: f1,f2,s1(0:ny),sx,sy 
 integer:: i,j 

 sx=dx/3.0d0 
 sy=dy/3.0d0 

 do j=0,ny 
    f1=0.0d0;f2=0.0d0 

    do i=1,nx-1,2; f1=f1+pn(i,j); end do 
    do i=2,nx-1,2; f2=f2+pn(i,j); end do 

    s1(j)=sx*(pn(0,j)+pn(nx,j)+4.0d0*f1+2.0d0*f2) 
 end do 
 f1=0.0d0 
 f2=0.0d0 
 do j=1,ny-1,2; f1=f1+s1(j); end do 
 do j=2,ny-1,2; f2=f2+s1(j); end do 
 fn=sy*(s1(0)+s1(ny)+4.0d0*f1+2.0d0*f2) 
 return; end subroutine simps 
 !----------------------------------------------- 
 subroutine recess_coord 

 ! calculate the locations of the four recesses 
 ! alpha: the orientation angle of the hybrid journal bearing 

 use parameters 
 implicit none 

 n_x=nint(lr/2.0d0/dxr) 
 n_x1=nint(alpha*real(nx)/360.0d0) 
 n_x2=nint(90.0d0*real(nx)/360.0d0) 

 x1=n_x1-n_x 
 x3=x1+n_x2 

 if (x1 < 0) then 
    x1=n_x1-n_x+nx 
    x3=x1+n_x2-nx 
 end if 

 x2=n_x1+n_x 
 x4=x2+n_x2 
 x5=x3+n_x2 
 x6=x4+n_x2 
 x7=x5+n_x2 
 x8=x6+n_x2 

 if (x8 > nx) then 
    x8=x6+n_x2-nx 
 end if 

 n_y=nint(br/2.0d0/dyr) 
 y1=ny/2-n_y 
 y2=ny/2+n_y 

 return; end subroutine recess_coord 

% pgfortran -mp test.f90 -fast ; a.out
 Enter the number of threads
8
 pi,x,f:             25   5.9516665980696078E-005   2.8437097496238260E-002
   4.6029907490028846E-002   -1.342359780510695
 pi,x,f:              9   5.0088449394745570E-005   2.3686958953027498E-002
   4.2029409143188728E-002   -2.412770113034530
 pi,x,f:             17   4.4093232024295338E-005   2.6362292584158238E-002
   5.1496865681003784E-002   -8.826354187314926
 pi,x,f:             26   5.0093386960804480E-005   2.7163973133330042E-002
   4.2043941585420443E-002   -1.243699821905178
 pi,x,f:             47   5.8831031247096063E-005   2.1460884697961261E-002
   4.1440365191234889E-002   -2.711834122652949
...

Hey mkcolg,
Thanks for your help to solve this problem. I think the Intel compiler also can’t do it very well. If the comment “!$omp threadprivate ()” wasn’t applied, few errors occur when the program was executed at the begging. Therefore, the variables should be allocated in the thread privately while using both compiler. Thanks again.