The following is my cuda code.
! Linerize Reynolds Equation by Newton’s Method
! Air Foil Journal Bearing
! Domain partitioning method with OpenMP is used to parallize SOR method
! iterative methods for simple Reynolds eq.
module sor_rb_mod
use cudafor
contains
!-------------------------- SOR for GPU --------------------------------
attributes(device) subroutine sor_kernel(i,j,nx,ny,pdev,hdev,dx,dy,omega,pddev)
implicit none
real(kind=8),shared :: pdev(0:nx,0:ny),hdev(0:nx,0:ny), pddev(0:nx,0:ny)
real(kind=8):: a1,a2,a3,a4,a5,a6,b1,b2,b3,b4,b5,b6,h2,h3
real(kind=8), value:: omega,delta,dx,d2x,dy,d2y,dx2,dy2
integer, value :: i,j,nx, ny,ip,im,jp,jm,np
ip=i+1; im=i-1
if (i==nx) then; ip=1; end if
jp=j+1; jm=j-1
dx2=dx2; dy2=dy2; d2x=2.0d0dx; d2y=2.0d0dy
h2=hdev(i,j)2; h3=hdev(i,j)3
b1=(hdev(ip,j)-hdev(im,j))/d2x ! dh/dx
b2=(hdev(i,jp)-hdev(i,jm))/d2y ! dh/dy
b3=(pdev(ip,j)-pdev(im,j))/d2x ! dp/dx
b4=(pdev(i,jp)-pdev(i,jm))/d2y ! dp/dy
b5=(pdev(ip,j)+pdev(im,j)-2.d0pdev(i,j))/dx2 ! dp2/dx2
b6=(pdev(i,jp)+pdev(i,jm)-2.d0pdev(i,j))/dy2 ! dp2/dy2
a1=(pdev(i,j)h3)/(dx2)+(2.0d0h3b3+3.0d0h2pdev(i,j)b1)/d2x-(6.0d0hdev(i,j))/d2x
a2=(pdev(i,j)h3)/(dx2)-(2.0d0h3b3+3.0d0h2pdev(i,j)b1)/d2x+(6.0d0hdev(i,j))/d2x
a3=(pdev(i,j)h3)/(dy2)+(2.0d0h3b4+3.0d0h2pdev(i,j)b2)/d2y
a4=(pdev(i,j)h3)/(dy2)-(2.0d0h3b4+3.0d0h2pdev(i,j)b2)/d2y
a5=(-2.0d0pdev(i,j)h3)/dx2+h3b5+3.0d0h2b1b3-(2.0d0pdev(i,j)h3)/dy2+h3b6+3.0d0h2b4b2-6.0d0b1
a6=pdev(i,j)h3(b5+b6)+h3(b32+b42)+3.0d0h2pdev(i,j)(b1b3+b2b4)-6.0d0hdev(i,j)b3-6.0d0pdev(i,j)b1
delta=-(a1pddev(ip,j)+a2pddev(im,j)+a3pddev(i,jp)+a4pddev(i,jm)+a6)/a5-pddev(i,j)
pddev(i,j)=pddev(i,j)+omegadelta
end subroutine sor_kernel
!-------------------------- Red block --------------------------------
attributes(global) subroutine red_kernel(nx,ny,pdev,hdev,dx,dy,omega,pddev)
implicit none
real(kind=8),shared :: pdev(0:nx,0:ny), hdev(0:nx,0:ny), pddev(0:nx,0:ny)
real(kind=8), value:: dx,dy,omega
integer, value :: nx, ny
integer :: i, j
i = (blockidx%x-1) * 16 + threadidx%x
j = (blockidx%y-1) * 16 + threadidx%y
if (mod((i+j),2)==0) then
call sor_kernel(i,j,nx,ny,pdev,hdev,dx,dy,omega,pddev)
end if
end subroutine red_kernel
!-------------------------- Black block -------------------------------
attributes(global) subroutine black_kernel(nx,ny,pdev,hdev,dx,dy,omega,pddev)
implicit none
real(kind=8),shared :: pdev(0:nx,0:ny), hdev(0:nx,0:ny), pddev(0:nx,0:ny)
real(kind=8), value:: dx,dy,omega
integer, value :: nx, ny
integer :: i, j
i = (blockidx%x-1) * 16 + threadidx%x
j = (blockidx%y-1) * 16 + threadidx%y
if (mod((i+j),2)==1) then
call sor_kernel(i,j,nx,ny,pdev,hdev,dx,dy,omega,pddev)
end if
end subroutine black_kernel
!-------------------------- Sor GPU ------------------------------------
subroutine sor2(p,eps)
use parameters
implicit none
real(kind=8), device, allocatable, dimension(:,:) :: pdev,hdev,pddev
type(dim3) :: dimGrid, dimBlock
integer :: i,j,r,iter,t3,t4,tr
real(kind=8), intent(in)::eps
real(kind=8), intent(inout)::p(0:nx,0:ny)
real(kind=8)::de(0:nx,0:ny),de_old(0:nx,0:ny),p_old(0:nx,0:ny),h(0:nx,0:ny)
real(kind=8)::h0,rjac,theta,tim
integer::iter1,iter2,nt,thread,i1,j1,k
dx=(diampi/real(nx))
dy=(b/real(ny))
rjac=(cos(pi/nx)+(dx/dy)**2cos(pi/ny))/(1.0d0+(dx/dy)2)
!-------------- film thickness -------------------------------
do i=0,nx ; theta=360.0/real(nx)real(i)
do j=0,ny
h(i,j)=c(1.0d0+epscos(pi/180.0d0theta))
end do
end do
!---------------------------------------------------------------
h0=minval(h)
!-----------normalization-----------
dx=dx/b
dy=dy/b
h=h/h0
p=p(h02)/(visbu) ! ambient pressure, kg/m*2
!-------------------------------------
iter_in=0
iter2=0
p_norm=1.0d0
iter=0
allocate( pdev(0:nx,0:ny),hdev(0:nx,0:ny),pddev(0:nx,0:ny))
hdev=h
pdev=p
dimGrid = dim3( nx/16, ny/16, 1 )
dimBlock = dim3( 16, 16, 1 )
!------------------------ GPU --------------------------------------
call system_clock(t3,tr)
do while(p_norm > err2) ! outer-loop stopping criteria (pressure)
iter2=iter2+1
de=0.0d0
pddev=0.0d0
norm=1.0d0
p_old=p
pdev=p
iter1=0
do while (norm> err1 .and. iter1<5000) ! inner-loop stopping criteria (delta)
de_old=de
call red_kernel<<<dimGrid,dimBlock>>>(nx,ny,pdev,hdev,dx,dy,omega,pddev)
r = cudathreadsynchronize()
call black_kernel<<<dimGrid,dimBlock>>>(nx,ny,pdev,hdev,dx,dy,omega,pddev)
r = cudathreadsynchronize()
de=pddev
iter1=iter1+1
norm=sqrt(dxdysum((de-de_old)*2))
end do
iter_in=iter_in+iter1
p=p+de
p(0,:)=p(nx,:)
p_norm=sqrt(dxdysum((p-p_old)2))
end do
r = cudathreadsynchronize()
!---------------------- End GPU ------------------------------------
p=p(visbu)/h02
dx=dxb
dy=dyb
deallocate( pdev, hdev,pddev)
end subroutine sor2
end module sor_rb_mod
!------------------------------------------------------------
module parameters
real(kind=8):: a1,a2,a3,a4,a5,a6,b1,b2,b3,b4,b5,b6
real(kind=8):: rpm,u,vis,diam,b,pi,c,err1,err2,p_norm,norm,omega,delta
real(kind=8):: h2,h3,dx,d2x,dy,d2y,dx2,dy2,tt1,tt2
real(kind=8):: pa
integer::nx,ny,ip,im,jp,jm,np,iter_in
end module parameters
!----------------Main Program-------------
program airbrg
use parameters
use sor_rb_mod
use cudafor
implicit none
integer::i,j,iter,t1,t2,tr
real(kind=8)::load,phi,eps,brg_num,pave,cpu1,cpu2,air_brg_num
real(kind=8),allocatable::p(:,:)
integer:: istat, num, numdevices,devnum
type(cudadeviceprop) :: prop
write (,) ‘--------------------- GPU Information ------------------------’
istat = cudaGetDeviceCount(numdevices)
istat = cudasetDevice(devnum)
do num = 0, numdevices-1
istat = cudaGetDeviceProperties(prop, num)
call printDeviceProperties(prop, num)
end do
pi=4.0d0atan(1.0d0)
write(,) ‘enter nx (e.g. 512)’
read(,) nx
write(,) ‘enter omega (e.g. 1.95)’
read(,) omega
ny=nint(nx/pi)
if (mod(ny,2)/=0) then; ny=ny+1; end if
vis=179.0d-7 ! air viscosity (ns/m^2)
c=40.0d-6 ! clearance (m)
diam=50.0d-3 ! bearing diameter
b=50.0d-3 ! bearing width (m)
err1=1.0d-5 ! allowable errors (Inner iteration)
err2=1.0d-4 ! allowable errors (Outer iteration)
pa=101347.0d0 ! N/m^2
rpm=3.0d4 ! rpm
u=rpmdiam*pi/60.d0 ! bearing velocity
!omega=1.95
allocate(p(0:nx,0:ny))
p=pa
eps=0.8d0
write(,)‘-----------------CPU Computing----------------’
call cpu_time(cpu1); call system_clock(t1,tr)
call sor(p,eps)
call cpu_time(cpu2); call system_clock(t2,tr)
tt1=cpu2-cpu1
call load_brg(p,load,phi)
pave=load/(diamb)
air_brg_num=(diam/2.0d0/c)**2epsvisrpm/pa
write(,)‘air_brg_num=’, air_brg_num
p=p/pa
!write(,)‘brg_number=’,brg_num
write(,)‘Pmax=’,maxval(p)
write(,)‘iter_sor=’,iter_in
write(,)‘CPU_time=’,cpu2-cpu1
open(1,file=‘p_cpu.dat’)
do i=0,nx
write(1,‘(1000(1x,f10.4))’) (p(i,j), j=0,ny)
end do
close(1)
!---------------------GPU-----------------------------------
write(,)‘-----------------GPU Computing----------------’
p=pa
call cpu_time(cpu1); call system_clock(t1,tr)
call sor2(p,eps)
call cpu_time(cpu2); call system_clock(t2,tr)
tt2=cpu2-cpu1
call load_brg(p,load,phi)
pave=load/(diamb)
air_brg_num=(diam/2.0d0/c)**2epsvisrpm/pa
write(,)‘air_brg_num=’, air_brg_num
p=p/pa
write(,)‘Pmax=’,maxval(p)
write(,)‘iter_sor=’,iter_in
write(,)‘CPU_time=’,cpu2-cpu1
write(,)‘Speedup=’,tt1/tt2
open(1,file=‘p_gpu.dat’)
do i=0,nx
write(1,‘(1000(1x,f10.4))’) (p(i,j), j=0,ny)
end do
close(1)
stop;end program airbrg
!-------------------------------------
subroutine sor(p,eps)
use parameters
implicit none
real(kind=8), intent(in)::eps
real(kind=8), intent(inout)::p(0:nx,0:ny)
real(kind=8)::de(0:nx,0:ny),de_old(0:nx,0:ny),p_old(0:nx,0:ny),h(0:nx,0:ny)
real(kind=8)::h0,rjac,theta
integer::i,j,iter1,iter2,nt,thread,i1,j1,k
dx=(diampi/real(nx))
dy=(b/real(ny))
rjac=(cos(pi/nx)+(dx/dy)**2cos(pi/ny))/(1.0d0+(dx/dy)2)
!-------------- film thickness -------------------------------
do i=0,nx ; theta=360.0/real(nx)real(i)
do j=0,ny
h(i,j)=c(1.0d0+epscos(pi/180.0d0theta))
end do
end do
!---------------------------------------------------------------
h0=minval(h)
!-----------normalization-----------
dx=dx/b
dy=dy/b
h=h/h0
p=p(h02)/(visbu) ! ambient pressure, kg/m2
dx2=dx2; dy2=dy*2; d2x=2.0d0dx; d2y=2.0d0dy
!-------------------------------------
iter_in=0
iter2=0
p_norm=1.0d0
do while(p_norm > err2) ! outer-loop stopping criteria (pressure)
iter2=iter2+1
de=0.0d0
norm=1.0d0
p_old=p
iter1=0
do while (norm> err1 .and. iter1<5000) ! inner-loop stopping criteria (delta)
de_old=de
do i1=1,2
do j1=1,2
do i=i1,nx,2; ip=i+1; im=i-1
if (i==nx) then; ip=1; end if
do j=j1,ny-1,2; jp=j+1; jm=j-1
h2=h(i,j)2; h3=h(i,j)3
b1=(h(ip,j)-h(im,j))/d2x ! dh/dx
b2=(h(i,jp)-h(i,jm))/d2y ! dh/dy
b3=(p(ip,j)-p(im,j))/d2x ! dp/dx
b4=(p(i,jp)-p(i,jm))/d2y ! dp/dy
b5=(p(ip,j)+p(im,j)-2.d0p(i,j))/dx2 ! dp2/dx2
b6=(p(i,jp)+p(i,jm)-2.d0p(i,j))/dy2 ! dp2/dy2
a1=(p(i,j)h3)/(dx2)+(2.0d0h3b3+3.0d0h2p(i,j)b1)/d2x-(6.0d0h(i,j))/d2x
a2=(p(i,j)h3)/(dx2)-(2.0d0h3b3+3.0d0h2p(i,j)b1)/d2x+(6.0d0h(i,j))/d2x
a3=(p(i,j)h3)/(dy2)+(2.0d0h3b4+3.0d0h2p(i,j)b2)/d2y
a4=(p(i,j)h3)/(dy2)-(2.0d0h3b4+3.0d0h2p(i,j)b2)/d2y
a5=(-2.0d0p(i,j)h3)/dx2+h3b5+3.0d0h2b1b3-(2.0d0p(i,j)h3)/dy2+h3b6+3.0d0h2b4b2-6.0d0b1
a6=p(i,j)h3(b5+b6)+h3(b32+b42)+3.0d0h2p(i,j)(b1b3+b2b4)-6.0d0h(i,j)b3-6.0d0p(i,j)b1
delta=-(a1de(ip,j)+a2de(im,j)+a3de(i,jp)+a4de(i,jm)+a6)/a5-de(i,j)
de(i,j)=de(i,j)+omegadelta
end do
end do
end do
end do
iter1=iter1+1
norm=sqrt(dxdysum((de-de_old)**2))
end do
iter_in=iter_in+iter1
p=p+de
p(0,:)=p(nx,:)
p_norm=sqrt(dxdysum((p-p_old)**2))
end do
p=p*(visbu)/h0**2
dx=dxb
dy=dyb
return;end subroutine sor
!---------------------------
!-------------------------------------
subroutine sor1(p,eps)
use parameters
implicit none
real(kind=8), intent(in)::eps
real(kind=8), intent(inout)::p(0:nx,0:ny)
real(kind=8)::de(0:nx,0:ny),de_old(0:nx,0:ny),p_old(0:nx,0:ny),h(0:nx,0:ny)
real(kind=8)::h0,rjac,theta
integer::i,j,iter1,iter2,nt,thread,i1,j1,k
dx=(diampi/real(nx))
dy=(b/real(ny))
rjac=(cos(pi/nx)+(dx/dy)**2cos(pi/ny))/(1.0d0+(dx/dy)2)
!-------------- film thickness -------------------------------
do i=0,nx ; theta=360.0/real(nx)real(i)
do j=0,ny
h(i,j)=c(1.0d0+epscos(pi/180.0d0theta))
end do
end do
!---------------------------------------------------------------
h0=minval(h)
!-----------normalization-----------
dx=dx/b
dy=dy/b
h=h/h0
p=p(h02)/(visbu) ! ambient pressure, kg/m2
dx2=dx2; dy2=dy*2; d2x=2.0d0dx; d2y=2.0d0dy
!-------------------------------------
iter_in=0
iter2=0
p_norm=1.0d0
do while(p_norm > err2) ! outer-loop stopping criteria (pressure)
iter2=iter2+1
de=0.0d0
norm=1.0d0
p_old=p
iter1=0
do while (norm> err1 .and. iter1<5000) ! inner-loop stopping criteria (delta)
de_old=de
do i=1,nx-1; ip=i+1; im=i-1
if (i==nx) then; ip=1; end if
do j=1,ny-1; jp=j+1; jm=j-1
if (mod((i+j),2)==0) then
h2=h(i,j)2; h3=h(i,j)3
b1=(h(ip,j)-h(im,j))/d2x ! dh/dx
b2=(h(i,jp)-h(i,jm))/d2y ! dh/dy
b3=(p(ip,j)-p(im,j))/d2x ! dp/dx
b4=(p(i,jp)-p(i,jm))/d2y ! dp/dy
b5=(p(ip,j)+p(im,j)-2.d0p(i,j))/dx2 ! dp2/dx2
b6=(p(i,jp)+p(i,jm)-2.d0p(i,j))/dy2 ! dp2/dy2
a1=(p(i,j)h3)/(dx2)+(2.0d0h3b3+3.0d0h2p(i,j)b1)/d2x-(6.0d0h(i,j))/d2x
a2=(p(i,j)h3)/(dx2)-(2.0d0h3b3+3.0d0h2p(i,j)b1)/d2x+(6.0d0h(i,j))/d2x
a3=(p(i,j)h3)/(dy2)+(2.0d0h3b4+3.0d0h2p(i,j)b2)/d2y
a4=(p(i,j)h3)/(dy2)-(2.0d0h3b4+3.0d0h2p(i,j)b2)/d2y
a5=(-2.0d0p(i,j)h3)/dx2+h3b5+3.0d0h2b1b3-(2.0d0p(i,j)h3)/dy2+h3b6+3.0d0h2b4b2-6.0d0b1
a6=p(i,j)h3(b5+b6)+h3(b32+b42)+3.0d0h2p(i,j)(b1b3+b2b4)-6.0d0h(i,j)b3-6.0d0p(i,j)b1
delta=-(a1de(ip,j)+a2de(im,j)+a3de(i,jp)+a4de(i,jm)+a6)/a5-de(i,j)
de(i,j)=de(i,j)+omegadelta
end if
end do
end do
do i=1,nx-1; ip=i+1; im=i-1
if (i==nx) then; ip=1; end if
do j=1,ny-1; jp=j+1; jm=j-1
if (mod((i+j),2)==1) then
h2=h(i,j)2; h3=h(i,j)3
b1=(h(ip,j)-h(im,j))/d2x ! dh/dx
b2=(h(i,jp)-h(i,jm))/d2y ! dh/dy
b3=(p(ip,j)-p(im,j))/d2x ! dp/dx
b4=(p(i,jp)-p(i,jm))/d2y ! dp/dy
b5=(p(ip,j)+p(im,j)-2.d0p(i,j))/dx2 ! dp2/dx2
b6=(p(i,jp)+p(i,jm)-2.d0p(i,j))/dy2 ! dp2/dy2
a1=(p(i,j)h3)/(dx2)+(2.0d0h3b3+3.0d0h2p(i,j)b1)/d2x-(6.0d0h(i,j))/d2x
a2=(p(i,j)h3)/(dx2)-(2.0d0h3b3+3.0d0h2p(i,j)b1)/d2x+(6.0d0h(i,j))/d2x
a3=(p(i,j)h3)/(dy2)+(2.0d0h3b4+3.0d0h2p(i,j)b2)/d2y
a4=(p(i,j)h3)/(dy2)-(2.0d0h3b4+3.0d0h2p(i,j)b2)/d2y
a5=(-2.0d0p(i,j)h3)/dx2+h3b5+3.0d0h2b1b3-(2.0d0p(i,j)h3)/dy2+h3b6+3.0d0h2b4b2-6.0d0b1
a6=p(i,j)h3(b5+b6)+h3(b32+b42)+3.0d0h2p(i,j)(b1b3+b2b4)-6.0d0h(i,j)b3-6.0d0p(i,j)b1
delta=-(a1de(ip,j)+a2de(im,j)+a3de(i,jp)+a4de(i,jm)+a6)/a5-de(i,j)
de(i,j)=de(i,j)+omegadelta
end if
end do
end do
iter1=iter1+1
norm=sqrt(dxdysum((de-de_old)**2))
end do
iter_in=iter_in+iter1
p=p+de
p(0,:)=p(nx,:)
p_norm=sqrt(dxdysum((p-p_old)**2))
end do
p=p*(visbu)/h0**2
dx=dxb
dy=dyb
return;end subroutine sor1
!---------------------------
subroutine load_brg(p,load,phi) ! get the horizontal/vertical forces (fx,fy)
use parameters
implicit none
real(kind=8), intent(in):: p(0:nx,0:ny)
real(kind=8), intent(out):: load,phi
real(kind=8):: theta,px(0:nx,0:ny),py(0:nx,0:ny),fx,fy
integer:: i,j
do j=0,ny
do i=0,nx
theta=real(i)*360.0d0/real(nx)
px(i,j)=p(i,j)sin(pi/180.0d0theta)
py(i,j)=-p(i,j)cos(pi/180.0d0theta)
end do
end do
call simps(px,fx)
call simps(py,fy)
load=sqrt(fx2+fy2)
phi=atan(fx/fy)*180.0d0/pi
return; end subroutine load_brg
!----------------------
subroutine simps(pn,fn) ! Simpson’s integration formula
use parameters
implicit none
real(kind=8), intent(in):: pn(0:nx,0:ny)
real(kind=8), intent(out):: fn
real(kind=8):: f1,f2,s1(0:ny),sx,sy
integer:: i,j
sx=dx/3.0d0
sy=dy/3.0d0
do j=0,ny
f1=0.0d0;f2=0.0d0
do i=1,nx-1,2; f1=f1+pn(i,j); end do
do i=2,nx-1,2; f2=f2+pn(i,j); end do
s1(j)=sx*(pn(0,j)+pn(nx,j)+4.0d0f1+2.0d0f2)
end do
f1=0.0d0
f2=0.0d0
do j=1,ny-1,2; f1=f1+s1(j); end do
do j=2,ny-1,2; f2=f2+s1(j); end do
fn=sy*(s1(0)+s1(ny)+4.0d0f1+2.0d0f2)
return; end subroutine simps
!-----------------------------------------------
subroutine printDeviceProperties(prop, num)
use cudafor
integer:: np
type(cudadeviceprop) :: prop
integer num
ilen = verify(prop%name, ’ ', .true.)
write (,900) "Device Number: " ,num
write (,901) "Device Name: " ,prop%name(1:ilen)
write (*,904) "Compute Capability Revision: ",prop%major,prop%minor
write (,900) "multiProcessorCount: ",prop%multiProcessorCount
if (prop%major<2) then
write (,905) "Total Cores: ",prop%multiProcessorCount, 8
else
write (,905) "Total Cores: ",prop%multiProcessorCount, 32
end if
write (,902) “Total Global Memory: “,real(prop%totalGlobalMem)/1e9,” Gbytes”
write (,900) "maxThreadsPerBlock: " ,prop%maxThreadsPerBlock
write (,903) “maxThreadsDim: " ,prop%maxThreadsDim
write (,903) "maxGridSize: " ,prop%maxGridSize
write (,902) “ClockRate: " ,real(prop%clockRate)/1e6,” GHz”
900 format (a,i0)
901 format (a,a)
902 format (a,f5.3,a)
903 format (a,2(i0,1x,‘x’,1x),i0)
904 format (a,i0,‘.’,i0)
905 format (a,i0,‘x’,i0)
return
end
I compiled it by using “pgfortran -ta=nvidia -Minfo=accel -fast hsbgpu1.cuf”. However, an error was shown as following:
“pgfortran7fheBbv25QgJ8M.obj : error LNK2019: unresolved external symbol SOR_RB
MOD_SOR_KERNEL@0 referenced in function _SOR_RB_MOD_RED_KERNEL@44
abrg_gpudp1.exe : fatal error LNK1120: 1 unresolved externals”
How can I solve this problem? Thanks a lot.