Accelerating a Fortran code

Hello,
I have the following Fortran 90 code that I would like to accelerate using pgfortran:


IMPLICIT NONE

INTEGER, PARAMETER :: wp=KIND(1.0D0)
integer:: Npoints=1000
integer:: plpnl=50
integer:: pltnl=50
integer:: ipenl=1000
integer:: nwi=64
integer:: ipbnl=340
integer:: plnl=2300
integer:: ii,jj
real(wp):: rand


integer :: ip,ip1
integer :: io
COMPLEX(wp),dimension(:,:),allocatable :: integrand_prep_J
real(wp),dimension(:,:),allocatable :: real_integrand_prep_J
real(wp),dimension(:,:),allocatable :: imag_integrand_prep_J
COMPLEX(wp),dimension(:,:,:),allocatable :: integrand_prep_J_ip
REAL(wp) :: Coef
integer :: i_theC,i_phiC
REAL(wp),DIMENSION(3,3) :: matrice
REAL(wp),DIMENSION(:), allocatable :: wi
REAL(wp),DIMENSION(:,:), allocatable :: rprime_iwi1
REAL(wp),DIMENSION(:,:,:), allocatable :: C_grid_rhat
REAL(wp),DIMENSION(:,:,:,:), allocatable :: M12_surf_sph_obs
REAL(wp),DIMENSION(:,:,:,:), allocatable :: M12_sph_obs_obs

INTEGER :: i_wi
COMPLEX(wp) :: integrand_th,integrand_ph
COMPLEX(wp),DIMENSION(3) :: integrand_prep_J_iwi
complex(wp) :: E_theC,E_phiC,ctmp(3)
COMPLEX(wp),DIMENSION(3) :: integrand_J
COMPLEX(wp) :: deph_transl
REAL(wp) :: scalC
real(wp) :: rtmp(3),itmp(3),Ru1,Ru2,Ru3,Iu1,Iu2,Iu3
COMPLEX(wp),dimension(:,:,:,:),allocatable :: Uc

ALLOCATE(wi(nwi))
ALLOCATE(rprime_iwi1(3,nwi))
ALLOCATE(integrand_prep_J(3,nwi))
ALLOCATE(real_integrand_prep_J(3,nwi))
ALLOCATE(imag_integrand_prep_J(3,nwi))
ALLOCATE(integrand_prep_J_ip(3,nwi,ipeNL))
ALLOCATE(Uc(3,ipeNL,PLpNL,PLtNL))
ALLOCATE(M12_surf_sph_obs(PLpNL,PLtNL,3,3))
ALLOCATE(C_grid_rhat(3,PLpNL,PLtNL))
ALLOCATE(M12_sph_obs_obs(PLpNL,PLtNL,3,3))

do ip=1,Npoints
do i_wi=1,nwi
do ii=1,3
integrand_prep_J(ii,i_wi)=cmplx(rand(),rand(),wp)
integrand_prep_J_ip(ii,i_wi,ip)=integrand_prep_J(ii,i_wi)
enddo
enddo
enddo
do i_phiC=1,PLpNL
do i_theC=1,PLtNL
do jj=1,3
C_grid_rhat(jj,i_phiC,i_theC)=rand()
do ii=1,3
M12_surf_sph_obs(i_phiC,i_theC,ii,jj)=rand()
enddo
enddo
enddo
enddo

do i_wi=1,nwi
do ii=1,3
rprime_iwi1(ii,i_wi)=-1+rand()
enddo
enddo

!$acc region
do 10 ip=1,Npoints
io=ip
integrand_prep_J(:,:)=integrand_prep_J_ip(:,:,ip)
real_integrand_prep_J=real(integrand_prep_J)
imag_integrand_prep_J=imag(integrand_prep_J)


do 11 i_phiC=1,PLpNL
do 12 i_theC=1,PLtNL

matrice = M12_surf_sph_obs(i_phiC,i_theC,:,:)
E_theC = (0._wp,0._wp)
E_phiC = (0._wp,0._wp)

do i_wi=1,nwi
Rtmp(:)=real_integrand_prep_J(:,i_wi)
Itmp(:)=imag_integrand_prep_J(:,i_wi)
Ru1=matrice(1,1)*Rtmp(1) + matrice(1,2)*Rtmp(2) + matrice(1,3)*Rtmp(3)
Iu1=matrice(1,1)*Itmp(1) + matrice(1,2)*Itmp(2) + matrice(1,3)*Itmp(3)
Ru2=matrice(2,1)*Rtmp(1) + matrice(2,2)*Rtmp(2) + matrice(2,3)*Rtmp(3)
Iu2=matrice(2,1)*Itmp(1) + matrice(2,2)*Itmp(2) + matrice(2,3)*Itmp(3)
Ru3=matrice(3,1)*Rtmp(1) + matrice(3,2)*Rtmp(2) + matrice(3,3)*Rtmp(3)
Iu3=matrice(3,1)*Itmp(1) + matrice(3,2)*Itmp(2) + matrice(3,3)*Itmp(3)
integrand_J(1) = cmplx(Ru1,Iu1,wp)
integrand_J(2) = cmplx(Ru2,Iu2,wp)
integrand_J(3) = cmplx(Ru3,Iu3,wp)

scalC= C_grid_rhat(1,i_phiC,i_theC)*rprime_iwi1(1,i_wi)
scalC=scalC+C_grid_rhat(2,i_phiC,i_theC)*rprime_iwi1(2,i_wi)
scalC=scalC+C_grid_rhat(3,i_phiC,i_theC)*rprime_iwi1(3,i_wi)

ru3=cos(scalC)
iu3=sin(scalC)
integrand_th=cmplx(ru1*ru3-iu1*iu3,ru3*iu1+ru1*iu3,wp)
integrand_ph=cmplx(ru2*ru3-iu2*iu3,ru3*iu2+ru2*iu3,wp)

E_theC = E_theC + integrand_th*wi(i_wi)
E_phiC = E_phiC + integrand_ph*wi(i_wi)
end do
matrice = M12_sph_obs_obs(i_phiC,i_theC,:,:)
Rtmp(1)=Real(E_theC)
Itmp(1)=Imag(E_theC)
Rtmp(2)=Real(E_phiC)
Itmp(2)=Imag(E_phiC)

Ru1=matrice(1,1)*Rtmp(1) + matrice(1,2)*Rtmp(2)
Iu1=matrice(1,1)*Itmp(1) + matrice(1,2)*Itmp(2)
Ru2=matrice(2,1)*Rtmp(1) + matrice(2,2)*Rtmp(2)
Iu2=matrice(2,1)*Itmp(1) + matrice(2,2)*Itmp(2)
Ru3=matrice(3,1)*Rtmp(1) + matrice(3,2)*Rtmp(2)
Iu3=matrice(3,1)*Itmp(1) + matrice(3,2)*Itmp(2)

Uc(1,io,i_phiC,i_theC) = cmplx(Ru1,Iu1,wp)
Uc(2,io,i_phiC,i_theC) = cmplx(Ru2,Iu2,wp)
Uc(3,io,i_phiC,i_theC) = cmplx(Ru3,Iu3,wp)

12 continue
11 continue
10 continue
!$acc end region

END


I also have runtime problems when it is compiled with:
pgfortran -ta=nvidia -Minfo=accel -fast code.f90

Can anyone help?
With thanks,
Barak Galanti

Hi Barak,

Looking at the compiler feedback messages (-Minfo=accel), you'll see that the main issue is that several arrays need to be privatized; arrays are shared by default.
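For example, here is a minimal sketch of one way to do that with OpenACC-style directives. Note that I'm using parallel loop here rather than the region directive from your code, and the gang/collapse split and the exact private lists are assumptions you would want to check against the -Minfo feedback:

!$acc parallel loop gang private(integrand_prep_J, real_integrand_prep_J, imag_integrand_prep_J)
do ip=1,Npoints
   ! stage the per-ip slice into the (now private) work arrays, as in your code
   integrand_prep_J(:,:) = integrand_prep_J_ip(:,:,ip)
   real_integrand_prep_J = real(integrand_prep_J)
   imag_integrand_prep_J = aimag(integrand_prep_J)
   !$acc loop collapse(2) private(matrice, rtmp, itmp, integrand_J)
   do i_phiC=1,PLpNL
      do i_theC=1,PLtNL
         ! ... same (i_phiC,i_theC) body as in your code; keep the i_wi loop sequential ...
      enddo
   enddo
enddo

With this, matrice, rtmp, itmp and integrand_J get a copy per (i_phiC,i_theC) iteration and the three work arrays get a copy per ip iteration, so the compiler no longer sees them as shared. The read-only arrays (wi, rprime_iwi1, C_grid_rhat, M12_*) and Uc, which is written at disjoint io indices, can stay shared.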

You could put these arrays in a private clause (as sketched above), but if possible, I would try using the "integrand_prep_J_ip" array directly instead of creating the temp arrays. I'm not sure the algorithm allows for it, but that would be my first choice in porting this.
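As a rough illustration of that second option (assuming the staged copies really are just read-only views of integrand_prep_J_ip, which I can't confirm from the snippet), the i_wi loop could read the 3-D array directly, so only the small fixed-size temporaries are left to privatize:

do i_wi=1,nwi
   ! read straight from the 3-D array; no staged integrand_prep_J copies needed
   Rtmp(:) = real (integrand_prep_J_ip(:,i_wi,ip))
   Itmp(:) = aimag(integrand_prep_J_ip(:,i_wi,ip))
   ! ... rest of the i_wi body (matrice products, scalC, E_theC/E_phiC sums) unchanged ...
enddo

That removes the three nwi-sized work arrays from the kernel entirely, which is usually cheaper than privatizing them per gang.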

- Mat
% pgf90 -acc -Minfo=accel test.f90
MAIN:
     76, Generating present_or_copyout(integrand_prep_j(1:3,1:64))
         Generating present_or_copyin(integrand_prep_j_ip(1:3,1:64,1:1000))
         Generating present_or_copyout(real_integrand_prep_j(1:3,1:64))
         Generating present_or_copyout(imag_integrand_prep_j(1:3,1:64))
         Generating present_or_copyin(m12_sph_obs_obs(1:50,1:50,1:3,1:3))
         Generating present_or_copyout(itmp(:))
         Generating present_or_copyout(rtmp(:))
         Generating present_or_copyin(rprime_iwi1(1:3,1:64))
         Generating present_or_copyin(c_grid_rhat(1:3,1:50,1:50))
         Generating present_or_copyin(wi(1:64))
         Generating present_or_copyin(m12_surf_sph_obs(1:50,1:50,1:3,1:3))
         Generating present_or_copyout(matrice(:,:))
         Generating present_or_copyout(uc(1:3,1:1000,1:50,1:50))
         Generating Tesla code
     77, Parallelization would require privatization of array 'matrice(:,:)'
         Parallelization would require privatization of array 'itmp(:)'
         Parallelization would require privatization of array 'rtmp(:)'
         Parallelization would require privatization of array 'imag_integrand_prep_j(1:3,i2+1)'
         Parallelization would require privatization of array 'real_integrand_prep_j(1:3,i2+1)'
         Parallelization would require privatization of array 'integrand_prep_j(1:3,i2+1)'
         Accelerator kernel generated
         79, !$acc loop vector(32) ! threadidx%x
             Interchanging generated strip mine loop outwards
             Interchanging generated vector loop outwards
         80, !$acc loop vector(32) ! threadidx%x
             Interchanging generated strip mine loop outwards
             Interchanging generated vector loop outwards
         87, !$acc loop vector(32) ! threadidx%x
             Interchanging generated strip mine loop outwards
             Interchanging generated vector loop outwards
         92, !$acc loop vector(32) ! threadidx%x
        116, !$acc loop vector(32) ! threadidx%x
             Interchanging generated strip mine loop outwards
             Interchanging generated vector loop outwards
     79, Loop is parallelizable
     80, Loop is parallelizable
     84, Parallelization would require privatization of array 'matrice(:,:)'
         Parallelization would require privatization of array 'itmp(:)'
         Parallelization would require privatization of array 'rtmp(:)'
     85, Complex loop carried dependence of 'rtmp' prevents parallelization
         Complex loop carried dependence of 'itmp' prevents parallelization
         Parallelization would require privatization of array 'matrice(:,:)'
         Parallelization would require privatization of array 'itmp(:)'
         Parallelization would require privatization of array 'rtmp(:)'
     87, Loop is parallelizable
     91, Scalar last value needed after loop for 'e_thec' at line 117
         Scalar last value needed after loop for 'e_thec' at line 118
         Scalar last value needed after loop for 'e_phic' at line 119
         Scalar last value needed after loop for 'e_phic' at line 120
         Accelerator restriction: scalar variable live-out from loop: e_phic
         Accelerator restriction: scalar variable live-out from loop: e_thec
         Parallelization would require privatization of array 'itmp(:)'
         Parallelization would require privatization of array 'rtmp(:)'
     92, Loop is parallelizable
    116, Loop is parallelizable