Hello Everybody,
I am trying to accelerate a simple fortran code (see below) and I get problems in execution. Here is the code:
IMPLICIT NONE
INTEGER, PARAMETER :: wp=KIND(1.0d0)
integer:: plpnl=50
integer:: pltnl=50
integer:: ipenl=1100
integer:: nwi=64
integer:: ipbnl=1
integer:: plnl=2300
integer:: ii,jj
real(wp):: rand
integer :: ip,ip1
real(wp),dimension(:,:),allocatable :: integrand_prep_J
real(wp),dimension(:,:),allocatable :: real_integrand_prep_J
real(wp),dimension(:,:),allocatable :: imag_integrand_prep_J
real(wp),dimension(:,:,:),allocatable :: integrand_prep_J_ip
real(wp),dimension(:,:,:),allocatable :: real_integrand_prep_J_ip
real(wp),dimension(:,:,:),allocatable :: imag_integrand_prep_J_ip
REAL(wp) :: Coef
integer :: i_theC,i_phiC
REAL(wp),DIMENSION(3,3) :: matrice,matrice1
REAL(wp),DIMENSION(:), allocatable :: wi
REAL(wp),DIMENSION(:,:), allocatable :: rprime_iwi1
REAL(wp),DIMENSION(:,:,:), allocatable :: C_grid_rhat
REAL(wp),DIMENSION(:,:,:,:), allocatable :: Mat1ij
REAL(wp),DIMENSION(:,:,:,:), allocatable :: Mat2ij
real(wp),DIMENSION(3) :: integrand_J
INTEGER :: i_wi
real(wp) :: integrand_th,integrand_ph
real(wp),DIMENSION(3) :: integrand_prep_J_iwi
real(wp) :: E_theC,E_phiC,ctmp(3)
real(wp) :: rE_theC,rE_phiC
real(wp) :: deph_transl
REAL(wp) :: scalC
real(wp) :: rtmp(3),itmp(3),Ru1,Ru2,Ru3,Iu1,Iu2,Iu3
real(wp) :: rtmp1(3),itmp1(3)
real(wp),dimension(:,:,:),allocatable :: Uc
ALLOCATE(wi(nwi))
ALLOCATE(rprime_iwi1(3,nwi))
ALLOCATE(integrand_prep_J(3,nwi))
ALLOCATE(real_integrand_prep_J(3,nwi))
ALLOCATE(imag_integrand_prep_J(3,nwi))
ALLOCATE(integrand_prep_J_ip(3,nwi,ipbnl:ipeNL))
ALLOCATE(real_integrand_prep_J_ip(3,nwi,ipbnl:ipeNL))
ALLOCATE(imag_integrand_prep_J_ip(3,nwi,ipbnl:ipeNL))
ALLOCATE(Uc(PLpNL,PLtNL,ipbnl:ipeNL))
ALLOCATE(C_grid_rhat(3,PLpNL,PLtNL))
ALLOCATE(Mat1ij(PLpNL,PLtNL,3,3))
ALLOCATE(Mat2ij(PLpNL,PLtNL,3,3))
do ip=ipbnl,ipenl
do i_wi=1,nwi
do ii=1,3
integrand_prep_J(ii,i_wi)=cmplx(rand(),rand(),wp)
integrand_prep_J_ip(ii,i_wi,ip)=integrand_prep_J(ii,i_wi)
real_integrand_prep_J_ip(ii,i_wi,ip)=real(integrand_prep_J_ip(ii,i_wi,ip))
enddo
enddo
enddo
do i_phiC=1,PLpNL
do i_theC=1,PLtNL
do jj=1,3
C_grid_rhat(jj,i_phiC,i_theC)=rand()
do ii=1,3
Mat1ij(i_phiC,i_theC,ii,jj)=rand()
Mat2ij(i_phiC,i_theC,ii,jj)=rand()
enddo
enddo
enddo
enddo
do i_wi=1,nwi
do ii=1,3
rprime_iwi1(ii,i_wi)=-1+rand()
enddo
enddo
!$acc kernels loop private(ip,real_integrand_prep_J,imag_integrand_prep_J)
do 10 ip=ipbnl,ipenl
!$acc do private(i_phic)
do 11 i_phiC=1,PLpNL
!$acc do private(matrice,i_theC,e_thec,e_phic,rtmp1,itmp1)
do 12 i_theC=1,PLtNL
matrice(:,:) = Mat2ij(i_phiC,i_theC,:,:)
re_thec=0._wp
re_phic=0._wp
!$acc do private(rtmp,itmp,i_wi,ru1,iu1,ru2,iu2,ru3,iu3,scalc,integrand_j)
do i_wi=1,nwi
Rtmp(:)=real_integrand_prep_J_ip(:,i_wi,ip)
integrand_J(1)=matrice(1,1)*Rtmp(1) + matrice(1,2)*Rtmp(2) + matrice(1,3)*Rtmp(3)
integrand_J(2)=matrice(2,1)*Rtmp(1) + matrice(2,2)*Rtmp(2) + matrice(2,3)*Rtmp(3)
scalC= C_grid_rhat(1,i_phiC,i_theC)*rprime_iwi1(1,i_wi)
scalC=scalC+C_grid_rhat(2,i_phiC,i_theC)*rprime_iwi1(2,i_wi)
scalC=scalC+C_grid_rhat(3,i_phiC,i_theC)*rprime_iwi1(3,i_wi)
integrand_th = integrand_J(1)*cos(scalc)
integrand_ph = integrand_J(2)*cos(scalc)
rE_theC = rE_theC + integrand_th*wi(i_wi)
rE_phiC = rE_phiC + integrand_ph*wi(i_wi)
end do
matrice = Mat1ij(i_phiC,i_theC,:,:)
rtmp1(1)=re_thec
rtmp1(2)=re_phic
Uc(i_phiC,i_theC,ip)=matrice(1,1)*re_thec + matrice(1,2)*re_phic
12 continue
11 continue
10 continue
END
I compile it with:
pgf90 -acc -Minfo=accel simple.f90
and when I run:
./a.out
call to cuMemAlloc returned error 700: Illegal address during kernel execution
It seems that the most inner loop causes the problems, but I don’t know how to fix it. Could someone guide me how to do it?
Many thanks,
Barak