runtime problems: call to cuMemAlloc returned error 700

Hello Everybody,
I am trying to accelerate a simple Fortran code (see below), but it fails at run time. Here is the code:

! Test program from the question: fills the input arrays with rand() values,
! then evaluates Uc(i_phiC,i_theC,ip) in a four-deep loop nest placed under an
! OpenACC kernels region.  The poster reports that the executable stops with
! "call to cuMemAlloc returned error 700".
IMPLICIT NONE

INTEGER, PARAMETER :: wp=KIND(1.0d0)
! Loop extents / array bounds used by the ALLOCATE statements and loops below.
integer:: plpnl=50
integer:: pltnl=50
integer:: ipenl=1100
integer:: nwi=64
integer:: ipbnl=1
integer:: plnl=2300
integer:: ii,jj
! rand is declared here only by its result type; its definition is not part
! of this listing.
real(wp):: rand

integer :: ip,ip1
real(wp),dimension(:,:),allocatable :: integrand_prep_J
real(wp),dimension(:,:),allocatable :: real_integrand_prep_J
real(wp),dimension(:,:),allocatable :: imag_integrand_prep_J
real(wp),dimension(:,:,:),allocatable :: integrand_prep_J_ip
real(wp),dimension(:,:,:),allocatable :: real_integrand_prep_J_ip
real(wp),dimension(:,:,:),allocatable :: imag_integrand_prep_J_ip
REAL(wp) :: Coef
integer :: i_theC,i_phiC
REAL(wp),DIMENSION(3,3) :: matrice,matrice1
REAL(wp),DIMENSION(:), allocatable  :: wi
REAL(wp),DIMENSION(:,:), allocatable  :: rprime_iwi1
REAL(wp),DIMENSION(:,:,:), allocatable  :: C_grid_rhat
REAL(wp),DIMENSION(:,:,:,:), allocatable  :: Mat1ij
REAL(wp),DIMENSION(:,:,:,:), allocatable  :: Mat2ij
real(wp),DIMENSION(3) :: integrand_J


INTEGER :: i_wi
real(wp) :: integrand_th,integrand_ph
real(wp),DIMENSION(3) :: integrand_prep_J_iwi
real(wp) :: E_theC,E_phiC,ctmp(3)
real(wp) :: rE_theC,rE_phiC
real(wp) :: deph_transl
REAL(wp) :: scalC
real(wp) :: rtmp(3),itmp(3),Ru1,Ru2,Ru3,Iu1,Iu2,Iu3
real(wp) :: rtmp1(3),itmp1(3)
real(wp),dimension(:,:,:),allocatable :: Uc

! Allocate every work array.
! NOTE(review): wi is allocated here and read in the kernel below, but no
! statement in this listing assigns it a value.
      ALLOCATE(wi(nwi))
      ALLOCATE(rprime_iwi1(3,nwi))
      ALLOCATE(integrand_prep_J(3,nwi))
      ALLOCATE(real_integrand_prep_J(3,nwi))
      ALLOCATE(imag_integrand_prep_J(3,nwi))
      ALLOCATE(integrand_prep_J_ip(3,nwi,ipbnl:ipeNL))
      ALLOCATE(real_integrand_prep_J_ip(3,nwi,ipbnl:ipeNL))
      ALLOCATE(imag_integrand_prep_J_ip(3,nwi,ipbnl:ipeNL))
      ALLOCATE(Uc(PLpNL,PLtNL,ipbnl:ipeNL))
      ALLOCATE(C_grid_rhat(3,PLpNL,PLtNL))
      ALLOCATE(Mat1ij(PLpNL,PLtNL,3,3))
      ALLOCATE(Mat2ij(PLpNL,PLtNL,3,3))

! Fill the integrand arrays.  integrand_prep_J is declared real(wp), so only
! the real part of the cmplx(...) value survives the assignment.
        do ip=ipbnl,ipenl
        do i_wi=1,nwi
        do ii=1,3
        integrand_prep_J(ii,i_wi)=cmplx(rand(),rand(),wp)
        integrand_prep_J_ip(ii,i_wi,ip)=integrand_prep_J(ii,i_wi)
        real_integrand_prep_J_ip(ii,i_wi,ip)=real(integrand_prep_J_ip(ii,i_wi,ip))
        enddo
        enddo
        enddo

! Fill C_grid_rhat and the two 3x3 matrices stored per (i_phiC,i_theC) point.
        do i_phiC=1,PLpNL
        do i_theC=1,PLtNL
        do jj=1,3
        C_grid_rhat(jj,i_phiC,i_theC)=rand()
        do ii=1,3
        Mat1ij(i_phiC,i_theC,ii,jj)=rand()
        Mat2ij(i_phiC,i_theC,ii,jj)=rand()
        enddo
        enddo
        enddo
        enddo

! Fill rprime_iwi1 with values offset by -1.
        do i_wi=1,nwi
        do ii=1,3
        rprime_iwi1(ii,i_wi)=-1+rand()
        enddo
        enddo


! Accelerated region.  The private clauses below name whole arrays
! (real_integrand_prep_J, imag_integrand_prep_J, matrice, rtmp1, itmp1, rtmp,
! itmp, integrand_j).
! NOTE(review): real_integrand_prep_J and imag_integrand_prep_J are listed as
! private but are not referenced anywhere inside the loop nest.
!$acc kernels loop private(ip,real_integrand_prep_J,imag_integrand_prep_J)
  do 10 ip=ipbnl,ipenl
!$acc do private(i_phic)
  do 11 i_phiC=1,PLpNL
! NOTE(review): this list names e_thec/e_phic, while the loop body accumulates
! into re_thec/re_phic, which appear in no private clause.
!$acc do private(matrice,i_theC,e_thec,e_phic,rtmp1,itmp1)
  do 12 i_theC=1,PLtNL

! Local copy of the 3x3 matrix applied inside the i_wi sum.
   matrice(:,:) =  Mat2ij(i_phiC,i_theC,:,:)
        re_thec=0._wp
        re_phic=0._wp

! NOTE(review): ru1..iu3 and itmp are listed as private but unused in the loop.
!$acc do private(rtmp,itmp,i_wi,ru1,iu1,ru2,iu2,ru3,iu3,scalc,integrand_j)
  do i_wi=1,nwi

! Rows 1 and 2 of matrice applied to the 3-vector for this (i_wi,ip).
    Rtmp(:)=real_integrand_prep_J_ip(:,i_wi,ip)
    integrand_J(1)=matrice(1,1)*Rtmp(1) + matrice(1,2)*Rtmp(2) + matrice(1,3)*Rtmp(3)
    integrand_J(2)=matrice(2,1)*Rtmp(1) + matrice(2,2)*Rtmp(2) + matrice(2,3)*Rtmp(3)

! scalC = dot product of C_grid_rhat(:,i_phiC,i_theC) and rprime_iwi1(:,i_wi).
    scalC=      C_grid_rhat(1,i_phiC,i_theC)*rprime_iwi1(1,i_wi)
    scalC=scalC+C_grid_rhat(2,i_phiC,i_theC)*rprime_iwi1(2,i_wi)
    scalC=scalC+C_grid_rhat(3,i_phiC,i_theC)*rprime_iwi1(3,i_wi)

    integrand_th = integrand_J(1)*cos(scalc)
    integrand_ph = integrand_J(2)*cos(scalc)

! Weighted accumulation over i_wi (a sum carried across loop iterations).
    rE_theC = rE_theC + integrand_th*wi(i_wi)
    rE_phiC = rE_phiC + integrand_ph*wi(i_wi)

  end do

! Switch matrice to Mat1ij for the final combination.  rtmp1 is assigned here
! but not read afterwards.
        matrice =  Mat1ij(i_phiC,i_theC,:,:)
        rtmp1(1)=re_thec
        rtmp1(2)=re_phic

    Uc(i_phiC,i_theC,ip)=matrice(1,1)*re_thec + matrice(1,2)*re_phic

  12 continue
  11 continue
  10 continue

END

I compile it with:
pgf90 -acc -Minfo=accel simple.f90
and when I run:
./a.out
call to cuMemAlloc returned error 700: Illegal address during kernel execution

It seems that the innermost loop causes the problem, but I don’t know how to fix it. Could someone guide me on how to do it?
Many thanks,
Barak

Hi Barak,

How about something like the following? Privatizing arrays should be avoided if possible, since every thread will get its own copy. Not only does this waste a lot of memory, but the memory won't be accessed contiguously; the equivalent scalar variables, by contrast, can most likely be kept in registers.


 ! Stand-alone OpenACC test case: fills the inputs with pseudo-random data and
 ! computes Uc(i_phiC,i_theC,ip) as a weighted sum over i_wi.  Only scalars are
 ! used as per-iteration temporaries, so no array has to be privatized.
 program uc_real_kernel
   implicit none

   integer, parameter :: wp = kind(1.0d0)

   ! Loop extents and array bounds.
   integer, parameter :: plpnl = 50      ! upper bound of the i_phiC loop
   integer, parameter :: pltnl = 50      ! upper bound of the i_theC loop
   integer, parameter :: nwi   = 64      ! upper bound of the i_wi loop
   integer, parameter :: ipbnl = 1       ! first value of ip
   integer, parameter :: ipenl = 1100    ! last value of ip

   ! Inputs.
   real(wp), allocatable :: wi(:)
   real(wp), allocatable :: rprime_iwi1(:,:)
   real(wp), allocatable :: real_integrand_prep_J_ip(:,:,:)
   real(wp), allocatable :: C_grid_rhat(:,:,:)
   real(wp), allocatable :: Mat1ij(:,:,:,:)
   real(wp), allocatable :: Mat2ij(:,:,:,:)

   ! Result.
   real(wp), allocatable :: Uc(:,:,:)

   integer  :: ip, i_phiC, i_theC, i_wi
   real(wp) :: rtmp1, rtmp2, rtmp3
   real(wp) :: integrand_J1, integrand_J2
   real(wp) :: integrand_th, integrand_ph
   real(wp) :: rE_theC, rE_phiC
   real(wp) :: scalC, cos_scalC

   allocate(wi(nwi))
   allocate(rprime_iwi1(3,nwi))
   allocate(real_integrand_prep_J_ip(3,nwi,ipbnl:ipenl))
   allocate(C_grid_rhat(3,plpnl,pltnl))
   allocate(Mat1ij(plpnl,pltnl,3,3))
   allocate(Mat2ij(plpnl,pltnl,3,3))
   allocate(Uc(plpnl,pltnl,ipbnl:ipenl))

   ! Test data.  The standard intrinsic random_number is used rather than a
   ! bind(C) interface to rand(): C's rand() returns an int, so it cannot be
   ! declared as a function returning real(c_double).
   call random_number(real_integrand_prep_J_ip)
   call random_number(C_grid_rhat)
   call random_number(Mat1ij)
   call random_number(Mat2ij)

   ! rprime_iwi1 holds values shifted by -1.
   call random_number(rprime_iwi1)
   rprime_iwi1 = rprime_iwi1 - 1.0_wp

   ! The weights must be defined before the kernel reads them; they are
   ! filled with placeholder test data like the other inputs.
   call random_number(wi)

   ! The three outer loops are collapsed and spread over gangs; the i_wi sum
   ! runs across the vector lanes.
 !$acc kernels loop collapse(3) gang
   do ip = ipbnl, ipenl
     do i_phiC = 1, plpnl
       do i_theC = 1, pltnl

         rE_theC = 0.0_wp
         rE_phiC = 0.0_wp

         ! Both accumulators are sums over i_wi, stated explicitly as
         ! reductions instead of relying on compiler detection.
 !$acc loop vector reduction(+:rE_theC,rE_phiC)
         do i_wi = 1, nwi

           rtmp1 = real_integrand_prep_J_ip(1,i_wi,ip)
           rtmp2 = real_integrand_prep_J_ip(2,i_wi,ip)
           rtmp3 = real_integrand_prep_J_ip(3,i_wi,ip)

           ! Rows 1 and 2 of Mat2ij(i_phiC,i_theC,:,:) times (rtmp1,rtmp2,rtmp3).
           integrand_J1 = Mat2ij(i_phiC,i_theC,1,1)*rtmp1 &
                        + Mat2ij(i_phiC,i_theC,1,2)*rtmp2 &
                        + Mat2ij(i_phiC,i_theC,1,3)*rtmp3
           integrand_J2 = Mat2ij(i_phiC,i_theC,2,1)*rtmp1 &
                        + Mat2ij(i_phiC,i_theC,2,2)*rtmp2 &
                        + Mat2ij(i_phiC,i_theC,2,3)*rtmp3

           ! Dot product of C_grid_rhat(:,i_phiC,i_theC) and rprime_iwi1(:,i_wi).
           scalC = C_grid_rhat(1,i_phiC,i_theC)*rprime_iwi1(1,i_wi) &
                 + C_grid_rhat(2,i_phiC,i_theC)*rprime_iwi1(2,i_wi) &
                 + C_grid_rhat(3,i_phiC,i_theC)*rprime_iwi1(3,i_wi)

           ! Evaluate the cosine once and reuse it for both components.
           cos_scalC = cos(scalC)
           integrand_th = integrand_J1*cos_scalC
           integrand_ph = integrand_J2*cos_scalC

           rE_theC = rE_theC + integrand_th*wi(i_wi)
           rE_phiC = rE_phiC + integrand_ph*wi(i_wi)

         end do

         ! The final combination uses Mat1ij; Mat2ij is the matrix applied
         ! inside the i_wi sum above.
         Uc(i_phiC,i_theC,ip) = Mat1ij(i_phiC,i_theC,1,1)*rE_theC &
                              + Mat1ij(i_phiC,i_theC,1,2)*rE_phiC

       end do
     end do
   end do

   ! Print one element so the result is actually consumed.
   print *, Uc(1,1,ipbnl)
 end program uc_real_kernel
  • Mat

Hello Mat,
Thanks very much. It seems that your solution works perfectly. I think that I understand better the point of nested loops.
Barak

Hi,
I implemented Mat’s suggestions, and here is the current form of the code:

! Timing test for the complex-valued kernel.  The complex integrand is carried
! as separate real and imaginary arrays; for every (ip,i_phiC,i_theC) the
! program accumulates a weighted sum over i_wi and stores a 3-component
! complex result in Uc.
program uc_complex_kernel
  implicit none

  integer, parameter :: wp = kind(1.0d0)

  ! Loop extents and array bounds.
  integer, parameter :: plpnl = 41       ! upper bound of the i_phiC loop
  integer, parameter :: pltnl = 23       ! upper bound of the i_theC loop
  integer, parameter :: nwi   = 64       ! upper bound of the i_wi loop
  integer, parameter :: ipbnl = 5462     ! first value of ip
  integer, parameter :: ipenl = 21845    ! last value of ip

  ! Inputs.
  real(wp), allocatable :: wi(:)
  real(wp), allocatable :: rprime_iwi1(:,:)
  real(wp), allocatable :: real_integrand_prep_J_ip(:,:,:)
  real(wp), allocatable :: imag_integrand_prep_J_ip(:,:,:)
  real(wp), allocatable :: C_grid_rhat(:,:,:)
  real(wp), allocatable :: M12_surf_sph_obs(:,:,:,:)
  real(wp), allocatable :: M12_sph_obs_obs(:,:,:,:)

  ! Result.
  complex(wp), allocatable :: Uc(:,:,:,:)

  integer  :: ip, i_phiC, i_theC, i_wi
  real(wp) :: rtmp1, rtmp2, rtmp3
  real(wp) :: itmp1, itmp2, itmp3
  real(wp) :: ru1, ru2, ru3, iu1, iu2, iu3
  real(wp) :: re_thec, ie_thec, re_phic, ie_phic
  real(wp) :: scalC, r_deph_transl, i_deph_transl

  ! Wall-clock timing.
  integer :: c1, c2, clock_rate
  real    :: cgpu

  allocate(wi(nwi))
  allocate(rprime_iwi1(3,nwi))
  allocate(real_integrand_prep_J_ip(3,nwi,ipbnl:ipenl))
  allocate(imag_integrand_prep_J_ip(3,nwi,ipbnl:ipenl))
  allocate(Uc(3,ipbnl:ipenl,plpnl,pltnl))
  allocate(C_grid_rhat(3,plpnl,pltnl))
  allocate(M12_surf_sph_obs(plpnl,pltnl,3,3))
  allocate(M12_sph_obs_obs(plpnl,pltnl,3,3))

  ! Size report: the values are element counts, not bytes.
  print*,' memory of uc=',2*3*(ipenl-ipbnl+1)*plpnl*pltnl
  print*,' memory of m12=',3*3*plpnl*pltnl
  print*,' memory of M12_sph_obs_obs=',plpnl*pltnl*3*3*2

  ! Test data.  The real and imaginary parts are generated directly with the
  ! standard intrinsic random_number.  A bind(C) interface to rand() is not
  ! usable here: C's rand() returns an int, not a double.
  call random_number(real_integrand_prep_J_ip)
  call random_number(imag_integrand_prep_J_ip)
  call random_number(C_grid_rhat)
  call random_number(M12_surf_sph_obs)
  call random_number(M12_sph_obs_obs)

  ! rprime_iwi1 holds values shifted by -1.
  call random_number(rprime_iwi1)
  rprime_iwi1 = rprime_iwi1 - 1.0_wp

  ! The weights must be defined before the kernel reads them; they are
  ! filled with placeholder test data like the other inputs.
  call random_number(wi)

  ! Ask for the tick rate rather than assuming a particular clock resolution.
  call system_clock(count=c1, count_rate=clock_rate)

  ! The three outer loops are collapsed and spread over gangs; the i_wi sum
  ! runs across the vector lanes.
!$acc kernels loop collapse(3) gang
  do ip = ipbnl, ipenl
    do i_phiC = 1, plpnl
      do i_theC = 1, pltnl

        re_thec = 0.0_wp
        ie_thec = 0.0_wp
        re_phic = 0.0_wp
        ie_phic = 0.0_wp

        ! All four accumulators are sums over i_wi, stated explicitly as
        ! reductions instead of relying on compiler detection.
!$acc loop vector reduction(+:re_thec,ie_thec,re_phic,ie_phic)
        do i_wi = 1, nwi

          rtmp1 = real_integrand_prep_J_ip(1,i_wi,ip)
          itmp1 = imag_integrand_prep_J_ip(1,i_wi,ip)
          rtmp2 = real_integrand_prep_J_ip(2,i_wi,ip)
          itmp2 = imag_integrand_prep_J_ip(2,i_wi,ip)
          rtmp3 = real_integrand_prep_J_ip(3,i_wi,ip)
          itmp3 = imag_integrand_prep_J_ip(3,i_wi,ip)

          ! Rows 1 and 2 of M12_surf_sph_obs(i_phiC,i_theC,:,:) applied to the
          ! real and imaginary parts.  Row 3 is not formed because only these
          ! two components enter the sums below.
          ru1 = M12_surf_sph_obs(i_phiC,i_theC,1,1)*rtmp1 &
              + M12_surf_sph_obs(i_phiC,i_theC,1,2)*rtmp2 &
              + M12_surf_sph_obs(i_phiC,i_theC,1,3)*rtmp3
          iu1 = M12_surf_sph_obs(i_phiC,i_theC,1,1)*itmp1 &
              + M12_surf_sph_obs(i_phiC,i_theC,1,2)*itmp2 &
              + M12_surf_sph_obs(i_phiC,i_theC,1,3)*itmp3
          ru2 = M12_surf_sph_obs(i_phiC,i_theC,2,1)*rtmp1 &
              + M12_surf_sph_obs(i_phiC,i_theC,2,2)*rtmp2 &
              + M12_surf_sph_obs(i_phiC,i_theC,2,3)*rtmp3
          iu2 = M12_surf_sph_obs(i_phiC,i_theC,2,1)*itmp1 &
              + M12_surf_sph_obs(i_phiC,i_theC,2,2)*itmp2 &
              + M12_surf_sph_obs(i_phiC,i_theC,2,3)*itmp3

          ! Dot product of C_grid_rhat(:,i_phiC,i_theC) and rprime_iwi1(:,i_wi).
          scalC = C_grid_rhat(1,i_phiC,i_theC)*rprime_iwi1(1,i_wi) &
                + C_grid_rhat(2,i_phiC,i_theC)*rprime_iwi1(2,i_wi) &
                + C_grid_rhat(3,i_phiC,i_theC)*rprime_iwi1(3,i_wi)
          r_deph_transl = cos(scalC)
          i_deph_transl = sin(scalC)

          ! Complex product (ru + i*iu)*(cos + i*sin), times the weight,
          ! written out in real arithmetic.
          re_thec = re_thec + (ru1*r_deph_transl - iu1*i_deph_transl)*wi(i_wi)
          ie_thec = ie_thec + (r_deph_transl*iu1 + ru1*i_deph_transl)*wi(i_wi)
          re_phic = re_phic + (ru2*r_deph_transl - iu2*i_deph_transl)*wi(i_wi)
          ie_phic = ie_phic + (r_deph_transl*iu2 + ru2*i_deph_transl)*wi(i_wi)

        end do

        ! Columns 1 and 2 of M12_sph_obs_obs(i_phiC,i_theC,:,:) applied to the
        ! two accumulated components give the three output components.
        ru1 = M12_sph_obs_obs(i_phiC,i_theC,1,1)*re_thec &
            + M12_sph_obs_obs(i_phiC,i_theC,1,2)*re_phic
        iu1 = M12_sph_obs_obs(i_phiC,i_theC,1,1)*ie_thec &
            + M12_sph_obs_obs(i_phiC,i_theC,1,2)*ie_phic
        ru2 = M12_sph_obs_obs(i_phiC,i_theC,2,1)*re_thec &
            + M12_sph_obs_obs(i_phiC,i_theC,2,2)*re_phic
        iu2 = M12_sph_obs_obs(i_phiC,i_theC,2,1)*ie_thec &
            + M12_sph_obs_obs(i_phiC,i_theC,2,2)*ie_phic
        ru3 = M12_sph_obs_obs(i_phiC,i_theC,3,1)*re_thec &
            + M12_sph_obs_obs(i_phiC,i_theC,3,2)*re_phic
        iu3 = M12_sph_obs_obs(i_phiC,i_theC,3,1)*ie_thec &
            + M12_sph_obs_obs(i_phiC,i_theC,3,2)*ie_phic

        Uc(1,ip,i_phiC,i_theC) = cmplx(ru1, iu1, kind=wp)
        Uc(2,ip,i_phiC,i_theC) = cmplx(ru2, iu2, kind=wp)
        Uc(3,ip,i_phiC,i_theC) = cmplx(ru3, iu3, kind=wp)

      end do
    end do
  end do

  call system_clock(count=c2)

  ! To inspect a result, note that the second index of Uc starts at ipbnl:
  !   print*, Uc(1,ipbnl,1,1)

  ! Elapsed wall-clock time in seconds.
  cgpu = real(c2 - c1) / real(clock_rate)
  print*,cgpu
end program uc_complex_kernel

On my GPU (GTX 480) I get an acceleration of about 50 (with i7 processor).
Is it reasonable? How can I accelerate the code even more?

Thanks,
Barak

I get an acceleration of about 50

50 what? Seconds?

There are many factors that determine what you should expect. A lot depends on what you’re comparing against (serial, OpenMP, etc.), the problem size (big problems tend to see more speed-up), and the algorithm used.

A general rule is that a 4-5x speed-up is good, but even a 1.5x speed-up for a program that takes days to run is still a huge savings. Ultimately, it’s up to you to determine at what point it is “good enough”.

  • Mat