GPU device routine calculates the sum incorrectly in vector mode

Hello, the following subroutine calculates the sum incorrectly if runs in vector mode. Could you please explain why and suggest solution if there is any.

[ilkhom@t020 MCCC]$ cat seq_vector.f90 
module array 
  implicit none
  real(kind=8), allocatable, dimension(:,:) :: a
!$acc declare create(a)
end module array 

subroutine acc_rout(tmp1)
!$acc routine seq   !vector
  use array 
  implicit none
  integer :: i,j
  real*8, intent(out) :: tmp1
  real*8 :: tmp2
  tmp1=0.d0
!$acc loop reduction(+:tmp1)
  do i=1,3    
    tmp2=0.d0
!$acc loop reduction(+:tmp2)
    do j=1,2
      tmp2=tmp2+a(i,j)
    enddo
    tmp1=tmp1+tmp2
  enddo

end subroutine acc_rout

program test
  use array 
  implicit none
  real(kind=8) :: tmp

  allocate(a(1:3,1:2))
  a(1,1)=1.d0; a(1,2)=2.d0
  a(2,1)=3.d0; a(2,2)=4.d0
  a(3,1)=5.d0; a(3,2)=6.d0

!$acc update device(a)
!$acc parallel copy(tmp) num_gangs(1) vector_length(128) 
  call acc_rout(tmp)
!$acc end parallel 
print*,'tmp=',tmp

end program test
[ilkhom@t020 MCCC]$ nvfortran -acc -Minfo=accel -ta=tesla:cuda11.6 seq_vector.f90 && ./a.out 
acc_rout:
      7, Generating acc routine seq
         Generating NVIDIA GPU code
test:
     37, Generating update device(a(:,:))
     38, Generating copy(tmp) [if not already present]
         Generating NVIDIA GPU code
     39, Reference argument passing prevents parallelization: tmp
 tmp=    21.00000000000000     
[ilkhom@t020 MCCC]$

If I run it in vector mode I am getting 9 instead of 21.

Hi Ilkhom,

Since you’re using F77 style calling, you need to add:

!$acc routine(acc_rout) vector

in the caller’s routine else the caller assumes its sequential. Alternately add an interface to the routine or place it in a module so F90 calling is used.

For example:

% cat seq_vector.f90
module array
  implicit none
  real(kind=8), allocatable, dimension(:,:) :: a
!$acc declare create(a)
end module array

subroutine acc_rout(tmp1)
!$acc routine vector
  use array
  implicit none
  integer :: i,j
  real*8, intent(out) :: tmp1
  real*8 :: tmp2
  tmp1=0.d0
!$acc loop reduction(+:tmp1)
  do i=1,3
    tmp2=0.d0
!$acc loop reduction(+:tmp2)
    do j=1,2
      tmp2=tmp2+a(i,j)
    enddo
    tmp1=tmp1+tmp2
  enddo

end subroutine acc_rout

program test
  use array
  implicit none
  real(kind=8) :: tmp
!$acc routine(acc_rout) vector
  allocate(a(1:3,1:2))
  a(1,1)=1.d0; a(1,2)=2.d0
  a(2,1)=3.d0; a(2,2)=4.d0
  a(3,1)=5.d0; a(3,2)=6.d0

!$acc update device(a)
!$acc parallel copy(tmp) num_gangs(1) vector_length(128)
  call acc_rout(tmp)
!$acc end parallel
print*,'tmp=',tmp

end program test
% nvfortran -acc seq_vector.f90 -Minfo=accel -V22.3 ; a.out
acc_rout:
      7, Generating NVIDIA GPU code
         16, !$acc loop seq
             Generating reduction(+:tmp1)
         19, !$acc loop vector ! threadidx%x
             Generating reduction(+:tmp2)
             Vector barrier inserted for vector loop reduction
     16, Loop is parallelizable
     19, Loop is parallelizable
test:
     37, Generating update device(a(:,:))
     38, Generating copy(tmp) [if not already present]
         Generating NVIDIA GPU code
     39, Reference argument passing prevents parallelization: tmp
 tmp=    21.00000000000000

Hope this helps,
Mat

Thank you very much for such a nice explanation. I would not figure it out myself.
Ilkhom

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.