compiler error. pragma: bad ilmopc

The following code throws

PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478

(see full error output below).

kernels3.f90

! Reproducer for the PGI "pragma: bad ilmopc" internal compiler error.
!
! Module arrays are placed on the device by outer_wrapper3 and then handed
! down through wrapper3 to add3/mult3 as explicit-shape dummy arguments, with
! every layer opening its own OpenACC data region.
!
! NOTE: the calls intentionally pass whole-array sections such as a(:, :, :).
! According to the report this form, combined with inlining, is what triggers
! the error; passing the plain array names (a, b, c) is the known workaround.
module kernels3
contains
 ! Opens the outermost data region on the module arrays a, b, c, d (copied to
 ! the device on entry and back to the host on exit) and calls wrapper3.
 subroutine outer_wrapper3()
  use my_module, only: a, b, c, d
  implicit none

!$acc data copy(a), copy(c), copy(b), copy(d)

  call wrapper3(a(:, :, :), b(:, :, :), c(:, :, :), d(:, :, :))
!$acc end data
end subroutine

 ! Middle layer: asserts that all four arrays are already on the device and
 ! dispatches to the two kernels.
 !   a, b : inputs,  fixed shape 256 x 256 x 10
 !   c    : receives the result of add3
 !   d    : receives the result of mult3
 subroutine wrapper3(a, b, c, d)
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: c(256, 256, 10), d(256, 256, 10)

!$acc data present(a), present(c), present(b), present(d)

  call add3 (a(:, :, :), b(:, :, :), c(:, :, :))
  call mult3 (a(:, :, :), b(:, :, :), d(:, :, :))
!$acc end data
end subroutine

 ! Computes c = a + b + temp element-wise on the device, where temp is a
 ! local scratch array filled with 5.0.
 !   a, b : inputs, must already be present on the device
 !   c    : output, must already be present on the device
 subroutine add3(a,b,c)
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: c(256, 256, 10)
  real :: temp(256, 256, 10)
  integer :: z

  integer(4) :: y, x
!$acc data present(a), present(c), present(b), create(temp)

  ! This assignment is outside any compute region, so it fills the HOST copy
  ! of temp. create(temp) only allocates the device copy without transferring
  ! data, so the device copy must be refreshed explicitly; otherwise the
  ! kernel below reads uninitialised device memory.
  temp(:, :, :) = 5.0d0
!$acc update device(temp)

  ! y and x are mapped to the parallel dimensions; the short z loop runs
  ! sequentially inside each thread.
!$acc kernels
!$acc loop independent vector(16)
  do y=1,256
!$acc loop independent vector(16)
   do x=1,256
!$acc loop seq
    do z=1,10
     c(x, y, z) = a(x, y, z) + b(x, y, z) + temp(x, y, z)
    end do
   end do
  end do
!$acc end kernels
!$acc end data
end subroutine

 ! Computes d = a * b element-wise on the device.
 !   a, b : inputs, must already be present on the device
 !   d    : output, must already be present on the device
 subroutine mult3(a,b,d)
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: d(256, 256, 10)
  integer :: z

  integer(4) :: y, x
!$acc data present(a), present(b), present(d)

  ! Same loop mapping as add3: y/x parallel, z sequential.
!$acc kernels
!$acc loop independent vector(16)
  do y=1,256
!$acc loop independent vector(16)
   do x=1,256
!$acc loop seq
    do z=1,10
     d(x, y, z) = a(x, y, z) * b(x, y, z)
    end do
   end do
  end do
!$acc end kernels
!$acc end data
end subroutine

end module kernels3

my_module.f90

! Shared rank-3 work arrays used by the kernel modules and the test driver.
! The arrays are declared here only; allocation and deallocation are left to
! the code that uses this module.
module my_module
implicit none
real, dimension(:, :, :), allocatable :: a, b, c, d
end module my_module

main

...

! Test driver: fills a with 1.0 and b with 2.0, runs outer_wrapper3 and then
! checks that every element of c equals 8.0 and every element of d equals 2.0.
! Only the first mismatch is recorded; the program stops with code 2 if one
! was found.
! NOTE(review): DOM, AT, NX, NY and NZ are not defined in the visible part of
! the file - presumably preprocessor macros/constants from the elided
! sections; confirm.
program main
...
use my_module, only: a, b, c, d
use kernels3, only: outer_wrapper3
implicit none
integer :: x, y, z
integer :: fail_x, fail_y, fail_z
logical :: test

allocate(a(DOM(NX,NY,NZ)))
allocate(b(DOM(NX,NY,NZ)))
allocate(c(DOM(NX,NY,NZ)))
allocate(d(DOM(NX,NY,NZ)))

a(:,:,:) = 1.0d0
b(:,:,:) = 2.0d0
c(:,:,:) = 0.0d0
d(:,:,:) = 0.0d0
test = .true.

...

call outer_wrapper3()
write(0,*) "calculation3 complete"

! "test" stays true until the first mismatch. It is used directly as the
! condition: comparing logicals with .EQ. is not standard Fortran.
do y=1,NY
	do x=1,NX
		do z=1,NZ
			if (test .and. c(AT(x,y,z)) /= 8.0d0) then
				test = .false.
				fail_x = x
				fail_y = y
				fail_z = z
				write(0,*) "FAIL"
			end if
			if (test .and. d(AT(x,y,z)) /= 2.0d0) then
				test = .false.
				fail_x = x
				fail_y = y
				fail_z = z
				write(0,*) "FAIL"
			end if
		end do
	end do
end do

...

if (test) then
	write(0,*) "test ok"
else
	write(0,*) "test failed"
	! fail_x/y/z are only defined when a mismatch was found, i.e. in this branch.
	write(0,*) "fails at", fail_x, fail_y, fail_z, "C:", c(AT(fail_x,fail_y,fail_z)), "D:", d(AT(fail_x,fail_y,fail_z))
	stop 2
end if

deallocate(a)
deallocate(b)
deallocate(c)
deallocate(d)

stop
end program main

compiler output

..........compiling my_module.f90 in /home0/usr4/mueller-m-ab/hybrid/examples/module_data_with_openACC/build/gpu/source
pgf90 -Mcuda=cc3x -fast -ta=nvidia,cc3x -Minline=levels:5,reshape -Mipa=inline,reshape -Minfo=accel,inline,ipa -Mneginfo -Minform=inform -I/usr/local/include -DGPU -c my_module.f90 -o my_module.o
..........compiling kernels3.f90 in /home0/usr4/mueller-m-ab/hybrid/examples/module_data_with_openACC/build/gpu/source
pgf90 -Mcuda=cc3x -fast -ta=nvidia,cc3x -Minline=levels:5,reshape -Mipa=inline,reshape -Minfo=accel,inline,ipa -Mneginfo -Minform=inform -I/usr/local/include -DGPU -c kernels3.f90 -o kernels3.o
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 91)
outer_wrapper3:
     89, Generating copy(a(:,:,:),c(:,:,:),b(:,:,:),d(:,:,:))
     91, wrapper3 inlined, size=6, file kernels3.f90 (95)
          91, Generating present(..inline(:,:,:))
         105, add3 inlined, size=27, file kernels3.f90 (110)
               91, Generating present(..inline)
                   Generating create(..inline(:,:,:))
                   Loop is parallelizable
                   Accelerator kernel generated
                   91, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
                       !$acc loop gang, vector(16) ! blockidx%x threadidx%x
                   Memory set idiom, loop replaced by call to __c_mset4
         106, mult3 inlined, size=14, file kernels3.f90 (137)
               91, Generating present(..inline)
                   Loop is parallelizable
                   Accelerator kernel generated
                   91, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
                       !$acc loop gang, vector(16) ! blockidx%x threadidx%x
  0 inform,   0 warnings,  12 severes, 0 fatal for outer_wrapper3
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 105)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 105)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 105)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 105)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 105)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 105)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 106)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 106)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 106)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 106)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 106)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc     478 (kernels3.f90: 106)
PGF90-S-0000-Internal compiler error. size_of: bad dtype       35 (kernels3.f90: 108)
PGF90-S-0000-Internal compiler error. size_of: bad dtype       35 (kernels3.f90: 108)
PGF90-S-0000-Internal compiler error. size_of: bad dtype       35 (kernels3.f90: 108)
PGF90-S-0000-Internal compiler error. size_of: bad dtype       35 (kernels3.f90: 108)
PGF90-S-0000-Internal compiler error. size_of: bad dtype       35 (kernels3.f90: 108)
PGF90-S-0000-Internal compiler error. size_of: bad dtype       35 (kernels3.f90: 108)
wrapper3:
    103, Generating present(a(:,:,:),c(:,:,:),b(:,:,:),d(:,:,:))
    105, add3 inlined, size=27, file kernels3.f90 (110)
         105, Generating present(a,c,b)
              Generating create(..inline(:,:,:))
              Loop is parallelizable
              Accelerator kernel generated
             105, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
                  !$acc loop gang, vector(16) ! blockidx%x threadidx%x
              Memory set idiom, loop replaced by call to __c_mset4
    106, mult3 inlined, size=14, file kernels3.f90 (137)
         106, Generating present(a,b,d)
              Loop is parallelizable
              Accelerator kernel generated
             106, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
                  !$acc loop gang, vector(16) ! blockidx%x threadidx%x
  0 inform,   0 warnings,  18 severes, 0 fatal for wrapper3
add3:
    118, Generating present(a(:,:,:),c(:,:,:),b(:,:,:))
         Generating create(temp(:,:,:))
    120, Memory set idiom, loop replaced by call to __c_mset4
    124, Loop is parallelizable
    126, Loop is parallelizable
    128, Loop is parallelizable
         Accelerator kernel generated
        124, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
        126, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
mult3:
    144, Generating present(a(:,:,:),b(:,:,:),d(:,:,:))
    148, Loop is parallelizable
    150, Loop is parallelizable
    152, Loop is parallelizable
         Accelerator kernel generated
        148, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
        150, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
make[1]: *** [kernels3.o] Error 2
make[1]: Leaving directory `/home0/usr4/mueller-m-ab/hybrid/examples/module_data_with_openACC/build/gpu/source'
make: *** [build_hybrid_gpu] Error 2

Interestingly enough, the following module works. Here the data region is opened outside the module, and wrapper2 applies its present clauses directly to the imported module data, rather than to module data that has been passed down as subroutine arguments through one additional layer:

kernels2.f90

module kernels2
contains
 ! Working counterpart of the failing wrapper: the present clauses refer
 ! directly to the module arrays imported from my_module rather than to dummy
 ! arguments. No data is allocated or copied here; the arrays are expected to
 ! be on the device already (the enclosing data region is opened by the
 ! caller, outside this module).
 ! NOTE(review): mult2 is not shown in this excerpt.
 subroutine wrapper2()
  use my_module, only: a, b, c, d
  implicit none

! ****** additional symbols inserted by framework to emulate device support of language features
! ****** end additional symbols

!$acc data present(a), present(c), present(b), present(d)

  ! Whole-array sections are passed to explicit-shape dummies, same call form
  ! as in the failing module.
  call add2 (a(:, :, :), b(:, :, :), c(:, :, :))
  call mult2 (a(:, :, :), b(:, :, :), d(:, :, :))
!$acc end data
end subroutine

 ! Computes c = a + b element-wise inside an OpenACC kernels region.
 !   a, b : inputs, fixed shape 256 x 256 x 10, must be present on the device
 !   c    : output, same shape, must be present on the device
 subroutine add2(a,b,c)
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: c(256, 256, 10)
  integer :: z

  integer(4) :: y, x
!$acc data present(a), present(c), present(b)

  ! The y and x loops are declared independent with vector(16); the z loop is
  ! marked seq and therefore runs sequentially.
!$acc kernels
!$acc loop independent vector(16)
  do y=1,256
!$acc loop independent vector(16)
   do x=1,256
!$acc loop seq
    do z=1,10
     c(x, y, z) = a(x, y, z) + b(x, y, z)
    end do
   end do
  end do
!$acc end kernels
!$acc end data
end subroutine
....

I’ve tested kernels3 without the additional temp array and it didn’t make a difference, so I think that can be ignored. The output above was produced by PGI 15.1 (Linux version). I’ve also tested with 14.7 (Linux), where the compiler prints two ‘bad ilmopc’ errors and then seems to go into an endless loop. It did work in 14.7 when I replaced the ‘present’ and ‘create’ clauses with ‘deviceptr’ and used the device attribute for the arrays in the specification within the data region (still using ‘acc copy’ for that data, though). This (probably hackish) approach broke in 15.1, however: it gave basically the same output as the present/create version that you see above. Do you have any idea what’s going on here?

Hi Mike,

This looks like a compiler issue when inlining a subroutine containing OpenACC directives. Specifically, it looks to be a problem with how the array arguments are being presented. I’ve added a problem report (TPR#21365) and sent it to engineering.

I was able to work around the compiler error by changing the calls from:

 call add3 (a(:, :, :), b(:, :, :), c(:, :, :))
 call mult3 (a(:, :, :), b(:, :, :), d(:, :, :))

to

 call add3 (a, b, c)
 call mult3 (a, b, d)

Please give it a try and let me know. A second workaround would be to not use inlining.

Thanks!
Mat

Thanks a ton, Mat, the workaround works. I’d rather not disable inlining since this pattern is being used heavily in a weather framework I’m currently porting, and I wouldn’t want to give up that many optimizations ;).

Starting with 19.10, this is no longer an issue.