Compiling the following code fails with
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478
(see full error output below).
kernels3.f90
!> OpenACC kernels operating on fixed-size 256 x 256 x 10 single-precision
!> fields: c = a + b + 5 (add3) and d = a * b (mult3).  The fields come from
!> my_module and are handed down through two wrapper layers as explicit-shape
!> dummy arguments, each layer opening its own data region.
module kernels3
  implicit none

  ! Extents shared by every dummy array and loop nest in this module.
  integer, parameter, private :: size_x = 256, size_y = 256, size_z = 10

contains

  !> Entry point.  Opens a data region with copy clauses on the four module
  !> arrays and forwards them to wrapper3.
  subroutine outer_wrapper3()
    use my_module, only: a, b, c, d
    !$acc data copy(a), copy(c), copy(b), copy(d)
    call wrapper3(a(:, :, :), b(:, :, :), c(:, :, :), d(:, :, :))
    !$acc end data
  end subroutine outer_wrapper3

  !> Intermediate layer: asserts that all four fields are already present on
  !> the device, then runs the two kernels.
  !>   a, b : input fields
  !>   c    : receives a + b + 5 (see add3)
  !>   d    : receives a * b     (see mult3)
  subroutine wrapper3(a, b, c, d)
    real, intent(in)  :: a(size_x, size_y, size_z), b(size_x, size_y, size_z)
    real, intent(out) :: c(size_x, size_y, size_z), d(size_x, size_y, size_z)
    !$acc data present(a), present(c), present(b), present(d)
    call add3 (a(:, :, :), b(:, :, :), c(:, :, :))
    call mult3(a(:, :, :), b(:, :, :), d(:, :, :))
    !$acc end data
  end subroutine wrapper3

  !> Computes c = a + b + temp element-wise, where temp is a scratch field
  !> filled with 5.0.
  subroutine add3(a, b, c)
    real, intent(in)  :: a(size_x, size_y, size_z), b(size_x, size_y, size_z)
    real, intent(out) :: c(size_x, size_y, size_z)
    real :: temp(size_x, size_y, size_z)
    integer :: x, y, z

    !$acc data present(a), present(c), present(b), create(temp)
    !$acc kernels
    ! create(temp) only allocates device storage; it does not transfer host
    ! values.  temp must therefore be filled inside the compute region,
    ! otherwise the loop nest below would read uninitialised device memory.
    temp(:, :, :) = 5.0
    !$acc loop independent vector(16)
    do y = 1, size_y
      !$acc loop independent vector(16)
      do x = 1, size_x
        !$acc loop seq
        do z = 1, size_z
          c(x, y, z) = a(x, y, z) + b(x, y, z) + temp(x, y, z)
        end do
      end do
    end do
    !$acc end kernels
    !$acc end data
  end subroutine add3

  !> Computes d = a * b element-wise.
  subroutine mult3(a, b, d)
    real, intent(in)  :: a(size_x, size_y, size_z), b(size_x, size_y, size_z)
    real, intent(out) :: d(size_x, size_y, size_z)
    integer :: x, y, z

    !$acc data present(a), present(b), present(d)
    !$acc kernels
    !$acc loop independent vector(16)
    do y = 1, size_y
      !$acc loop independent vector(16)
      do x = 1, size_x
        !$acc loop seq
        do z = 1, size_z
          d(x, y, z) = a(x, y, z) * b(x, y, z)
        end do
      end do
    end do
    !$acc end kernels
    !$acc end data
  end subroutine mult3

end module kernels3
my_module.f90
!> Holds the four rank-3 single-precision fields shared between the driver
!> program and the kernel modules.  Allocation is left to the user of the
!> module.
module my_module
  implicit none
  real, allocatable :: a(:, :, :)
  real, allocatable :: b(:, :, :)
  real, allocatable :: c(:, :, :)
  real, allocatable :: d(:, :, :)
end module my_module
main
...
!> Test driver: fills a = 1 and b = 2, runs outer_wrapper3 and checks that
!> every element of c equals 8 and every element of d equals 2.
!> Lines consisting of "..." are parts elided in the original listing.
!> NOTE(review): DOM(...), AT(...), NX, NY and NZ are not defined in the
!> visible code; presumably preprocessor macros from an elided include.
program main
  ...
  use my_module, only: a, b, c, d
  use kernels3, only: outer_wrapper3
  implicit none
  integer :: x, y, z
  integer :: fail_x, fail_y, fail_z
  logical :: test

  allocate(a(DOM(NX,NY,NZ)))
  allocate(b(DOM(NX,NY,NZ)))
  allocate(c(DOM(NX,NY,NZ)))
  allocate(d(DOM(NX,NY,NZ)))
  a(:,:,:) = 1.0
  b(:,:,:) = 2.0
  c(:,:,:) = 0.0
  d(:,:,:) = 0.0
  test = .true.
  ...
  call outer_wrapper3()
  write(0,*) "calculation3 complete"

  ! Only the first mismatch is recorded: once test is .false. no further
  ! element is examined.  Logicals are tested directly; comparing them with
  ! .eq. is not standard Fortran.  The expected values 8 and 2 are exactly
  ! representable, so exact comparison is intended here.
  do y = 1, NY
    do x = 1, NX
      do z = 1, NZ
        if (test) then
          if (c(AT(x,y,z)) /= 8.0 .or. d(AT(x,y,z)) /= 2.0) then
            test = .false.
            fail_x = x
            fail_y = y
            fail_z = z
            write(0,*) "FAIL"
          end if
        end if
      end do
    end do
  end do
  ...
  if (test) then
    write(0,*) "test ok"
  else
    write(0,*) "test failed"
    write(0,*) "fails at", fail_x, fail_y, fail_z, &
      "C:", c(AT(fail_x,fail_y,fail_z)), &
      "D:", d(AT(fail_x,fail_y,fail_z))
    stop 2
  end if

  deallocate(a)
  deallocate(b)
  deallocate(c)
  deallocate(d)
  stop
end program main
compiler output
..........compiling my_module.f90 in /home0/usr4/mueller-m-ab/hybrid/examples/module_data_with_openACC/build/gpu/source
pgf90 -Mcuda=cc3x -fast -ta=nvidia,cc3x -Minline=levels:5,reshape -Mipa=inline,reshape -Minfo=accel,inline,ipa -Mneginfo -Minform=inform -I/usr/local/include -DGPU -c my_module.f90 -o my_module.o
..........compiling kernels3.f90 in /home0/usr4/mueller-m-ab/hybrid/examples/module_data_with_openACC/build/gpu/source
pgf90 -Mcuda=cc3x -fast -ta=nvidia,cc3x -Minline=levels:5,reshape -Mipa=inline,reshape -Minfo=accel,inline,ipa -Mneginfo -Minform=inform -I/usr/local/include -DGPU -c kernels3.f90 -o kernels3.o
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 91)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 91)
outer_wrapper3:
89, Generating copy(a(:,:,:),c(:,:,:),b(:,:,:),d(:,:,:))
91, wrapper3 inlined, size=6, file kernels3.f90 (95)
91, Generating present(..inline(:,:,:))
105, add3 inlined, size=27, file kernels3.f90 (110)
91, Generating present(..inline)
Generating create(..inline(:,:,:))
Loop is parallelizable
Accelerator kernel generated
91, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
!$acc loop gang, vector(16) ! blockidx%x threadidx%x
Memory set idiom, loop replaced by call to __c_mset4
106, mult3 inlined, size=14, file kernels3.f90 (137)
91, Generating present(..inline)
Loop is parallelizable
Accelerator kernel generated
91, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
!$acc loop gang, vector(16) ! blockidx%x threadidx%x
0 inform, 0 warnings, 12 severes, 0 fatal for outer_wrapper3
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 105)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 105)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 105)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 105)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 105)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 105)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 106)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 106)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 106)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 106)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 106)
PGF90-S-0000-Internal compiler error. pragma: bad ilmopc 478 (kernels3.f90: 106)
PGF90-S-0000-Internal compiler error. size_of: bad dtype 35 (kernels3.f90: 108)
PGF90-S-0000-Internal compiler error. size_of: bad dtype 35 (kernels3.f90: 108)
PGF90-S-0000-Internal compiler error. size_of: bad dtype 35 (kernels3.f90: 108)
PGF90-S-0000-Internal compiler error. size_of: bad dtype 35 (kernels3.f90: 108)
PGF90-S-0000-Internal compiler error. size_of: bad dtype 35 (kernels3.f90: 108)
PGF90-S-0000-Internal compiler error. size_of: bad dtype 35 (kernels3.f90: 108)
wrapper3:
103, Generating present(a(:,:,:),c(:,:,:),b(:,:,:),d(:,:,:))
105, add3 inlined, size=27, file kernels3.f90 (110)
105, Generating present(a,c,b)
Generating create(..inline(:,:,:))
Loop is parallelizable
Accelerator kernel generated
105, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
!$acc loop gang, vector(16) ! blockidx%x threadidx%x
Memory set idiom, loop replaced by call to __c_mset4
106, mult3 inlined, size=14, file kernels3.f90 (137)
106, Generating present(a,b,d)
Loop is parallelizable
Accelerator kernel generated
106, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
!$acc loop gang, vector(16) ! blockidx%x threadidx%x
0 inform, 0 warnings, 18 severes, 0 fatal for wrapper3
add3:
118, Generating present(a(:,:,:),c(:,:,:),b(:,:,:))
Generating create(temp(:,:,:))
120, Memory set idiom, loop replaced by call to __c_mset4
124, Loop is parallelizable
126, Loop is parallelizable
128, Loop is parallelizable
Accelerator kernel generated
124, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
126, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
mult3:
144, Generating present(a(:,:,:),b(:,:,:),d(:,:,:))
148, Loop is parallelizable
150, Loop is parallelizable
152, Loop is parallelizable
Accelerator kernel generated
148, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
150, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
make[1]: *** [kernels3.o] Error 2
make[1]: Leaving directory `/home0/usr4/mueller-m-ab/hybrid/examples/module_data_with_openACC/build/gpu/source'
make: *** [build_hybrid_gpu] Error 2
Interestingly enough, the following module compiles and works. Here the data region is opened outside of the module, and wrapper2 applies its present clauses directly to the imported module data, rather than to module data that has been passed down as subroutine arguments through one layer:
kernels2.f90
module kernels2
contains
!> Runs add2 and mult2 on the arrays imported from my_module, inside a data
!> region that requires all four arrays to be present on the device already.
!> mult2 is not shown in this excerpt.
subroutine wrapper2()
  use my_module, only: a, b, c, d
  implicit none
  ! ****** additional symbols inserted by framework to emulate device support of language features
  ! ****** end additional symbols

  !$acc data present(a, c, b, d)
  call add2(a(:, :, :), b(:, :, :), c(:, :, :))
  call mult2(a(:, :, :), b(:, :, :), d(:, :, :))
  !$acc end data
end subroutine wrapper2
!> Element-wise sum of two 256 x 256 x 10 fields: c = a + b.
!> All three arrays are declared present in the enclosing data region.
!>   a, b : input fields
!>   c    : output field
subroutine add2(a, b, c)
  implicit none
  real, intent(in)  :: a(256, 256, 10)
  real, intent(in)  :: b(256, 256, 10)
  real, intent(out) :: c(256, 256, 10)
  integer(4) :: i, j
  integer :: k

  !$acc data present(a, c, b)
  !$acc kernels
  !$acc loop independent vector(16)
  do j = 1, 256
    !$acc loop independent vector(16)
    do i = 1, 256
      ! The third dimension is walked sequentially for each (i, j) pair.
      !$acc loop seq
      do k = 1, 10
        c(i, j, k) = a(i, j, k) + b(i, j, k)
      end do
    end do
  end do
  !$acc end kernels
  !$acc end data
end subroutine add2
....
I’ve tested kernels3 without the additional temp array and it didn’t make a difference, so I think the temp array can be ignored. The output above was produced by PGI 15.1 (Linux version). I’ve also tested with 14.7 (Linux), where the compiler prints two ‘bad ilmopc’ errors and then seems to go into an endless loop. It did work in 14.7 when I replaced the ‘present’ and ‘create’ clauses with ‘deviceptr’ and declared the arrays with the device attribute in the specifications within the data region (still using ‘acc copy’ for that data, though). This (probably hackish) approach broke in 15.1, however; it gave basically the same output as the present/create version that you see above. Do you have any idea what’s going on here?