Compiler failed to translate accelerator region

camaptrang · June 18, 2013, 7:36am

Hi all,
When I compiled my program using -Mcuda, I got the error ‘Compiler failed to translate accelerator region(see -Minfo messages): Unexpected runtime function call(C:\Program Files\PGI\inverse1.f95:1)’. I want to know why. Is it an error in my program or in PGI? I’m using a 15-day trial version of PGI.
Thanks a lot

MatColgrove · June 18, 2013, 4:22pm

Hi Camaptrang,

Can you post, or send to PGI Customer Service (trs@pgroup.com), a reproducing example? I’d need to reproduce the error in order to determine the cause.

Thanks,
Mat

camaptrang · June 20, 2013, 2:02pm

Hi all,
This is my code
module courant
contains
! CUDA Fortran kernel: each thread ith = threadIdx%x evaluates one candidate
! (xi(ith), yi(ith), ri(ith), diff2(ith)) and stores a sum of squared
! differences in f(ith).
!
! Arguments
!   xi, yi, ri : per-candidate values used in the distance
!                r2 = |ri - sqrt((x-xi)**2 + (y-yi)**2)|
!                (presumably centre and radius of a circular inclusion -
!                TODO confirm)
!   diff2      : per-candidate value entering ratio = diff1/(diff1+diff2)
!   n          : number of random walks started per outer index j (by value)
!   f          : output, one value per thread
!
! The multiplication/power operators in this listing were restored after
! being stripped by text formatting (e.g. "4datan(1.d0)" -> "4*datan(1.d0)",
! "x2+y2" -> "x**2+y**2").
!
! NOTE(review): vre is read in the final sum but never assigned here.
! NOTE(review): U1 is read (implicitly typed double precision) but never
!               assigned here.
! NOTE(review): every thread starts from the same seed ix=51477, so all
!               threads draw the same pseudo-random sequence.
! NOTE(review): ix*na exceeds the default integer range for most ix; the
!               abs()/mod() pair relies on wrap-around behaviour.
  attributes(global) subroutine cal(xi,yi,ri,diff2,n,f)
    implicit double precision(a-h,o-z)
    real,intent(inout):: f(:)
    real,intent(in):: xi(:),yi(:),diff2(:),ri(:)
    integer,value,intent(in)::n
    !real,value,intent(in)::eps1
    integer::ith
    integer::m,i,mt,j,k
    dimension compt(100,100),te1(100),te2(100),vre(100,100)
    dimension compt1(100,100),s(100),U(100,100),t(100,100)
    !open(unit=21,file='voltagenew.txt',status= 'unknown')
    ! do i=1,7
    ! do j=1,8
    ! read(21,*) U(i,j)
    ! enddo
    ! enddo
    !close(21)
    ith = threadIdx%x

    ! U(i,:) has +1 in column 1, -1 in column i+1 and 0 elsewhere.
    do i=1,7
      do j=1,8
        U(i,1)=1
        U(i,i+1)=-1
        if ((j.ne.1).and.(j.ne.(i+1))) then
          U(i,j)=0
        endif
      enddo
    enddo

    eps=0.02
    alpha=0.1
    beta=0.05
    diff1=1.d0
    re=0.9d0
    ! Seed, multiplier and modulus of the multiplicative congruential generator.
    ix=51477
    na=16807
    nmax=2147483647
    pi=4*datan(1.d0)
    eps1=0.00001d0
    ! Angular windows [te1(i), te2(i)] of half-width alpha centred on i*pi/4.
    do i=1,8
      te1(i)=i*Pi/4-alpha
      te2(i)=i*Pi/4+alpha
    enddo
    ratio=diff1/(diff1+diff2(ith))
    vieab=0.d0
    tot=0.d0
    tot2=0.d0
    do k=1,8
      compt(k,9)=0.d0
    enddo
    do k=1,8
      do i=1,8
        compt(k,i)=0.d0
      enddo
    enddo

    do j=1,8
      tt=0.d0
      do i=1,n
        do k=1,8
          compt1(j,k)=0.d0
        enddo
        ! Start point on the unit circle at a random angle in [te1(j), te1(j)+2*alpha).
        ix=abs(ix*na)
        ix=mod(ix,nmax)
        aa=dble(ix)/dble(nmax)
        teta=te1(j)+2*aa*alpha
        x=dcos(teta)
        y=dsin(teta)

        vie=1.d0
        ! r1: distance to the unit circle, r2: distance to the circle
        ! (xi,yi,ri), r3: distance to the circle of radius re.
        r1=1.d0-dsqrt(x**2+y**2)
        r2=dabs(ri(ith)-dsqrt((x-xi(ith))**2+(y-yi(ith))**2))
        r3=dabs(re-dsqrt(x*x+y*y))
        ! vie acts as an "alive" flag: 1.d0 while walking, 0.d0 when stopped.
        do while(vie.ge.0.5d0)
          if ((r1.ge.eps1).and.(r3.ge.eps1).and.(r2.ge.eps1)) then
            ! Away from every boundary: jump by the smaller of the two
            ! relevant distances in a uniformly random direction.
            ix=abs(ix*na)
            ix=mod(ix,nmax)
            aa=dble(ix)/dble(nmax)
            teta=2*aa*pi
            if ( dsqrt(x**2+y**2).le.re) then
              r=min(r2,r3)
              x=x+r*dcos(teta)
              y=y+r*dsin(teta)
            else
              r=min(r1,r3)
              x=x+r*dcos(teta)
              y=y+r*dsin(teta)
            endif
          else
            ! Within eps1 of a boundary: teta1 is the polar angle in [0, 2*pi).
            d1=dsqrt(x**2+y**2)
            xn=x/d1
            teta1=dacos(xn)
            if (y.le.0.d0) then
              teta1=2*pi-dacos(xn)
            endif
            if (r1.le.eps1) then
              if (((teta1.ge.2*pi-alpha).or.(teta1.le.alpha)).or. &
                  ((teta1.ge.te1(1)).and.(teta1.le.te2(1))).or. &
                  ((teta1.ge.te1(2)).and.(teta1.le.te2(2))).or. &
                  ((teta1.ge.te1(3)).and.(teta1.le.te2(3))).or. &
                  ((teta1.ge.te1(4)).and.(teta1.le.te2(4))).or. &
                  ((teta1.ge.te1(5)).and.(teta1.le.te2(5))).or. &
                  ((teta1.ge.te1(6)).and.(teta1.le.te2(6))).or. &
                  ((teta1.ge.te1(7)).and.(teta1.le.te2(7))) &
                 ) then
                ! Inside one of the angular windows: count the hit.
                if ((teta1.ge.2*pi-alpha).or.(teta1.le.alpha)) then
                  tt=tt+eps*U1/(eps+beta)
                  compt(j,8)= compt(j,8)+1
                  compt1(j,8)= compt1(j,8)+1
                endif
                do ii=1,7
                  if ((teta1.ge.te1(ii)).and.(teta1.le.te2(ii))) then
                    tt=tt+((-1)**ii)*eps*U1/(eps+beta)
                    compt(j,ii)=compt(j,ii)+1
                    compt1(j,ii)=compt1(j,ii)+1
                  endif
                enddo
                ! Stop with probability eps/(eps+beta); otherwise re-enter
                ! at radial offset eps with a random tangential offset.
                ix=abs(ix*na)
                ix=mod(ix,nmax)
                aa=dble(ix)/dble(nmax)
                te11=beta/(eps+beta)
                if (aa.ge.te11) then
                  vie=0.d0
                else
                  vie=1.d0
                  xa=1-eps
                  ix=abs(ix*na)
                  ix=mod(ix,nmax)
                  aa=dble(ix)/dble(nmax)
                  if (aa.le.0.5d0) then
                    ya=-eps
                  else
                    ya=+eps
                  endif
                  x=xa*dcos(teta1)-ya*dsin(teta1)
                  y=ya*dcos(teta1)+xa*dsin(teta1)
                endif
              else
                ! Outside every window: always re-enter.
                xa=1-eps
                ix=abs(ix*na)
                ix=mod(ix,nmax)
                aa=dble(ix)/dble(nmax)
                if (aa.le.0.5d0) then
                  ya=-eps
                else
                  ya=+eps
                endif
                x=xa*dcos(teta1)-ya*dsin(teta1)
                y=ya*dcos(teta1)+xa*dsin(teta1)
              endif
            endif
            if (r3.le.eps1) then
              ! On the circle of radius re: bb against ratio selects the
              ! side (re+eps or re-eps), aa selects the tangential offset.
              ix=abs(ix*na)
              ix=mod(ix,nmax)
              aa=dble(ix)/dble(nmax)
              ix=abs(ix*na)
              ix=mod(ix,nmax)
              bb=dble(ix)/dble(nmax)
              if (ratio.ge.bb) then
                if (aa.le.0.5d0) then
                  xa=re+eps
                  ya=eps
                else
                  xa=re+eps
                  ya=-eps
                endif
              endif
              if (ratio.le.bb) then
                if (aa.ge.0.5d0) then
                  xa=re-eps
                  ya=-eps
                else
                  xa=re-eps
                  ya=eps
                endif
              endif
              x=xa*dcos(teta1)-ya*dsin(teta1)
              y=ya*dcos(teta1)+xa*dsin(teta1)

            endif
            if (r2.le.eps1) then
              ! Reached the circle (xi,yi,ri): the walk stops here.
              vie=0.d0
              vieab=vieab+1
              compt(j,9)=compt(j,9)+1
            endif
          endif
          r1=1.d0-dsqrt(x*x+y*y)
          r2=dabs(ri(ith)-dsqrt((x-xi(ith))**2+(y-yi(ith))**2))
          r3= dabs(re-dsqrt(x*x+y*y))
        enddo
      enddo
      compt(j,9)=compt(j,9)/n
    enddo

    ! Turn hit counts into frequencies.
    do k=1,8
      do i=1,8
        compt(k,i)=compt(k,i)/n
      enddo
    enddo
    do m=1,7
      s(m)=0.d0
      do k=1,8
        do i=1,8
          s(m)=s(m)+eps/(eps+beta)*U(m,i)*compt(k,i)
        enddo
      enddo
      s(m)=s(m)/8
      s(m)=-s(m)
    enddo
    do m=1,7
      do i=1,8
        t(m,i)=0.d0
        do j=1,8
          t(m,i)=t(m,i)+eps/(eps+beta)*U(m,j)*compt(i,j)
        enddo
        t(m,i)=alpha*t(m,i)
        t(m,i)=alpha*U(m,i)-t(m,i)-alpha*s(m)
      enddo
    enddo
    ! Result: sum of squared differences between t and vre.
    f(ith)=0.d0
    do mt=1,7
      do j=1,8
        f(ith)=f(ith)+(t(mt,j)-vre(mt,j))**2
      enddo
    enddo
    return
  end subroutine cal
end module courant
program main
  ! Host driver: repeatedly samples np candidate parameter sets
  ! (x, y, z, c), evaluates them on the GPU with kernel cal, keeps the m
  ! candidates with the smallest f, and updates the sampling mean/variance
  ! from them with smoothing factor al.  Stops when any variance drops to
  ! test or below, or after k reaches 10.
  !
  ! Changes relative to the listing as posted:
  !  * multiplication operators stripped by text formatting were restored
  !    (e.g. "dsqrt(12sxu)" -> "dsqrt(12*sxu)", "ixna" -> "ix*na");
  !  * samples are stored in the host arrays xi, yi, ri, diff2 and copied to
  !    the device once per iteration; the posted "xi_d=x(i)" assigned one
  !    scalar to every element of the device array;
  !  * the selection loop is closed before the statistics loop, which reuses
  !    i; as posted the selection loop could not terminate;
  !  * the fourth parameter is sampled into c, so c (not the never-assigned
  !    a) is swapped, printed and averaged;
  !  * the assignment sb(k)=sbu was dropped because sbu is never defined.
  use cudafor
  use courant
  implicit double precision(a-h,o-z)
  integer, parameter::np=100,n=10000
  real,dimension(np)::xi,yi,ri,diff2
  real,device,dimension(np) ::xi_d,yi_d,ri_d,diff2_d,f_d
  real,dimension(np)::f
  dimension x(1000),y(1000),z(1000),a(100),b(100),c(100),v(20)
  dimension vx(1000),vy(1000),vz(1000),va(1000),vb(1000)
  dimension sx(1000),sy(1000),sz(1000),sa(1000),sb(1000)
  integer::k,i,l,kt,m
  al=0.9
  m=10
  ! Seed, multiplier and modulus of the multiplicative congruential generator.
  ix=51477
  na=16807
  nmax=2147483647
  ! Initial variances of the four sampled parameters.
  sxu=0.1d0
  syu=0.1d0
  szu=1/12d0
  sau=1/3d0

  ! Initial means of the four sampled parameters.
  vxu=0.3d0
  vyu=0.3d0
  vzu=0.5d0
  vau=1.d0

  test=0.000001d0
  !open(unit=10,file='data1.txt',status= 'unknown')
  !do j=1,10
  ! read(10,*) v(j)
  ! enddo
  !close(10)
  ! n=dint(v(10)/8)
  k=1
  vx(k)=vxu
  vy(k)=vyu
  vz(k)=vzu
  va(k)=vau
  sx(k)=sxu
  sy(k)=syu
  sz(k)=szu
  sa(k)=sau
  do while((sxu.gt.test).and.(syu.gt.test).and.(szu.gt.test) &
           .and.(sau.gt.test).and.(k.lt.10))
    k=k+1
    ! if (k.lt.8) then
    !n=n*2
    !eps1=eps1/2
    !else
    ! n=n
    !endif

    ! A uniform distribution of width t has variance t**2/12, so
    ! t = sqrt(12*variance); the interval is centred on the current mean.
    tx=dsqrt(12*sxu)
    ty=dsqrt(12*syu)
    tz=dsqrt(12*szu)
    ta=dsqrt(12*sau)

    cx=vxu-tx/2.d0
    cy=vyu-ty/2.d0
    cz=vzu-tz/2.d0
    ca=vau-ta/2.d0

    ! Draw np candidates; keep a double precision copy for the statistics
    ! and a default-real copy for the kernel.
    do i=1,np
      ix=abs(ix*na)
      ix=mod(ix,nmax)
      aa=dble(ix)/dble(nmax)
      x(i)=cx+tx*aa
      xi(i)=x(i)
      ix=abs(ix*na)
      ix=mod(ix,nmax)
      bb=dble(ix)/dble(nmax)
      y(i)=cy+ty*bb
      yi(i)=y(i)
      ix=abs(ix*na)
      ix=mod(ix,nmax)
      cc=dble(ix)/dble(nmax)
      z(i)=cz+tz*cc
      ri(i)=z(i)
      ix=abs(ix*na)
      ix=mod(ix,nmax)
      cc=dble(ix)/dble(nmax)
      c(i)=ca+ta*cc
      diff2(i)=c(i)
    enddo
    ! Host-to-device copies, one per array.
    xi_d=xi
    yi_d=yi
    ri_d=ri
    diff2_d=diff2
    !call increment(a,b)
    ! One block of np threads; thread ith handles candidate ith.
    call cal<<<1,np>>>(xi_d,yi_d,ri_d,diff2_d,n,f_d)
    i=np
    f=f_d

    ! Selection pass: move the smallest remaining f (and its parameters) to
    ! position i, for i = np down to np-m+1, so that the last m slots hold
    ! the m best candidates.
    do while (i.gt.np-m)
      l=1
      do j=1,i
        if (f(j).le.f(l)) then
          l=j
        endif
      enddo

      H=f(l)
      f(l)=f(i)
      f(i)=H
      P=x(l)
      x(l)=x(i)
      x(i)=P
      Q=y(l)
      y(l)=y(i)
      y(i)=Q
      Q1=z(l)
      z(l)=z(i)
      z(i)=Q1
      Q2=c(l)
      c(l)=c(i)
      c(i)=Q2

      i=i-1
    enddo

    do kt=np-m+1,np
      print*,f(kt),x(kt),y(kt),z(kt),c(kt)
    enddo

    tt1=0.d0
    tt2=0.d0
    tt5=0.d0
    tta=0.d0
    ttb=0.d0
    tt3=0.d0
    tt4=0.d0
    tt6=0.d0
    tta1=0.d0
    ttb1=0.d0

    ! First and second moments of the m retained candidates.
    do i=np-m+1,np
      tt1=tt1+x(i)
      tt2=tt2+y(i)
      tt5=tt5+z(i)
      tta=tta+c(i)

      tt3=tt3+x(i)**2
      tt4=tt4+y(i)**2
      tt6=tt6+z(i)**2
      tta1=tta1+c(i)**2

    enddo
    vx(k)=tt1/m
    vy(k)=tt2/m
    vz(k)=tt5/m
    va(k)=tta/m

    sx(k)=tt3/m-vx(k)*vx(k)
    sy(k)=tt4/m-vy(k)*vy(k)
    sz(k)=tt6/m-vz(k)*vz(k)
    sa(k)=tta1/m-va(k)*va(k)

    ! Blend the new estimates with the previous iteration's values.
    vxu=al*vx(k)+(1-al)*vx(k-1)
    vyu=al*vy(k)+(1-al)*vy(k-1)
    vzu=al*vz(k)+(1-al)*vz(k-1)
    vau=al*va(k)+(1-al)*va(k-1)

    sxu=al*al*sx(k)+(1-al)**2*sx(k-1)
    syu=al*al*sy(k)+(1-al)**2*sy(k-1)
    szu=al*al*sz(k)+(1-al)**2*sz(k-1)
    sau=al*al*sa(k)+(1-al)**2*sa(k-1)
  enddo
end program main
Please see the red part
Thanks a lot

MatColgrove · June 20, 2013, 6:37pm

Hi Camaptrang,

Try compiling with “-Mcuda=cc20”.

It appears to me that the back-end NVIDIA compiler is choking on using a double precision variable as the while loop conditional variable when targeting a compute capability 1.3 device. The workaround is to specifically target a later device (i.e. cc20 or cc30) or update your code to use an integer instead of “vie” in the while loop.

r3=dabs(re-dsqrt(x*x+y*y)) 
do while(vie.ge.0.5d0)   <<< Change this to an integer variable
  if ((r1.ge.eps1).and.(r3.ge.eps1).and.(r2.ge.eps1)) then

Mat

camaptrang · June 21, 2013, 8:20am

Thank you so much,
I tried with -Mcuda=cc20, but all the results I got were 0.00000. I also tried another example that I had compiled with -Mcuda: with -Mcuda it gave the exact result, but with -Mcuda=cc20 it gave 0.00000. I also used implicit double precision for the second example. I don’t know why this happens.
Thanks in advance

MatColgrove · June 21, 2013, 4:46pm

What’s the compute capability of your device (see the output from the pgaccelinfo utility)?

You may also want to add in error checking after your kernel call to see if the kernel is failing.

call cal<<<1,np>>>(xi_d,yi_d,ri_d,diff2_d,n,f_d)
istat = cudaThreadSynchronize()
istat = cudaGetLastError()
if (istat.ne.0) then
  print *, cudaGetErrorString(istat)
endif

Mat

camaptrang · June 24, 2013, 9:49am

hi Mat,
I did as you told me. I got “invalid device function”. What does that mean?
Thanks

MatColgrove · June 24, 2013, 11:25pm

It’s most likely that your device doesn’t support double precision. What is your compute capability? (which can be found via the utility pgaccelinfo).

Mat

camaptrang · June 25, 2013, 11:53am

Hi Mat,
I use GeForce 310M. Cuda version: 5050
PGI Compiler Option: -ta=nvidia,cc12. What does ‘cc12’ mean? Which declaration must I use for real numbers?
Thanks in advance.
Giang

MatColgrove · June 26, 2013, 4:42pm

Hi Giang,

Yes, your card doesn’t support double precision floating point. This was added to devices with compute capability 1.3 (cc13), but your card is CC 1.2.

For real numbers use “REAL” (without the -r8 flag) or “REAL*4”.

Also, make sure you aren’t using double precision constant values either (0.0D).

Mat

Topic		Replies	Views
First try compile errors Legacy PGI Compilers	15	14345	August 29, 2013
compilation of device function with cuda fortran Legacy PGI Compilers	4	4257	April 9, 2016
double precision Legacy PGI Compilers	6	18641	July 15, 2009
Problem loading data to shared memory Legacy PGI Compilers	12	6738	December 24, 2010
PGF90-W-0155-Compiler failed ... with PGI 12.4 Legacy PGI Compilers	17	11275	August 30, 2012
Accelerator Fatal Error: No NVIDIA/CUDA version... Legacy PGI Compilers	12	14654	May 15, 2017
Signal 11 when compiling for profiling Legacy PGI Compilers	15	15584	September 10, 2015
compiler ask acc routine information for internal function Legacy PGI Compilers	12	20312	October 25, 2017
PGI 12.9: PGF90-W-0155-Compiler failed ... Load of NULL symb Legacy PGI Compilers	1	2217	September 25, 2012
FATAL ERROR at run time Legacy PGI Compilers	5	8115	December 18, 2014

Compiler failed to translate accelerator region

Related topics