Compiler failed to translate accelerator region

Hi all,
When I compiled my program using -Mcuda, I got the error ‘Compiler failed to translate accelerator region (see -Minfo messages): Unexpected runtime function call (C:\Program Files\PGI\inverse1.f95:1)’. I want to know why. Is it an error in my program or in PGI? I’m using a 15-day trial version of PGI.
Thanks a lot.

Hi Camaptrang,

Can you post a reproducing example, or send one to PGI Customer Service (trs@pgroup.com)? I’d need to reproduce the error in order to determine the cause.

Thanks,
Mat

Hi all,
This is my code
module courant
contains
  ! Device kernel: thread `ith` evaluates one candidate parameter set
  ! (xi(ith), yi(ith), ri(ith), diff2(ith)) and stores a sum-of-squares
  ! misfit in f(ith).
  !
  ! NOTE: the forum rendering of this post stripped most `*` and `**`
  ! operators and split some statements across lines; they are restored here.
  !
  ! Arguments:
  !   xi, yi, ri, diff2 : per-thread candidate parameters, read at index ith
  !   n                 : number of random trajectories started per index j
  !   f                 : output, f(ith) = sum over (mt,j) of (t(mt,j)-vre(mt,j))**2
  !
  ! The trajectory loop steps (x,y) by r*cos/sin of a random angle, with r the
  ! smaller of two boundary distances; presumably a walk-on-spheres style
  ! estimate -- TODO confirm with the author.
  !
  ! NOTE(review): `U1` and `vre` are read but never assigned in this routine.
  ! NOTE(review): everything not explicitly declared is double precision via the
  ! implicit statement, which requires a device of compute capability >= 1.3.
  attributes(global) subroutine cal(xi,yi,ri,diff2,n,f)
    implicit double precision(a-h,o-z)
    real,intent(inout):: f(:)
    real,intent(in):: xi(:),yi(:),diff2(:),ri(:)
    integer,value,intent(in)::n
    !real,value,intent(in)::eps1
    integer::ith
    integer::m,i,mt,j,k
    dimension compt(100,100),te1(100),te2(100),vre(100,100)
    dimension compt1(100,100),s(100),U(100,100),t(100,100)
    !open(unit=21,file='voltagenew.txt',status='unknown')
    ! do i=1,7
    ! do j=1,8
    ! read(21,*) U(i,j)
    ! enddo
    ! enddo
    !close(21)

    ! Only threadIdx%x is used, so the kernel assumes a single thread block.
    ith = threadIdx%x

    ! Row i of U: +1 in column 1, -1 in column i+1, 0 elsewhere.
    do i=1,7
      do j=1,8
        U(i,1)=1
        U(i,i+1)=-1
        if ((j.ne.1).and.(j.ne.(i+1))) then
          U(i,j)=0
        endif
      enddo
    enddo

    eps=0.02
    alpha=0.1
    beta=0.05
    diff1=1.d0
    re=0.9d0
    ! ix, na, nmax are implicit integers: state, multiplier and modulus of the
    ! pseudo-random sequence ix = mod(abs(ix*na), nmax).
    ix=51477
    na=16807
    nmax=2147483647
    pi=4*datan(1.d0)
    eps1=0.00001d0

    ! Angular windows [te1(i), te2(i)] of half-width alpha centred on i*pi/4.
    do i=1,8
      te1(i)=i*Pi/4-alpha
      te2(i)=i*Pi/4+alpha
    enddo
    ratio=diff1/(diff1+diff2(ith))
    vieab=0.d0
    tot=0.d0
    tot2=0.d0
    do k=1,8
      compt(k,9)=0.d0
    enddo
    do k=1,8
      do i=1,8
        compt(k,i)=0.d0
      enddo
    enddo

    do j=1,8
      tt=0.d0
      do i=1,n
        do k=1,8
          compt1(j,k)=0.d0
        enddo
        ! Start point: random angle inside window j on the unit circle.
        ix=abs(ix*na)
        ix=mod(ix,nmax)
        aa=dble(ix)/dble(nmax)
        teta=te1(j)+2*aa*alpha
        x=dcos(teta)
        y=dsin(teta)

        ! vie is the "alive" flag of the trajectory (1 = keep going, 0 = stop).
        vie=1.d0
        ! r1, r2, r3: distances to the unit circle, to the circle of radius
        ! ri(ith) centred at (xi(ith),yi(ith)), and to the circle of radius re.
        r1=1.d0-dsqrt(x**2+y**2)
        r2=dabs(ri(ith)-dsqrt((x-xi(ith))**2+(y-yi(ith))**2))
        r3=dabs(re-dsqrt(x*x+y*y))
        do while(vie.ge.0.5d0)
          if ((r1.ge.eps1).and.(r3.ge.eps1).and.(r2.ge.eps1)) then
            ! Away from every boundary: jump by r in a random direction.
            ix=abs(ix*na)
            ix=mod(ix,nmax)
            aa=dble(ix)/dble(nmax)
            teta=2*aa*pi
            if ( dsqrt(x**2+y**2).le.re) then
              r=min(r2,r3)
              x=x+r*dcos(teta)
              y=y+r*dsin(teta)
            else
              r=min(r1,r3)
              x=x+r*dcos(teta)
              y=y+r*dsin(teta)
            endif
          else
            ! Within eps1 of a boundary: teta1 is the polar angle in [0,2*pi).
            d1=dsqrt(x**2+y**2)
            xn=x/d1
            teta1=dacos(xn)
            if (y.le.0.d0) then
              teta1=2*pi-dacos(xn)
            endif
            if (r1.le.eps1) then
              if (((teta1.ge.2*pi-alpha).or.(teta1.le.alpha)).or. &
                  ((teta1.ge.te1(1)).and.(teta1.le.te2(1))).or. &
                  ((teta1.ge.te1(2)).and.(teta1.le.te2(2))).or. &
                  ((teta1.ge.te1(3)).and.(teta1.le.te2(3))).or. &
                  ((teta1.ge.te1(4)).and.(teta1.le.te2(4))).or. &
                  ((teta1.ge.te1(5)).and.(teta1.le.te2(5))).or. &
                  ((teta1.ge.te1(6)).and.(teta1.le.te2(6))).or. &
                  ((teta1.ge.te1(7)).and.(teta1.le.te2(7))) &
                  ) then
                ! Hit inside one of the angular windows: count it.
                if ((teta1.ge.2*pi-alpha).or.(teta1.le.alpha)) then
                  tt=tt+eps*U1/(eps+beta)
                  compt(j,8)= compt(j,8)+1
                  compt1(j,8)= compt1(j,8)+1
                endif
                do ii=1,7
                  if ((teta1.ge.te1(ii)).and.(teta1.le.te2(ii))) then
                    tt=tt+((-1)**ii)*eps*U1/(eps+beta)
                    compt(j,ii)=compt(j,ii)+1
                    compt1(j,ii)=compt1(j,ii)+1
                  endif
                enddo
                ! Stop with probability 1 - beta/(eps+beta); otherwise re-enter.
                ix=abs(ix*na)
                ix=mod(ix,nmax)
                aa=dble(ix)/dble(nmax)
                te11=beta/(eps+beta)
                if (aa.ge.te11) then
                  vie=0.d0
                else
                  vie=1.d0
                  xa=1-eps
                  ix=abs(ix*na)
                  ix=mod(ix,nmax)
                  aa=dble(ix)/dble(nmax)
                  if (aa.le.0.5d0) then
                    ya=-eps
                  else
                    ya=+eps
                  endif
                  ! Rotate the local offset (xa,ya) by teta1.
                  x=xa*dcos(teta1)-ya*dsin(teta1)
                  y=ya*dcos(teta1)+xa*dsin(teta1)
                endif
              else
                ! Hit outside the windows: step back inside by eps.
                xa=1-eps
                ix=abs(ix*na)
                ix=mod(ix,nmax)
                aa=dble(ix)/dble(nmax)
                if (aa.le.0.5d0) then
                  ya=-eps
                else
                  ya=+eps
                endif
                x=xa*dcos(teta1)-ya*dsin(teta1)
                y=ya*dcos(teta1)+xa*dsin(teta1)
              endif
            endif
            if (r3.le.eps1) then
              ! On the circle of radius re: choose a side by comparing `ratio`
              ! with a random number bb, and the sign of ya with aa.
              ix=abs(ix*na)
              ix=mod(ix,nmax)
              aa=dble(ix)/dble(nmax)
              ix=abs(ix*na)
              ix=mod(ix,nmax)
              bb=dble(ix)/dble(nmax)
              if (ratio.ge.bb) then
                if (aa.le.0.5d0) then
                  xa=re+eps
                  ya=eps
                else
                  xa=re+eps
                  ya=-eps
                endif
              endif
              if (ratio.le.bb) then
                if (aa.ge.0.5d0) then
                  xa=re-eps
                  ya=-eps
                else
                  xa=re-eps
                  ya=eps
                endif
              endif
              x=xa*dcos(teta1)-ya*dsin(teta1)
              y=ya*dcos(teta1)+xa*dsin(teta1)

            endif
            if (r2.le.eps1) then
              ! Reached the circle around (xi,yi): the trajectory ends here.
              vie=0.d0
              vieab=vieab+1
              compt(j,9)=compt(j,9)+1
            endif
          endif
          r1=1.d0-dsqrt(x*x+y*y)
          r2=dabs(ri(ith)-dsqrt((x-xi(ith))**2+(y-yi(ith))**2))
          r3= dabs(re-dsqrt(x*x+y*y))
        enddo
      enddo
      compt(j,9)=compt(j,9)/n
    enddo

    ! Turn hit counts into frequencies.
    do k=1,8
      do i=1,8
        compt(k,i)=compt(k,i)/n
      enddo
    enddo
    do m=1,7
      s(m)=0.d0
      do k=1,8
        do i=1,8
          s(m)=s(m)+eps/(eps+beta)*U(m,i)*compt(k,i)
        enddo
      enddo
      s(m)=s(m)/8
      s(m)=-s(m)
    enddo
    do m=1,7
      do i=1,8
        t(m,i)=0.d0
        do j=1,8
          t(m,i)=t(m,i)+eps/(eps+beta)*U(m,j)*compt(i,j)
        enddo
        t(m,i)=alpha*t(m,i)
        t(m,i)=alpha*U(m,i)-t(m,i)-alpha*s(m)
      enddo
    enddo

    ! Misfit between t and vre (vre is never assigned in this routine).
    f(ith)=0.d0
    do mt=1,7
      do j=1,8
        f(ith)=f(ith)+(t(mt,j)-vre(mt,j))**2
      enddo
    enddo
    return
  end subroutine cal
end module courant
! Host driver: iteratively samples np candidate parameter sets, scores them on
! the GPU with kernel `cal`, keeps the m candidates with the smallest score and
! updates the sampling means/variances from them.
!
! NOTE: the forum rendering of this post stripped most `*` and `**` operators
! and split some statements across lines; they are restored here.
!
! Fixes relative to the posted code:
!   * Candidates are stored in the host arrays xi/yi/ri/diff2 and copied to the
!     device once per iteration. The posted `xi_d=x(i)` assigned a scalar to the
!     whole device array, so every thread received the last sample.
!   * The selection loop `do while (i.gt.np-m)` is closed right after `i=i-1`.
!     As posted it enclosed the statistics loop, which reuses `i`.
!   * The fourth parameter is sampled into a(i), the array that is sorted,
!     printed and averaged. The posted code sampled into c(i) and read a(i),
!     which was never assigned.
program main
  use cudafor
  use courant
  implicit double precision(a-h,o-z)
  integer, parameter::np=100,n=10000
  real,dimension(np)::xi,yi,ri,diff2
  real,device,dimension(np) ::xi_d,yi_d,ri_d,diff2_d,f_d
  real,dimension(np)::f
  dimension x(1000),y(1000),z(1000),a(100),b(100),c(100),v(20)
  dimension vx(1000),vy(1000),vz(1000),va(1000),vb(1000)
  dimension sx(1000),sy(1000),sz(1000),sa(1000),sb(1000)
  integer::k,i,l,kt,m

  al=0.9          ! smoothing weight given to the newest mean/variance
  m=10            ! number of best candidates kept per iteration
  ! ix, na, nmax are implicit integers: state, multiplier and modulus of the
  ! pseudo-random sequence ix = mod(abs(ix*na), nmax).
  ix=51477
  na=16807
  nmax=2147483647
  ! Initial variances ...
  sxu=0.1d0
  syu=0.1d0
  szu=1/12d0
  sau=1/3d0

  ! ... and initial means of the four sampled parameters.
  vxu=0.3d0
  vyu=0.3d0
  vzu=0.5d0
  vau=1.d0


  test=0.000001d0
  !open(unit=10,file='data1.txt',status='unknown')
  !do j=1,10
  ! read(10,*) v(j)
  ! enddo
  !close(10)
  ! n=dint(v(10)/8)
  k=1
  vx(k)=vxu
  vy(k)=vyu
  vz(k)=vzu
  va(k)=vau
  sx(k)=sxu
  sy(k)=syu
  sz(k)=szu
  sa(k)=sau
  ! NOTE(review): sbu is never assigned and sb is not used afterwards.
  sb(k)=sbu
  do while((sxu.gt.test).and.(syu.gt.test).and.(szu.gt.test) &
           .and.(sau.gt.test).and.(k.lt.10))
    k=k+1
    ! if (k.lt.8) then
    !n=n*2
    !eps1=eps1/2
    !else
    ! n=n
    !endif

    ! Width sqrt(12*s) and lower bound mean - width/2 of each sampling interval.
    tx=dsqrt(12*sxu)
    ty=dsqrt(12*syu)
    tz=dsqrt(12*szu)
    ta=dsqrt(12*sau)

    cx=vxu-tx/2.d0
    cy=vyu-ty/2.d0
    cz=vzu-tz/2.d0
    ca=vau-ta/2.d0

    ! Draw np candidates on the host.
    do i=1,np
      ix=abs(ix*na)
      ix=mod(ix,nmax)
      aa=dble(ix)/dble(nmax)
      x(i)=cx+tx*aa
      xi(i)=x(i)
      ix=abs(ix*na)
      ix=mod(ix,nmax)
      bb=dble(ix)/dble(nmax)
      y(i)=cy+ty*bb
      yi(i)=y(i)
      ix=abs(ix*na)
      ix=mod(ix,nmax)
      cc=dble(ix)/dble(nmax)
      z(i)=cz+tz*cc
      ri(i)=z(i)
      ix=abs(ix*na)
      ix=mod(ix,nmax)
      cc=dble(ix)/dble(nmax)
      a(i)=ca+ta*cc
      diff2(i)=a(i)
    enddo

    ! One host-to-device copy per array, then one thread per candidate.
    xi_d=xi
    yi_d=yi
    ri_d=ri
    diff2_d=diff2
    !call increment(a,b)
    call cal<<<1,np>>>(xi_d,yi_d,ri_d,diff2_d,n,f_d)
    i=np
    f=f_d

    ! Partial selection sort: move the smallest remaining score (and its
    ! parameters) to position i, for i = np down to np-m+1.
    do while (i.gt.np-m)
      l=1
      do j=1,i
        if (f(j).le.f(l)) then
          l=j
        endif
      enddo

      H=f(l)
      f(l)=f(i)
      f(i)=H
      P=x(l)
      x(l)=x(i)
      x(i)=P
      Q=y(l)
      y(l)=y(i)
      y(i)=Q
      Q1=z(l)
      z(l)=z(i)
      z(i)=Q1
      Q2=a(l)
      a(l)=a(i)
      a(i)=Q2

      i=i-1
    enddo

    do kt=np-m+1,np
      print*,f(kt),x(kt),y(kt),z(kt),a(kt)
    enddo

    tt1=0.d0
    tt2=0.d0
    tt5=0.d0
    tta=0.d0
    ttb=0.d0
    tt3=0.d0
    tt4=0.d0
    tt6=0.d0
    tta1=0.d0
    ttb1=0.d0

    ! Sums and sums of squares over the m selected candidates.
    do i=np-m+1,np
      tt1=tt1+x(i)
      tt2=tt2+y(i)
      tt5=tt5+z(i)
      tta=tta+a(i)

      tt3=tt3+x(i)**2
      tt4=tt4+y(i)**2
      tt6=tt6+z(i)**2
      tta1=tta1+a(i)**2

    enddo
    vx(k)=tt1/m
    vy(k)=tt2/m
    vz(k)=tt5/m
    va(k)=tta/m

    sx(k)=tt3/m-vx(k)*vx(k)
    sy(k)=tt4/m-vy(k)*vy(k)
    sz(k)=tt6/m-vz(k)*vz(k)
    sa(k)=tta1/m-va(k)*va(k)

    ! Blend the new estimates with those of the previous iteration.
    vxu=al*vx(k)+(1-al)*vx(k-1)
    vyu=al*vy(k)+(1-al)*vy(k-1)
    vzu=al*vz(k)+(1-al)*vz(k-1)
    vau=al*va(k)+(1-al)*va(k-1)

    sxu=al*al*sx(k)+(1-al)**2*sx(k-1)
    syu=al*al*sy(k)+(1-al)**2*sy(k-1)
    szu=al*al*sz(k)+(1-al)**2*sz(k-1)
    sau=al*al*sa(k)+(1-al)**2*sa(k-1)
  enddo
end program main
Please see the part marked in red.
Thanks a lot.

Hi Camaptrang,

Try compiling with “-Mcuda=cc20”.

It appears to me that the back-end NVIDIA compiler is choking on the use of a double precision variable as the while loop conditional variable when targeting a compute capability 1.3 device. The workaround is to specifically target a later device (i.e. cc20 or cc30) or to update your code to use an integer instead of “vie” in the while loop.

r3=dabs(re-dsqrt(x*x+y*y)) 
do while(vie.ge.0.5d0)   <<< Change this to an integer variable
  if ((r1.ge.eps1).and.(r3.ge.eps1).and.(r2.ge.eps1)) then
  • Mat

Thank you so much,
I tried -Mcuda=cc20, but all the results I got are 0.00000. I also tried another example that compiles with plain -Mcuda: with -Mcuda it gives the exact result, but with -Mcuda=cc20 it gives 0.00000. The second example also uses implicit double precision. I don’t know why this happens.
Thanks in advance.

What’s the compute capability of your device (see the output from the pgaccelinfo utility)?

You may also want to add in error checking after your kernel call to see if the kernel is failing.

! Launch the kernel: 1 block of np threads.
call cal<<<1,np>>>(xi_d,yi_d,ri_d,diff2_d,n,f_d)
! Wait for the device to finish so that any launch/execution error is visible.
istat = cudaThreadSynchronize()
! Fetch the last error code; this overwrites the status returned just above.
istat = cudaGetLastError()
! A non-zero status means the kernel failed; print its description.
if (istat.ne.0) then
  print *, cudaGetErrorString(istat)
endif
  • Mat

hi Mat,
I did as you told me and got “invalid device function”. What does that mean?
Thanks

It’s most likely that your device doesn’t support double precision. What is your compute capability? (It can be found via the pgaccelinfo utility.)

  • Mat

Hi Mat,
I use a GeForce 310M. CUDA version: 5050
PGI Compiler Option: -ta=nvidia,cc12. What does ‘cc12’ mean? Which declaration must I use for real numbers?
Thanks in advance.
Giang

Hi Giang,

Yes, your card doesn’t support double precision floating point. This was added to devices with compute capability 1.3 (cc13), but your card is CC 1.2.

For real numbers use “REAL” (without the -r8 flag) or “REAL*4”.

Also, make sure you aren’t using double precision constant values either (e.g. 0.0D0).

  • Mat