Hi pwautelet90876,
Unfortunately, some CUDA versions of these routines are not strictly IEEE 754 compliant. They are close but, as in this case, their results may differ slightly. See: http://www.pgroup.com/resources/accel.htm#ieee
We have had requests, mostly from the weather community, for bit-exact versions that are comparable to 64-bit x86, but we are still in the process of investigating how to accomplish this.
For your code, I do see that most answers are bit-exact, with a few off by one bit. All results agree to within 15 significant digits.
% cat power.f90
! Evaluates tab1 ** (1/3) twice -- once inside an OpenACC kernels region and
! once outside it -- and reports every element whose two results are not
! bit-identical, followed by the total number of such elements.
program power
  implicit none

  ! Problem size.
  integer, parameter :: nx = 10, ny = 10, nz = 1

  ! Output layouts: element indices, value in decimal plus raw hex, difference.
  character(len=*), parameter :: fmt_where = '(a5,i3,a3,i3,a3,i3)'
  character(len=*), parameter :: fmt_value = '(a7,f26.24,a3,z16,a1)'
  character(len=*), parameter :: fmt_delta = '(a7,g16.8)'

  ! tab1: random input; tab2: result from the kernels region (labelled "GPU"
  ! in the output); tab3: result from the plain assignment (labelled "CPU").
  real(kind=8) :: tab1(nx,ny,nz), tab2(nx,ny,nz), tab3(nx,ny,nz)
  real(kind=8) :: delta
  integer :: ix, iy, iz
  integer :: n_mismatch

  call random_number(tab1)

  ! Cube root computed inside the OpenACC kernels region.
  !$acc kernels
  tab2(:,:,:) = tab1(:,:,:) ** (1d0/3d0)
  !$acc end kernels

  ! Same expression computed outside any accelerator region.
  tab3(:,:,:) = tab1(:,:,:) ** (1d0/3d0)

  n_mismatch = 0
  do ix = 1, nx
    do iy = 1, ny
      do iz = 1, nz
        delta = abs(tab3(ix,iy,iz) - tab2(ix,iy,iz))

        ! Exact comparison on purpose: only bit-identical results are skipped.
        if (delta == 0) cycle

        n_mismatch = n_mismatch + 1
        print fmt_where, "== x:", ix, " y:", iy, " z:", iz
        print fmt_value, "GPU: ", tab2(ix,iy,iz), ' (', tab2(ix,iy,iz), ')'
        print fmt_value, "CPU: ", tab3(ix,iy,iz), ' (', tab3(ix,iy,iz), ')'
        print fmt_delta, "DIFF: ", delta
      end do
    end do
  end do

  print *, "Total non-bitexact answers: ", n_mismatch, "out of ", nx*ny*nz
end program power
% pgfortran -Kieee -Mnofma -Mvect=nosimd -g -O2 -ta=host,tesla,nofma,cuda7.5,cc50 power.f90 -o power.out -Minfo
power:
11, Generating implicit copyout(tab2(:,:,:))
Generating implicit copyin(tab1(:,:,:))
12, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
12, !$acc loop gang, vector(4) ! blockidx%z threadidx%y
!$acc loop gang, vector(32) ! blockidx%x threadidx%x
!$acc loop gang ! blockidx%y
12, Loop not vectorized/parallelized: loop count too small
Loop not vectorized: vector forms of operators do not exist: DPOWD
14, Loop not vectorized/parallelized: loop count too small
Loop not vectorized: vector forms of operators do not exist: DPOWD
19, Loop not vectorized/parallelized: contains call
% ./power.out
== x: 3 y: 6 z: 1
GPU: 0.509752278793706992132684 (3FE04FE402CF5FD0)
CPU: 0.509752278793706881110381 (3FE04FE402CF5FCF)
DIFF: 0.11102230E-15
== x: 4 y: 5 z: 1
GPU: 0.909107148171787282642242 (3FED1767DFBEA517)
CPU: 0.909107148171787171619940 (3FED1767DFBEA516)
DIFF: 0.11102230E-15
== x: 4 y: 9 z: 1
GPU: 0.582758422494587424722567 (3FE2A5F4FDC2A654)
CPU: 0.582758422494587535744870 (3FE2A5F4FDC2A655)
DIFF: 0.11102230E-15
== x: 9 y: 1 z: 1
GPU: 0.684451546327035531547267 (3FE5E706EDE57ADA)
CPU: 0.684451546327035642569570 (3FE5E706EDE57ADB)
DIFF: 0.11102230E-15
== x: 9 y: 5 z: 1
GPU: 0.446024670902884889933659 (3FDC8BAB0FAF2BE2)
CPU: 0.446024670902884834422508 (3FDC8BAB0FAF2BE1)
DIFF: 0.55511151E-16
== x: 10 y: 7 z: 1
GPU: 0.839168612099823185701553 (3FEADA7822198CE8)
CPU: 0.839168612099823296723855 (3FEADA7822198CE9)
DIFF: 0.11102230E-15
Total non-bitexact answers: 6 out of 100