Hello,
I sent the following email to trs@pgroup.com something like 2 weeks ago but I got no feedback. Maybe it was not the correct address.
NB: I removed some parts of the reproducing example below, because I do not want to make all of it public.
The reproducing example is cut from a much larger code, so the naming of the variables, and the consistency of that naming, may look strange.
Hello,
Upgrading from 11.1 to 11.4 version of the pgf90 compiler, I get different results for my CUDA code. More specifically, the 11.4 version with -O1 optimization gives the same results as version 11.1 with either -O1 or -O2, but 11.4 with -O2 gives a different result. I managed to produce a rather short reproducing example (150 lines, with the relevant part being only a few lines). Please find the code at the end of the email.
The outputs I get are:
[semelin@tucana]$ /opt/pgi/linux86-64/11.4/bin/pgf90 -Mcuda=cc13 -O2 test_rng.cuf
[semelin@tucana licorice_lyman_alpha_gpu_pgf90]$ ./a.out
0.3758509 2.361541 0.3758509 1.821720
-0.6886148 0.6812911 -0.2482983
[semelin@tucana]$ /opt/pgi/linux86-64/11.4/bin/pgf90 -Mcuda=cc13 -O1 test_rng.cuf
[semelin@tucana]$ ./a.out
0.3758509 2.361541 0.9046696 0.6277700
-0.4175274 0.4130869 0.8093393
In the test_rng subroutine, if I change:

    real :: r,phi1,theta1,r1
    i=(blockidx%x-1) * blockdim%x+threadidx%x
    call rng(r)
    debug_val_gpu(1,i)=r
    phi1=r*2.0*3.141592653589
    debug_val_gpu(2,i)=phi1
    call rng(r)
    debug_val_gpu(3,i)=r
    theta1=acos(r*2.0-1.0)

for
    real :: r,phi1,theta1,r1
    i=(blockidx%x-1) * blockdim%x+threadidx%x
    call rng(r)
    debug_val_gpu(1,i)=r
    phi1=r*2.0*3.141592653589
    debug_val_gpu(2,i)=phi1
    call rng(r1)
    debug_val_gpu(3,i)=r1
    theta1=acos(r1*2.0-1.0)

then the difference disappears!
Here is the complete reproducing example. Please do not distribute it more than necessary; the rng is an original implementation. Please acknowledge that you could reproduce this behavior, and let me know what you will do about it.
Benoit Semelin.
!> Host-side module holding the RNG state arrays that are seeded on the CPU
!> and then copied to the device before the kernel launch.
MODULE VARS
  implicit none
  ! One CUDA block of block_size threads per grid entry.
  ! NOTE(review): the extraction stripped '*' characters; WARP_SIZE*8 and
  ! GRID_SIZE*block_size are reconstructed from the launch configuration
  ! (30 blocks x 256 threads) used in the main program.
  integer, parameter :: GRID_SIZE=30, WARP_SIZE=32, block_size=WARP_SIZE*8
  integer, parameter :: nb_thread=GRID_SIZE*block_size
  ! Per-thread WELL RNG state: 32 words of state plus a cursor index.
  integer, save, dimension(nb_thread,0:31) :: state_rng
  integer, save, dimension(nb_thread) :: state_i
END MODULE VARS

!> Device module: kernel-side mirrors of the RNG state, debug buffers,
!> and the TEST_RNG kernel that exhibits the -O2 miscompilation.
MODULE KERNEL
  USE CUDAFOR
  implicit none
  INTEGER, PARAMETER :: NB_GPU_BLOCKS_GPU=30
  INTEGER, PARAMETER :: NB_THREAD_PER_GPU_BLOCK_GPU=256
  INTEGER, PARAMETER :: WARP_SIZE_GPU=32
  INTEGER, PARAMETER :: WARP_NB_GPU=NB_THREAD_PER_GPU_BLOCK_GPU/WARP_SIZE_GPU
  INTEGER, PARAMETER :: NB_CONCURENT_PHOT_GPU=NB_GPU_BLOCKS_GPU*NB_THREAD_PER_GPU_BLOCK_GPU
  ! Device-resident debug and result buffers, copied back to the host.
  INTEGER, device, dimension(NB_CONCURENT_PHOT_GPU) :: debug_val_int_gpu
  real(KIND=4), device, dimension(4,NB_CONCURENT_PHOT_GPU) :: debug_val_gpu
  real(KIND=4), device, dimension(3,NB_CONCURENT_PHOT_GPU) :: dir_gpu
  ! Device copies of the RNG state (one row per concurrent photon/thread).
  integer, device, dimension(NB_CONCURENT_PHOT_GPU) :: state_i_gpu
  integer, device, dimension(NB_CONCURENT_PHOT_GPU,0:31) :: state_rng_gpu
CONTAINS
  !!!
  !> Draws two uniform deviates per thread, converts them to spherical
  !> angles (phi1, theta1), and stores both raw deviates and the resulting
  !> unit direction vector for host-side comparison.  Reusing the same
  !> variable r for both rng() calls is what triggers the 11.4 -O2 bug;
  !> using a second variable r1 makes the difference disappear.
  ATTRIBUTES(GLOBAL) SUBROUTINE TEST_RNG()
    implicit none
    integer :: i
    real :: r,phi1,theta1,r1
    ! Global thread index (1-based).
    i=(blockidx%x-1) * blockdim%x+threadidx%x
    call rng(r)
    debug_val_gpu(1,i)=r
    ! phi1 uniform in [0, 2*pi); '*' operators restored from the garbled
    ! 'r2.03.141592653589' in the original post.
    phi1=r*2.0*3.141592653589
    debug_val_gpu(2,i)=phi1
    call rng(r)
    debug_val_gpu(3,i)=r
    ! theta1 with cos(theta1) uniform in [-1, 1): isotropic direction.
    theta1=acos(r*2.0-1.0)
    debug_val_gpu(4,i)=theta1
    dir_gpu(1,i)=cos(phi1)*sin(theta1)
    dir_gpu(2,i)=sin(phi1)*sin(theta1)
    dir_gpu(3,i)=cos(theta1)
  END SUBROUTINE TEST_RNG
  !!!
  !> Per-thread WELL-type RNG returning a uniform deviate in x.
  !> Body withheld by the author ("/removed/") — it reads and updates
  !> state_rng_gpu/state_i_gpu for the calling thread.
  ATTRIBUTES(DEVICE) SUBROUTINE RNG(x)
    IMPLICIT NONE
    ! /removed/ -- implementation withheld by the author
  END SUBROUTINE RNG
END MODULE KERNEL
!!!
!> Seed the per-thread RNG state on the host.  Each of the nb_thread rows
!> of state_rng gets 32 state words produced by a low-level LCG-style
!> scrambler (the 1812433253 multiplier is the Mersenne-Twister
!> initialization constant); state_i cursors are reset to 0.
SUBROUTINE INIT_WELL(seed)
  USE VARS
  implicit none
  integer, intent(in) :: seed
  integer :: st
  integer :: j,i
  st=seed
  do i=1,nb_thread
    do j=0,32-1
      state_rng(i,j)=st
      ! Low level RNG: scramble the previous word to produce the next.
      ! Intentional integer overflow wrap-around, as in MT19937 seeding.
      st=1812433253 * IEOR(st, ISHFT(st,-30)) + j + 1
    enddo
  enddo
  state_i=0
END SUBROUTINE INIT_WELL
!!!
!> Driver: seed the RNG on the host, copy the state to the device,
!> launch TEST_RNG with 30 blocks x 256 threads, and print the first
!> thread's raw deviates/angles and direction vector so the -O1 vs -O2
!> outputs can be compared.
PROGRAM test
  USE VARS
  USE KERNEL
  USE CUDAFOR
  !implicit none
  integer :: i,j,err
  integer(KIND=8) :: long
  double precision :: x,y,t1,t2
  real, dimension(nb_thread) :: result
  type(dim3) :: dimGrid, dimBlock
  integer, dimension(nb_thread) :: debug_val_int
  real, dimension(4,nb_thread) :: debug_val
  real(KIND=4), dimension(3,NB_CONCURENT_PHOT_GPU) :: dir

  CALL INIT_WELL(213847876)
  state_i=0
  ! Host -> device copy of the seeded RNG state.
  state_rng_gpu=state_rng
  state_i_gpu=state_i

  dimGrid = dim3( GRID_SIZE, 1, 1 )
  dimBlock = dim3( block_size, 1, 1 )
  call TEST_RNG<<<dimGrid, dimBlock>>>()
  err=cudathreadsynchronize()

  ! Device -> host copy of the results.
  dir=dir_gpu
  debug_val=debug_val_gpu
  write(*,*) debug_val(1:4,1)
  write(*,*) dir(1:3,1)
END PROGRAM