Hi,
I am testing a CUDA Fortran program that calls device code using the curand_device subroutines. I would like to use a 2D thread mapping in which each thread is given its own random seed. I tried different numbers of threads per block. Sometimes the device code seemed to be completely ignored and no random numbers were assigned to the array. This appears to happen when the threads-per-block value (tPB, which is used for both block dimensions) is greater than or equal to 27. It did not seem to happen when I used only a 1D thread/block structure.
The code was compiled with the following command:
pgf90 -Mcuda=nollvm -Mcudalib=curand test.cuf -o test.exe
It can be run with the command:
> ./test.exe 50 10 32
I also experimented to find which combination of options causes the issue, and I noticed that it appears when I compile with -Mcuda=nollvm. Are there any constraints that keep the NoLLVM code generator from supporting more threads per block in a 2D structure? I do not seem to have an alternative, since the device cuRAND code needs NoLLVM in order to compile.
Some of the properties are as follows:
Device Name: Tesla M2090
Device Revision Number: 2.0
Global Memory Size: 6442123264
Total Constant Memory: 65536
Total Shared Memory per Block: 49152
Registers per Block: 32768
Warp Size: 32
Maximum Threads per Block: 1024
Maximum Block Dimensions: 1024, 1024, 64
Maximum Grid Dimensions: 65535 x 65535 x 65535
The code is attached at the end of the message.
Thanks a lot for your help!
Jimmy
! RNG device module
!
! Provides a single kernel, rngenerator, that fills a device array with
! uniform random numbers using the cuRAND device API (one generator state
! per thread).
module rng_kernel
  use curand_device
  implicit none
contains

  !> Fill a(1:size(a)) with one uniform random number per thread.
  !!
  !! The kernel expects a 2D grid of 2D blocks. Each thread computes a
  !! 1-based global linear index `iam` from its (x, y) position; threads
  !! with iam <= size(a) initialise their own XORWOW state with the seed
  !! 2*iam + 5678 and store a single curand_uniform draw in a(iam).
  !! Threads with iam > size(a) do nothing.
  !!
  !! a : device array to fill; elements beyond the number of launched
  !!     threads are left untouched.
  attributes(global) subroutine rngenerator( a )
    real, device, intent(inout) :: a(:)
    type(curandStateXORWOW) :: h
    integer, parameter :: i8 = 8
    integer(kind=i8) :: seed, seq, offset, ix, iy, iam, n

    n = size(a)

    ! 2D thread-block structure, 1 random seed per thread.
    ! The index arithmetic is done in 64-bit integers (the 1_i8 literals
    ! promote the 32-bit built-in index variables).
    ix  = ThreadIdx%x + (BlockIdx%x - 1_i8) * BlockDim%x
    iy  = ThreadIdx%y + (BlockIdx%y - 1_i8) * BlockDim%y
    ! Row-major linearisation: a full row holds BlockDim%x * GridDim%x threads.
    iam = ix + (iy - 1_i8) * BlockDim%x * GridDim%x

    ! Only threads that actually own an element pay for the generator
    ! set-up; surplus threads in the last blocks exit immediately.
    if (iam <= n) then
      seq    = 0_i8
      offset = 0_i8
      seed   = iam*2_i8 + 5678_i8
      call curand_init(seed, seq, offset, h)
      a(iam) = curand_uniform(h)
      ! Diagnostic output from the device (one line per active thread).
      write(*,*) iam, a(iam)
    end if
    return
  end subroutine rngenerator

end module rng_kernel
program test
  ! Host driver for the rngenerator kernel.
  !
  ! Command line:  N [nB] [tPB]
  !   N   : number of random numbers to generate (required, >= 1)
  !   nB  : blocks per grid dimension   (0 or absent -> derived from N and tPB)
  !   tPB : threads per block dimension (0 or absent -> default_tpb)
  !
  ! The kernel is launched on an nB x nB grid of tPB x tPB blocks, the result
  ! is copied back to the host, and the sample mean is checked against the
  ! interval [0.4, 0.6].
  use rng_kernel
  use cudafor
  implicit none

  ! A tPB x tPB block must not exceed the device limit on threads per block
  ! (1024 on the device listed above); 16 x 16 = 256 is a safe default.
  integer(kind=8), parameter :: default_tpb = 16

  real, allocatable, device :: a(:)
  real, allocatable :: c(:)
  real :: rmean
  integer :: nargs, ios, istat
  integer(kind=8) :: n, nB, tPB
  logical :: passing
  character(len=100) :: arg
  type(dim3) :: griddims, blockdims

  ! User-input parameters: total number of values, grid dim, and block dim
  nargs = command_argument_count()
  if (nargs < 1) then
    call show_help()
    stop
  end if

  n   = 0
  nB  = 0
  tPB = 0

  call get_command_argument(1, arg)
  read(arg, *, iostat=ios) n
  if (ios /= 0 .or. n < 1) call bad_arguments()

  if (nargs >= 2) then
    call get_command_argument(2, arg)
    read(arg, *, iostat=ios) nB
    if (ios /= 0 .or. nB < 0) call bad_arguments()
  end if

  if (nargs >= 3) then
    call get_command_argument(3, arg)
    read(arg, *, iostat=ios) tPB
    if (ios /= 0 .or. tPB < 0) call bad_arguments()
  end if

  if (tPB == 0) tPB = default_tpb
  ! Smallest square grid such that (nB*tPB)**2 threads cover n elements.
  if (nB == 0) nB = ceiling(sqrt(real(n))/real(tPB))

  ! dim3 components are default integers, so convert explicitly.
  griddims  = dim3(int(nB),  int(nB),  1)
  blockdims = dim3(int(tPB), int(tPB), 1)

  allocate(a(n))
  allocate(c(n))
  a = 0.0
  passing = .true.

  call rngenerator<<<griddims,blockdims>>> (a) ! Calling device code !

  ! A kernel launch does not report failure by itself: if the requested
  ! configuration is rejected (e.g. too many threads or too many resources
  ! per block) the kernel simply never runs and a stays 0. Query the status.
  istat = cudaGetLastError()
  if (istat /= cudaSuccess) then
    write(*,*) 'Kernel launch failed: ', trim(cudaGetErrorString(istat))
    stop 1
  end if
  ! Errors raised while the kernel executes are reported at synchronization.
  istat = cudaDeviceSynchronize()
  if (istat /= cudaSuccess) then
    write(*,*) 'Kernel execution failed: ', trim(cudaGetErrorString(istat))
    stop 1
  end if

  c = a

  write(*,*) 'Number of threads needed: ', n
  write(*,*) 'Total number of threads generated:', nB*nB*tPB*tPB

  rmean = sum(c)/real(n)
  if ((rmean .lt. 0.4) .or. (rmean .gt. 0.6)) then
    passing = .false.
    print *,"Mean is ",rmean," which fails"
  else
    print *,"Mean is ",rmean," which passes"
  end if

  deallocate(a)
  deallocate(c)

contains

  ! Print the command-line usage summary.
  subroutine show_help()
    print *, 'usage: trand_uniform.exe N [nB] [tPB]'
    print *, ''
    print *, ' - parameters: '
    print *, ' N: total number of the random number needed'
    print *, ' nB: number of blocks (not utilized in method 1)'
    print *, ' tPB: number of threads per block'
    print *, ''
  end subroutine show_help

  ! Report an unparsable or out-of-range argument and terminate.
  subroutine bad_arguments()
    print *, 'error: invalid command-line argument: ', trim(arg)
    call show_help()
    stop 1
  end subroutine bad_arguments

end program test