Happy New Year, everyone!
According to the CUDA specs (CUDA C Programming Guide | PDF | Parallel Computing | Thread (Computing)), atomic operations can be performed on 64-bit integers since CC 1.2. However, when I try to compile the following code with select device integers declared as “integer(kind=8)” I get the error shown in triplicate below:
PGF90-S-0155-Could not resolve generic procedure atomicadd (watchdog_test.f90:23)
PGF90-S-0155-Could not resolve generic procedure atomicadd (watchdog_test.f90:24)
PGF90-S-0155-Could not resolve generic procedure atomicadd (watchdog_test.f90:25)
The code follows. Switching the “kind=4” variables to “kind=8” is sufficient to trigger the error on my machine.
! ***************************************************************************
! ***************************************************************************
module gpu_kernel
integer, device :: num_loops
integer(kind=4), device :: i, j, k
contains
! ***************************************************************************
attributes(global) subroutine cuda_kernel()
use cudafor
implicit none
! Local variables
integer :: n
integer(kind=4) :: o
real :: temp
do n = 1, num_loops
o = atomicadd(i, 1)
o = atomicadd(j, 1)
o = atomicadd(k, 1)
enddo
return
end subroutine
! ***************************************************************************
subroutine cuda_device_select()
use cudafor
implicit none
! Local variables
integer :: cudastat, nDevices, i, desired_device
real :: max_version, version
type(cudadeviceprop) :: prop
cudastat = cudaGetDeviceCount(nDevices)
max_version = 0.0
do i = 0, nDevices-1
cudastat = cudaGetDeviceProperties(prop, i)
version = float(prop%major) + 0.1*float(prop%minor)
if(version .gt. max_version) then
max_version = version
desired_device = i
endif
enddo
cudastat = cudaSetDevice(desired_device)
cudastat = cudaGetDeviceProperties(prop, desired_device)
write(*,"(' Device ',a,' selected.')") trim(prop%name)
write(*,"(' CUDA compute capability is ',i0,'.',i0)") prop%major, &!&
prop%minor
return
end subroutine
end module
! ***************************************************************************
! ***************************************************************************
program watchdog_test
use cudafor
use gpu_kernel
implicit none
! Local variables
integer(kind=4) :: m
real :: temp
i = 0; j = 0; k = 0
call cuda_device_select()
num_loops = 20000
call cuda_kernel<<<16,256>>>()
m = i; write(*,"(i0)") m
m = j; write(*,"(i0)") m
m = k; write(*,"(i0)") m
stop
end program
Am I doing something wrong, or is this a difference between CUDA Fortran and the official specs for CUDA?
Edit to add: If anyone reading this knows a slicker way to select a GPU, I’m all ears. The documentation for cudaChooseDevice didn’t make a lot of sense to me.