Thank you for your answer.
I’m using the latest version of Visual Fortran, and the CUDA Fortran Compute Capability setting is set to automatic. Could you tell me how to set the compute capability manually from the command line?
Also, I wrote a simple vector addition code to compare the number of registers used in CUDA C and CUDA Fortran. The CUDA C code uses 12 registers and the CUDA Fortran code uses 16. That’s fine, but I found that the kernel uses 22 registers if I declare the device variables in the module ‘vectorDevice’ (so that I don’t need to pass them as arguments to the kernel). Below is the version of the code that declares the device variables in the host code.
module vectorDevice
	implicit none
	contains
		! Device kernel: each thread handles one element index and adds
		! A(i) + B(i) into C(i) twenty times.
		!
		! Arguments:
		!   C    - accumulated into (read and written); the caller must give
		!          it a defined starting value before the launch.
		!   A, B - input vectors, only read.
		!   size - number of leading elements to process; threads whose
		!          index exceeds it do nothing.
		attributes(global) subroutine vectorAddKernel(C, A, B, size)
			implicit none
			integer, value :: size
			integer, dimension(:), intent(in) :: A, B
			integer, dimension(:), intent(inout) :: C
			integer :: i, n
			! 1-based global element index built from the block and thread indices.
			i = (blockIdx%x - 1) * blockDim%x + threadIdx%x
			! The last block may extend past the end of the data, so guard the access.
			if (i .le. size) then
				do n = 1, 20
					C(i) = C(i) + A(i) + B(i)
				end do
			end if
		end subroutine vectorAddKernel
end module vectorDevice
! Host driver: fills two vectors on the host, copies them to the device,
! runs vectorAddKernel, copies the result back and verifies every element
! against a value computed on the host.
program vectorFortran
	use cudafor
	use vectorDevice
	implicit none
	! Problem size and launch configuration.
	integer, parameter :: inputSize = 100000
	integer, parameter :: blockSize = 1024
	! Number of accumulation passes; must match the loop count that is
	! hard-coded inside vectorAddKernel.
	integer, parameter :: kernelPasses = 20
	integer :: gridSize, n, cValue, istat
	logical :: valid
	integer, dimension(:), allocatable :: h_A, h_B, h_C
	integer, device, dimension(:), allocatable :: d_A, d_B, d_C

	allocate(h_A(inputSize), h_B(inputSize), h_C(inputSize))
	allocate(d_A(inputSize), d_B(inputSize), d_C(inputSize))

	h_A = 1
	h_B = 2
	! Host-to-device copies.
	d_A = h_A
	d_B = h_B
	! The kernel accumulates into C, so the device buffer needs a defined
	! starting value; freshly allocated device memory is not guaranteed
	! to be zero.
	d_C = 0

	! Integer ceiling division: enough blocks to cover every element,
	! without a round trip through single-precision reals.
	gridSize = (inputSize + blockSize - 1) / blockSize

	call vectorAddKernel<<<gridSize, blockSize>>>(d_C, d_A, d_B, inputSize)

	! A launch the device rejects (for example a block size above its
	! per-block thread limit) is reported here instead of silently
	! showing up as wrong results in the check below.
	istat = cudaGetLastError()
	if (istat .ne. cudaSuccess) then
		print *, "Kernel launch failed: ", trim(cudaGetErrorString(istat))
		stop 1
	end if

	! Device-to-host copy of the result.
	h_C = d_C

	! Compare each element with the host reference and stop at the
	! first mismatch.
	valid = .true.
	do n = 1, inputSize
		cValue = kernelPasses * (h_A(n) + h_B(n))
		if (h_C(n) .ne. cValue) then
			valid = .false.
			print *, "Invalid !!!", h_C(n), ":", cValue
			exit
		end if
	end do
	if (valid) then
		print *, "Valid !!!"
	end if

	deallocate(h_A, h_B, h_C)
	deallocate(d_A, d_B, d_C)
end program vectorFortran
Thank you for your help.