I am trying to compile a Fortran program using CUDA and cuBLAS as per an example from the NVIDIA HPC SDK documentation. My setup includes an NVIDIA A100 GPU, and I have configured the CUDA and cuBLAS environment as follows:
export NVHPC_CUDA_HOME=/usr/local/cuda
export PATH=$NVHPC_CUDA_HOME/bin:$PATH
export LD_LIBRARY_PATH=$NVHPC_CUDA_HOME/lib64:$LD_LIBRARY_PATH
The Fortran code I’m trying to compile is:
program testcublas
! compile with pgfortran -ta=tesla -cudalib=cublas -cuda testcublas.f90
call testcu1(1000)
call testcu2(1000)
end
subroutine testcu1(n)
use openacc
use cublas
integer :: a(n), b(n)
type(cublasHandle) :: h
istat = cublasCreate(h)
! Force OpenACC kernels and cuBLAS to use the OpenACC stream.
istat = cublasSetStream(h, acc_get_cuda_stream(acc_async_sync))
!$acc data copyout(a, b)
!$acc kernels
a = 1
b = 2
!$acc end kernels
! No host_data, we are lexically inside a data region
! sswap will accept any kind(4) data type
call sswap(n, a, 1, b, 1)
call cublasSswap(n, a, 1, b, 1)
!$acc end data
if (all(a.eq.1).and.all(b.eq.2)) then
print *, "Test PASSED"
else
print *, "Test FAILED"
endif
end
subroutine testcu2(n)
use openacc
use cublas
real(8) :: a(n), b(n)
a = 1.0d0
b = 2.0d0
!$acc data copy(a, b)
!$acc host_data use_device(a, b)
call dswap(n, a, 1, b, 1)
call cublasDswap(n, a, 1, b, 1)
!$acc end host_data
!$acc end data
if (all(a.eq.1.0d0).and.all(b.eq.2.0d0)) then
print *, "Test PASSED"
else
print *, "Test FAILED"
endif
end
However, I am encountering compilation errors with pgfortran
using the command:
pgfortran -gpu=cc80 -cudalib=cublas -cuda testcublas.f90
The errors are:
NVFORTRAN-S-0155-Could not resolve generic procedure cublassswap (testcublas.f90: 19)
0 inform, 0 warnings, 1 severes, 0 fatal for testcu1
NVFORTRAN-S-0034-Syntax error at or near identifier openacc (testcublas.f90: 27)
NVFORTRAN-S-0034-Syntax error at or near identifier i (testcublas.f90: 29)
NVFORTRAN-S-0310-Adjustable array can not have automatic bounds specifiers - a (testcublas.f90: 29)
0 inform, 0 warnings, 3 severes, 0 fatal for testcu2
It seems that this example, which is directly from the NVIDIA documentation, should compile without issues. Could anyone help me identify what might be going wrong?