Hi,
we need to combine OpenACC and CUDA, but we are struggling with the correct passing of device pointers from OpenACC to the CUDA kernel. I created a small example program.
When compiling we get the error message:
PGF90-S-0528-Argument number 2 to daxpy: device attribute mismatch (main.f90: 47)
0 inform, 0 warnings, 1 severes, 0 fatal for testdaxpy
How to correctly pass VecData%vec%vec1 to the CUDA kernel?
Thank you for your help!
my_type.f90
module MyData
implicit none
type vecs
real*8 vec1(1000)
end type vecs
type Dtype
type(vecs) :: Vec
end type Dtype
end module MyData
module MyParams
use MyData
implicit none
type(Dtype) :: VecData
end module MyParams
main.f90
module mathOps
contains
attributes(global) subroutine daxpy(x, y, a)
implicit none
real*8 :: x(:), y(:)
real*8, value :: a
integer :: i, n
n = size(x)
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
!write(*,*)i
if (i <= n) y(i) = y(i) + a*x(i)
end subroutine daxpy
end module mathOps
program testDaxpy
use mpi
use mathOps
use cudafor
use accel_lib
use MyParams
implicit none
integer, parameter :: N = 1000
integer ierr, npe0, iam0, numdevices
real*8 :: x(N), y(N), a
real, device :: x_d(N), y_d(N)
type(dim3) :: grid, tBlock
call MPI_Init(ierr)
write(*,*) "Programstart"
call MPI_Comm_size(mpi_comm_world,npe0,ierr)
call MPI_Comm_rank(mpi_comm_world,iam0,ierr)
numdevices = acc_get_num_devices(acc_device_nvidia)
if(iam0.eq.0)then
write(*,*)"# of GPUs per node:",numdevices
endif
if(numdevices.ne.0)then
call acc_set_device_num(mod(iam0,numdevices),acc_device_nvidia)
endif
tBlock = dim3(32,1,1)
grid = dim3(ceiling(real(N)/tBlock%x),1,1)
x = 1.0; y = 2.0; a = 2.0
!$acc data copyin(x, VecData)
!$acc host_data use_device(x, VecData%vec%vec1)
call daxpy<<<grid, tBlock>>>(x, VecData%vec%vec1, a)
!$acc end host_data
!$acc end data
call MPI_Barrier(mpi_comm_world, ierr)
call MPI_Finalize(ierr)
end program testDaxpy
makefile
all:
mpif90 -acc -Mcuda -ta=tesla,deepcopy my_type.f90 main.f90