I have a problem with the following piece of code.
The program calls a subroutine that computes the FFT of the input using the cuFFT library.
The code is successfully compiled, the output however is an empty vector.
If I do everything in the same program (without subroutine), the program works fine.
Looks like a problem of memory synchronization.
I compile using:
nvfortran -fast -acc -gpu=managed -Minfo=accel fftx_sub.f90 -o fftx_sub -L/usr/local/cuda/lib64 -lcufft
program test
use, intrinsic :: iso_c_binding
implicit none
integer(c_int), parameter :: nx=32,fpz=33,fpy=32
real(c_double) :: u(nx,fpz,fpy)
complex(c_double_complex) :: uc(nx/2+1,fpz,fpy)
double precision :: dx
integer :: i
dx=6.28/(nx-1)
do i=1,nx+1
u(i,:,:)=sin((i-1)*dx)
enddodo i=1,nx
write(,) “f(x)”, u(i,1,1)
enddocall computefftx(u,uc)
end programsubroutine computefftx(u,uc)
use, intrinsic :: iso_c_binding
use cufft
use openacc
integer(c_int) :: inembed(3),onembed(3)
integer(c_int) :: nx,npz,npy
integer :: cudaplan_x_fwd,cudaplan_x_bwd
real(c_double) :: u(nx,npz,npy)
complex(c_double_complex) :: uc(nx/2+1,npz,npy)
integer :: istride,ostride,idist,odist
integer :: dims(1),inx=32
npz=32
npy=32
inembed=[nx,npz,npy]
onembed=[nx/2+1,npz,npy]
istride=1
ostride=1
idist=nx
odist=nx/2+1
gerr=0
dims(1)=nxgerr=gerr+cufftPlanMany(cudaplan_x_fwd,1,dims,inembed,istride,idist,onembed,ostride,odist,CUFFT_D2Z,npz*npy)
do i=1,nx
write(,) “f(x)”, u(i,1,1)
enddo!$acc data copyin(u) copyout(uc)
gerr=gerr+cufftSetStream(cudaplan_x_fwd,acc_get_cuda_stream(acc_async_sync))
!$acc host_data use_device(u,uc)
gerr=gerr+cufftExecD2Z(cudaplan_x_fwd,u,uc)
!$acc end host_data
!$acc end data
write(,) “gerr”, gerrdo i=1,nx/2
write(,) “spectral f(x)”, uc(i,1,1)
enddoreturn
end