I have a problem with the following piece of code.

The program calls a subroutine that computes the FFT of the input using the cuFFT library.

The code is successfully compiled, the output however is an empty vector.

If I do everything in the same program (without subroutine), the program works fine.

Looks like a problem of memory synchronization.

I compile using:

nvfortran -fast -acc -gpu=managed -Minfo=accel fftx_sub.f90 -o fftx_sub -L/usr/local/cuda/lib64 -lcufft

program test

use, intrinsic :: iso_c_binding

implicit none

integer(c_int), parameter :: nx=32,fpz=33,fpy=32

real(c_double) :: u(nx,fpz,fpy)

complex(c_double_complex) :: uc(nx/2+1,fpz,fpy)

double precision :: dx

integer :: i

dx=6.28/(nx-1)

do i=1,nx+1

u(i,:,:)=sin((i-1)*dx)

enddodo i=1,nx

write(,) “f(x)”, u(i,1,1)

enddocall computefftx(u,uc)

end programsubroutine computefftx(u,uc)

use, intrinsic :: iso_c_binding

use cufft

use openacc

integer(c_int) :: inembed(3),onembed(3)

integer(c_int) :: nx,npz,npy

integer :: cudaplan_x_fwd,cudaplan_x_bwd

real(c_double) :: u(nx,npz,npy)

complex(c_double_complex) :: uc(nx/2+1,npz,npy)

integer :: istride,ostride,idist,odist

integer :: dims(1),inx=32

npz=32

npy=32

inembed=[nx,npz,npy]

onembed=[nx/2+1,npz,npy]

istride=1

ostride=1

idist=nx

odist=nx/2+1

gerr=0

dims(1)=nxgerr=gerr+cufftPlanMany(cudaplan_x_fwd,1,dims,inembed,istride,idist,onembed,ostride,odist,CUFFT_D2Z,npz*npy)

do i=1,nx

write(,) “f(x)”, u(i,1,1)

enddo!$acc data copyin(u) copyout(uc)

gerr=gerr+cufftSetStream(cudaplan_x_fwd,acc_get_cuda_stream(acc_async_sync))

!$acc host_data use_device(u,uc)

gerr=gerr+cufftExecD2Z(cudaplan_x_fwd,u,uc)

!$acc end host_data

!$acc end data

write(,) “gerr”, gerrdo i=1,nx/2

write(,) “spectral f(x)”, uc(i,1,1)

enddoreturn

end