And the solution is :
- do the host memory allocation on the fortran side using this:
[codebox]module cuda_alloc
interface
! cudaMallocHost
integer (C_INT) function cudaMallocHost(buffer, size) bind(C,name=“cudaMallocHost”)
use iso_c_binding
implicit none
type (C_PTR) :: buffer
integer (C_LONG), value :: size
end function cudaMallocHost
! cudaFreeHost
integer (C_INT) function cudaFreeHost(buffer) bind(C,name=“cudaFreeHost”)
use iso_c_binding
implicit none
type (C_PTR), value :: buffer
end function cudaFreeHost
end interface
end module cuda_alloc
[/codebox]
- call the cuFFT program using this subroutine :
[codebox]
subroutine r2c_3d( Nx, Ny, Nz, r, c)
use iso_c_binding
use cuda_alloc
implicit none
integer Nx, Ny, Nz,res
real*8 :: r(0:Nx-1, 0:Ny-1, 0:Nz-1)
complex*16 :: c(0:Nx/2, 0:Ny-1, 0:Nz-1)
complex*16 , parameter :: size1=dcmplx(0.0,0.0)
real*8 , parameter :: size2=dble(0.0)
type(C_PTR) :: cptr_cd, cptr_rd
complex*16 , dimension(:,:,:), pointer :: cd
real*8 , dimension(:,:,:), pointer :: rd
res = cudaMallocHost ( cptr_cd,NxNyNz*sizeof(size1)/2+1 )
call c_f_pointer ( cptr_cd,cd , [Nx/2+1,Ny,Nz] )
res = cudaMallocHost ( cptr_rd, NxNyNz*sizeof(size2) )
call c_f_pointer ( cptr_rd,rd , [Nx,Ny,Nz ] )
rd = r
call fft_3d_r2c( Nx, Ny, Nz, rd, cd)
c= cd
res = cudaFreeHost(cptr_rd)
res = cudaFreeHost(cptr_cd)
end subroutine r2c_3d
subroutine c2r_3d(Nx, Ny, Nz, c, r)
use iso_c_binding
use cuda_alloc
implicit none
integer Nx, Ny, Nz,res
real*8 :: r(0:Nx-1, 0:Ny-1, 0:Nz-1)
complex*16 :: c(0:Nx/2, 0:Ny-1, 0:Nz-1)
complex*16 , parameter :: size1=dcmplx(0.0,0.0)
real*8 , parameter :: size2=dble(0.0)
type(C_PTR) :: cptr_cd, cptr_rd
complex*16 , dimension(:,:,:), pointer :: cd
real*8 , dimension(:,:,:), pointer :: rd
res = cudaMallocHost ( cptr_cd,NxNyNz*sizeof(size1) )
call c_f_pointer ( cptr_cd,cd , [Nx/2+1,Ny,Nz] )
res = cudaMallocHost ( cptr_rd, NxNyNz*sizeof(size2) )
call c_f_pointer ( cptr_rd,rd , [Nx,Ny,Nz ] )
cd = c
call fft_3d_c2r( Nx, Ny, Nz, cd ,rd)
r = rd
res = cudaFreeHost(cptr_rd)
res = cudaFreeHost(cptr_cd)
end subroutine c2r_3d
[/codebox]
3 ) and finally this is the .cu code
[codebox]
#include <stdio.h>
#include <stdlib.h>
#include “cufft.h”
#include “cuda.h”
#include “cutil_inline.h”
extern “C” void fft_3d_r2c_( int *nx ,int *ny, int *nz, cufftDoubleReal *h_a, cufftDoubleComplex *h_b)
{
unsigned long size1 = sizeof(cufftDoubleReal)* (nx)(ny)(*nz); // calulate memory size
unsigned long size2 = sizeof(cufftDoubleComplex)* (*nx/2+1)*(*ny)*(*nz); // calulate memory size
cufftDoubleReal *d_ic=NULL; // define pointer to device mem input
cufftDoubleComplex *d_oc=NULL; // define pointer to device mem output
cufftHandle plan;
cutilSafeCall( cudaMalloc((void **)&d_ic, size1)); // allocate mem on device
cutilSafeCall( cudaMalloc((void **)&d_oc, size2)); // allocate mem on device
cutilSafeCall( cudaMemcpy(d_ic,h_a, size1 , cudaMemcpyHostToDevice)); //copy host array to device
cufftSafeCall( cufftPlan3d(&plan, (*nz),(*ny),(*nx) ,CUFFT_D2Z )); // perform 3d fft complex to complex
cufftSafeCall( cufftExecD2Z(plan, d_ic, d_oc )); // out of place
cutilSafeCall( cudaMemcpy(h_b,d_oc, size2 , cudaMemcpyDeviceToHost)); // copy back to host array
cudaFree(d_ic);
cudaFree(d_oc);
cufftDestroy(plan);
}
extern “C” void fft_3d_c2r_( int* nx ,int *ny, int *nz, cufftDoubleComplex *h_a, cufftDoubleReal *h_b)
{
unsigned long size1 = sizeof(cufftDoubleComplex)* (*nx)*(*ny)*(*nz); // calulate memory size
unsigned long size2 = sizeof(cufftDoubleReal)* (*nx)*(*ny)*(*nz); // calulate memory size
cufftDoubleComplex *d_ic=NULL; // define pointer to device mem input
cufftDoubleReal *d_oc=NULL; // define pointer to device mem output
cufftHandle plan;
cutilSafeCall( cudaMalloc((void **)&d_ic, size1)); // allocate mem on device
cutilSafeCall( cudaMalloc((void **)&d_oc, size2)); // allocate mem on device
cutilSafeCall( cudaMemcpy(d_ic,h_a, size1 , cudaMemcpyHostToDevice)); //copy host array to device
cufftSafeCall( cufftPlan3d(&plan, *nz,*ny,*nx ,CUFFT_Z2D )); // perform 3d fft complex to complex
cufftSafeCall( cufftExecZ2D(plan, d_ic, d_oc )); // out of place
cutilSafeCall( cudaMemcpy(h_b,d_oc, size2 , cudaMemcpyDeviceToHost)); // copy back to host array
cudaFree(d_ic);
cudaFree(d_oc);
cufftDestroy(plan);
}
[/codebox]
Note that only the latest gnufortran ( after aug , 2009) would work properly.
It is critical to swap the indices nx and nz in cufftPlan3d.