Hello,
Can anyone help me with this.
Old Code: Inside fortran
call sfftw_plan_dft_3d(plan,n1,n2,n3,cx,cx,ifset,64)
call sfftw_execute (plan)
call sfftw_destroy_plan (plan)
New Code: Inside Fortran
call tempfft(n1,n2,n3,cx,direction)
tempfft.cu
#include <stdio.h>
#include <cufft.h>
#include <cutil.h>
#include <cuComplex.h>
#include “cuda.h”
extern “C” void tempfft_(int *n1, int *n2, int *n3,cufftComplex *data, int direction)
{
int Nx = *n1;
int Ny = *n2;
int Nz = *n3;
cufftComplex *d_data;
CUT_DEVICE_INIT();
CUDA_SAFE_CALL(cudaMalloc((void**) &d_data, sizeof(cufftComplex)*Nx*Ny*Nz));
CUDA_SAFE_CALL(cudaMemcpy(d_data, data, Nx*Ny*Nz*sizeof(cufftComplex), cudaMemcpyHostToDevice));
cufftHandle plan1;
CUDA_SAFE_CALL(cufftPlan3d(&plan1, Nz, Ny, Nx, CUFFT_C2C));
if(direction<0)
CUDA_SAFE_CALL(cufftExecC2C(plan1, (cufftComplex *)d_data, (cufftComplex *)d_data, CUFFT_FORWARD));
else
CUDA_SAFE_CALL(cufftExecC2C(plan1, (cufftComplex *)d_data, (cufftComplex *)d_data, CUFFT_INVERSE));
CUDA_SAFE_CALL(cudaMemcpy(data, d_data, Nx*Ny*Nz*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cufftDestroy(plan1));
cudaFree(d_data);
return;
}
When I simulate the above codes inside a big FORTRAN Application
FFTW code takes about 21 minutes for each step while the CUDA code is taking about 66 minutes for each step.
a) Is there any way I can increase the performance ?
Thanks