How can I get good performance from cuFFT?


I need to move some calculations to the GPU, where I will compute a batch of 32 2D FFTs, each of size 600 x 600. When I compare the performance of cuFFT with MATLAB's GPU fft, cuFFT is much slower, typically by a factor of 10 (after I have removed all overhead from things like plan creation). How is this possible? Is this what to expect from cuFFT, or is there any way to speed it up? (I would simply use MATLAB's fft if I could, but when I mix it with some iffts, sums and element-wise multiplications it becomes super-slow in an unpredictable way.)

// The core of my code

mwSize ndim = mxGPUGetNumberOfDimensions(C_q);    
mwSize const * dimSize = mxGPUGetDimensions(C_q);

// FFT test
cufftHandle plan;
int dd[3];
dd[1] = (int)dimSize[0];
dd[0] = (int)dimSize[1];
dd[2] = (int)dimSize[2];

int Nq = dd[2];
dimSize = mxGPUGetDimensions(Phi_j);
int L = dimSize[2];

// OBS quite some overhead here. Use default settings for the memory layout. Seem to give the right    answer. Ok?
cufftPlanMany(&plan, 2, dd, NULL,0,0,NULL,0,0,CUFFT_C2C,Nq);

// Loop and sum over singular values
for (int i = 0; i<L; i++)
    // Do the fft
    cufftExecC2C(plan,(cufftComplex *) pS_q,(cufftComplex *) pC_q,CUFFT_FORWARD);


/ Anders

An update with a full example for someone to test:

% Matlab side code

% Compile using:
% >> mexcuda -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\lib\x64" -lcufft

% Test data: 32 single-precision complex 600x600 matrices held on the GPU.
A = gpuArray.randn(600,600,32,'single') + 1i*randn(600,600,32,'single');

% GPU calls return before the work has finished, so every timed GPU section
% must wait for the device before toc; otherwise only the launch is measured.
gpu = gpuDevice;

% abc is presumably the compiled MEX function from this post (TODO confirm name).
B = abc(A);
wait(gpu);

tic, for ii = 1:30, B = fft2(A); end; wait(gpu); toc

% Same transforms on the CPU for reference (synchronous, no wait needed).
AA = gather(A);
tic, for ii = 1:30, B = fft2(AA); end; toc

%% Output from a run

Elapsed time is 0.193155 seconds. % Mex file
Elapsed time is 0.004172 seconds. % Matlab fft2
Elapsed time is 1.455618 seconds. % Matlab CPU

// Mex-file code in the file

#include "mex.h"
#include "gpu/mxGPUArray.h"
#include <cufft.h>

// Internal type for complex. Same as cufftComplex, just another name
typedef float2 Complex;


/*
 * Device code
 */

void mexFunction(int nlhs, mxArray *plhs,
int nrhs, mxArray const *prhs)

char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
char const * const errMsg = "Invalid input to MEX file.";

/* Declare all variables.*/
mxGPUArray const *A;
mxGPUArray *B;

Complex const *pA;

Complex *pB;

/* Initialize the MathWorks GPU API. */

/* Throw an error if the input is not a GPU array. */
if (nrhs!=1) {
    mexErrMsgIdAndTxt(errId, errMsg);

 for (int ii = 0; ii<1; ii++)
        if (!(mxIsGPUArray(prhs[ii])))
            mexErrMsgIdAndTxt(errId, errMsg);

A = mxGPUCreateFromMxArray(prhs[0]);

 // Verify that input is single arrays before extracting the pointer.
if (mxGPUGetClassID(A) != mxSINGLE_CLASS ) 
    mexErrMsgIdAndTxt(errId, errMsg);

/* Get the pointer to the data */
pA = (Complex const *)(mxGPUGetDataReadOnly(A));

/* Create a GPUArray to hold the result and get its underlying pointer. */
B = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),

pB = (Complex *)(mxGPUGetData(B));

// Now we can do work!  
mwSize const * dimSize = mxGPUGetDimensions(A);

// FFT test
cufftHandle plan;
int dd[2];
dd[1] = (int) dimSize[1];
dd[0] = (int) dimSize[0];

int Nq = (int) dimSize[2];
int L = 30;

cufftPlanMany(&plan, 2, dd, NULL,0,0,NULL,0,0,CUFFT_C2C,Nq);
for (int i = 0; i<L; i++)
    // Do the fft
    cufftExecC2C(plan,(cufftComplex *) pA,(cufftComplex *) pB,CUFFT_FORWARD);

/* Wrap the result up as a MATLAB gpuArray for return. */
plhs[0] = mxGPUCreateMxArrayOnGPU(B);

// Free resources



Why are you doing the same FFT L times in a row?

Can you do it as a batch instead?

What GPU are you running this on?