Matlab cuFFT


I am having difficulty figuring out (from some examples) how CUDA code works when the data are already on the GPU. I am interested in performing a cuFFT from MATLAB, but how am I supposed to change cudaMemcpy, cudaMalloc, mxCreateNumericMatrix, etc. in the following example? Any help would be appreciated.

#include "mex.h"
#include <limits.h>
#include <cuda_runtime.h>
#include <cufft.h>

/*
 * MEX gateway: 2-D real-to-complex FFT of a real single-precision matrix,
 * computed on the GPU with cuFFT.
 *
 * Usage from MATLAB:  Y = thisMexFile(A)
 *   A : real single matrix of size M-by-N that lives in host (CPU) memory.
 *   Y : complex single matrix of size (floor(M/2)+1)-by-N holding the
 *       non-redundant half of the spectrum produced by CUFFT_R2C.
 *
 * This gateway always copies host -> device -> host. For input that already
 * lives on the GPU (a gpuArray), those copies would instead be replaced by the
 * mxGPUArray API declared in gpu/mxGPUArray.h, which is not used here.
 *
 * Errors (reported through mexErrMsgTxt): wrong argument count, input that is
 * not real single, empty or oversized input, and any CUDA/cuFFT failure.
 * All device resources are released before an error is raised.
 *
 * Note: the result is written through mxGetData/mxGetImagData, i.e. the
 * separate real/imaginary storage of the default MEX complex API (the same
 * API family the original mxGetPr/mxGetPi calls belonged to).
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    (void)nlhs; /* plhs[0] may always be assigned, even when nlhs == 0 */

    if (nrhs != 1)
        mexErrMsgTxt("Invalid number of input arguments");

    /* Only prhs[0] exists here; never touch prhs[1]. */
    if (!mxIsSingle(prhs[0]))
        mexErrMsgTxt("input data type must be single");
    if (mxIsComplex(prhs[0]))
        mexErrMsgTxt("input data must be real");

    const float *A = (const float *)mxGetData(prhs[0]);

    const size_t numARows = mxGetM(prhs[0]);
    const size_t numACols = mxGetN(prhs[0]);

    if (numARows == 0 || numACols == 0)
        mexErrMsgTxt("input must not be empty");
    /* cufftPlan2d takes its dimensions as int. */
    if (numARows > (size_t)INT_MAX || numACols > (size_t)INT_MAX)
        mexErrMsgTxt("input dimensions are too large for cuFFT");

    /* R2C keeps only the first floor(n/2)+1 entries along the fastest
     * dimension, which is the MATLAB row (first) dimension. */
    const size_t outRows = numARows / 2 + 1;
    const size_t outCols = numACols;
    const size_t outCount = outRows * outCols;
    const size_t inBytes = sizeof(float) * numARows * numACols;
    const size_t outBytes = sizeof(cufftComplex) * outCount;

    /* Do the MATLAB-managed allocations first: if they fail, MATLAB aborts
     * the MEX call, and at this point no device memory is held yet. */
    plhs[0] = mxCreateNumericMatrix(outRows, outCols, mxSINGLE_CLASS, mxCOMPLEX);
    cufftComplex *hostOut = (cufftComplex *)mxMalloc(outBytes);

    float *deviceA = NULL;
    cufftComplex *deviceOut = NULL;
    cufftHandle plan = 0;
    int havePlan = 0;
    const char *err = NULL;

    if (cudaMalloc((void **)&deviceA, inBytes) != cudaSuccess)
        err = "cudaMalloc failed for the input buffer";
    if (!err && cudaMemcpy(deviceA, A, inBytes, cudaMemcpyHostToDevice) != cudaSuccess)
        err = "cudaMemcpy (host to device) failed";
    if (!err && cudaMalloc((void **)&deviceOut, outBytes) != cudaSuccess)
        err = "cudaMalloc failed for the output buffer";

    if (!err) {
        /* MATLAB stores M-by-N column-major, which is the same memory layout
         * as an N-by-M row-major array; cuFFT is row-major with nx slowest
         * and ny fastest, hence nx = numACols, ny = numARows. */
        if (cufftPlan2d(&plan, (int)numACols, (int)numARows, CUFFT_R2C) == CUFFT_SUCCESS)
            havePlan = 1;
        else
            err = "cufftPlan2d failed";
    }
    if (!err && cufftExecR2C(plan, deviceA, deviceOut) != CUFFT_SUCCESS)
        err = "cufftExecR2C failed";
    /* This blocking copy on the default stream also waits for the FFT. */
    if (!err && cudaMemcpy(hostOut, deviceOut, outBytes, cudaMemcpyDeviceToHost) != cudaSuccess)
        err = "cudaMemcpy (device to host) failed";

    /* Release GPU resources on every path; cudaFree(NULL) is a no-op. */
    if (havePlan)
        cufftDestroy(plan);
    cudaFree(deviceOut);
    cudaFree(deviceA);

    if (err) {
        mxFree(hostOut);
        mexErrMsgTxt(err); /* does not return */
    }

    /* De-interleave cuFFT's (re, im) pairs into MATLAB's separate planes.
     * The linear element order is identical on both sides (see plan note). */
    float *real = (float *)mxGetData(plhs[0]);
    float *imag = (float *)mxGetImagData(plhs[0]);
    for (size_t i = 0; i < outCount; ++i) {
        real[i] = hostOut[i].x;
        imag[i] = hostOut[i].y;
    }

    mxFree(hostOut);
}



Merci, Nicolas.