Hi all, I’ve got CUDA running on my card (Quadro FX 1700) in Fedora 8, and now I’m trying to get some evidence of a speedup by comparing CUFFT with MATLAB’s fft.
The MATLAB code and the simple CUDA code I use to get the timings are pasted below. I’m having trouble observing any speedup from CUDA. Currently, when I call the function timing(2048*2048, 6), my output is:
CUFFT:
Elapsed time is 1.038155 seconds.
MATLAB FFT:
Elapsed time is 1.596426 seconds.
which doesn’t seem so impressive…
So can anyone point out how I can get a speedup of maybe 10x in the FFT code, as mentioned in the white paper on this page? Thanks in advance!
timing.m (it can be called with, for example, the command “timing(2048, 4)”)
function [a, b] = timing(datasize, batch)
% TIMING  Time mexCUFFT against MATLAB's built-in fft on the same input.
%   [A, B] = TIMING(DATASIZE, BATCH) builds a DATASIZE-by-BATCH matrix in
%   which every column is (1:DATASIZE)', transforms it once with mexCUFFT
%   (result A) and once with fft (result B), and prints the tic/toc time of
%   each call.

% Every column holds the ramp 1..datasize, so row i is filled with the value i.
d = zeros(datasize, batch);
ramp = (1:datasize).';
for col = 1:batch
    d(:, col) = ramp;
end

% Time the CUDA MEX implementation.
disp('CUFFT: ');
tic;
a = mexCUFFT(d);
toc;

% Time the built-in FFT on the identical matrix.
disp('MATLAB FFT: ');
tic;
b = fft(d);
toc;
mexCUFFT.cu (compiled with the command “$MATLAB_CUDA/nvmex -f $MATLAB_CUDA/nvopts.sh -I/usr/local/cuda/include -L/usr/local/cuda/lib -lcufft -lcudart”, where $MATLAB_CUDA is the path of the MATLAB plugin for CUDA)
#include <limits.h>

#include "cufft.h"
#include "cuda.h"
#include "mex.h"
#include "cuda_runtime.h"
/*
 * Convert a real-valued double array into cufftComplex samples.
 *
 * output_float : destination array, must hold at least Ntot elements
 * input_re     : source real values (Ntot doubles)
 * Ntot         : number of elements to convert; nothing is written when <= 0
 *
 * The real part is narrowed from double to float and the imaginary part of
 * every sample is set to zero.
 */
void pack_r2c(cufftComplex *output_float,
              double *input_re,
              int Ntot)
{
    int k = 0;
    while (k < Ntot)
    {
        cufftComplex *sample = output_float + k;
        sample->x = (float)input_re[k];
        sample->y = 0.0f;
        ++k;
    }
}
/*
 * Interleave separate real and imaginary double arrays into cufftComplex
 * samples.
 *
 * output_float : destination array, must hold at least Ntot elements
 * input_re     : source real parts (Ntot doubles)
 * input_im     : source imaginary parts (Ntot doubles)
 * Ntot         : number of elements to convert; nothing is written when <= 0
 *
 * Both components are narrowed from double to float.
 */
void pack_c2c(cufftComplex *output_float,
              double *input_re,
              double *input_im,
              int Ntot)
{
    for (int k = 0; k < Ntot; ++k)
    {
        cufftComplex packed;
        packed.x = (float)input_re[k];
        packed.y = (float)input_im[k];
        output_float[k] = packed;
    }
}
/*
 * Split cufftComplex samples into separate real and imaginary double arrays.
 *
 * input_float : source complex samples (Ntot elements)
 * output_re   : destination for the real parts, at least Ntot doubles
 * output_im   : destination for the imaginary parts, at least Ntot doubles
 * Ntot        : number of elements to convert; nothing is written when <= 0
 *
 * Each float component is widened to double on the way out.
 */
void unpack_c2c(cufftComplex *input_float,
                double *output_re,
                double *output_im,
                int Ntot)
{
    int k = 0;
    while (k < Ntot)
    {
        const cufftComplex sample = input_float[k];
        output_re[k] = (double)sample.x;
        output_im[k] = (double)sample.y;
        ++k;
    }
}
cufftComplex *runfft(cufftComplex *data, int m, int n);
// Program use to calculate the fft for a simple matrix.
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
int m, n;
double *inDataR, *inDataI, *outDataR, *outDataI;
cufftComplex *data;
if( nrhs < 1 ) mexErrMsgTxt( "Input argument not defined." );
m = mxGetM(prhs[0]);
n = mxGetN(prhs[0]);
/* Allocating host memory. */
data = (cufftComplex *)mxMalloc(sizeof(cufftComplex) * n * m);
inDataR = mxGetPr(prhs[0]);
if( mxIsComplex(prhs[0]) )
{
/* If it is a complex data. */
inDataI = mxGetPi(prhs[0]);
pack_c2c( data, inDataR, inDataI, m*n );
}
else
{
/* If it is a real data. */
pack_r2c( data, inDataR, m*n );
}
data = runfft(data, m, n);
plhs[0] = mxCreateDoubleMatrix(m, n, mxCOMPLEX);
outDataR = mxGetPr(plhs[0]);
outDataI = mxGetPi(plhs[0]);
unpack_c2c(data, outDataR, outDataI, n*m);
mxFree(data);
return;
}
/*
 * Run a batched, in-place, forward complex-to-complex 1D FFT on the GPU.
 *
 * data : host buffer holding n contiguous signals of m samples each; it is
 *        overwritten with the transform result
 * m    : length of each transform
 * n    : number of transforms (batch count)
 *
 * Returns `data` (the same host pointer that was passed in).
 *
 * When m or n is not positive there is nothing to transform and `data` is
 * returned untouched. Any CUDA or CUFFT failure releases the device resources
 * acquired so far and then raises a MATLAB error via mexErrMsgIdAndTxt, which
 * does not return to the caller, so a failed transform is never mistaken for
 * a result.
 */
cufftComplex *runfft(cufftComplex *data, int m, int n)
{
    cufftComplex *d_data = NULL;
    cufftHandle plan;
    cufftResult fftStatus;
    cudaError_t cudaStatus;
    size_t bytes;

    if (m <= 0 || n <= 0)
        return data;

    /* Widen before multiplying so the byte count cannot overflow int. */
    bytes = sizeof(cufftComplex) * (size_t)m * (size_t)n;

    // Allocate device memory for data
    cudaStatus = cudaMalloc((void **)&d_data, bytes);
    if (cudaStatus != cudaSuccess)
        mexErrMsgIdAndTxt("mexCUFFT:cudaMalloc", "cudaMalloc failed: %s",
                          cudaGetErrorString(cudaStatus));

    // Copy host memory to device
    cudaStatus = cudaMemcpy(d_data, data, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        cudaFree(d_data);
        mexErrMsgIdAndTxt("mexCUFFT:cudaMemcpy",
                          "Host-to-device copy failed: %s",
                          cudaGetErrorString(cudaStatus));
    }

    // CUFFT plan
    fftStatus = cufftPlan1d(&plan, m, CUFFT_C2C, n);
    if (fftStatus != CUFFT_SUCCESS)
    {
        cudaFree(d_data);
        mexErrMsgIdAndTxt("mexCUFFT:cufftPlan1d",
                          "cufftPlan1d failed (cufftResult %d).",
                          (int)fftStatus);
    }

    // FFT execution (in place on the device buffer)
    fftStatus = cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
    if (fftStatus != CUFFT_SUCCESS)
    {
        cufftDestroy(plan);
        cudaFree(d_data);
        mexErrMsgIdAndTxt("mexCUFFT:cufftExecC2C",
                          "cufftExecC2C failed (cufftResult %d).",
                          (int)fftStatus);
    }

    // Copy result to host; this blocking copy also reports any error raised
    // while the transform was executing on the device.
    cudaStatus = cudaMemcpy(data, d_data, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        cufftDestroy(plan);
        cudaFree(d_data);
        mexErrMsgIdAndTxt("mexCUFFT:cudaMemcpy",
                          "Device-to-host copy failed: %s",
                          cudaGetErrorString(cudaStatus));
    }

    // Clear device memory
    cufftDestroy(plan);
    cudaFree(d_data);
    return data;
}