I’ve been timing some basic GPU code that simply executes an fft using cuda events, and also timing the same code using the cuda profiler. I’m getting very different results for the two methods and I’m wondering what exactly is going on, or what I’m missing. I’m using cuda 2.3 running on Centos5 with a gtx295 and calling my cuda code from a mex-file. below are some example execution times:
using cudaprof, GPU time: 24.16
using cuda events: 133usec
The GPU time column on the cuda profiler is reporting GPU execution time in usec, right?
Also, I’m using cudaThreadSynchronize() before I start the event timer and before I stop the event timer in order to make sure that I’m getting an accurate time using events, so I’m not sure why the results are so different>
Please help.
Thanks in advance.
// check for proper data type
if(!mxIsSingle(prhs[0])){
mexErrMsgTxt("Input array should be of type float!");
}
if(!mxIsUint32(prhs[1])){
mexErrMsgTxt("FFT size should be of type unsigned int!");
}
// declare host arrays
float *time_r_host,*time_i_host;
float *freq_r_host, *freq_i_host;
cufftComplex *time_host, *freq_host;
// declare host vars
unsigned int inlen,nfft,numfft;
unsigned int llidx;
// declare device arrays
cufftComplex *time_gpu;
cufftComplex *freq_gpu;
// declare gpu timing related vars
cudaEvent_t startcufft, stopcufft;
float timecufft;
// create event objects for gpu timing
cudaEventCreate(&startcufft);
cudaEventCreate(&stopcufft);
// get array dimensions and user inut params
nfft = (unsigned int)mxGetScalar(prhs[1]);
inlen = (unsigned int)mxGetM(prhs[0]);
numfft = (unsigned int)mxGetN(prhs[0]);
// assume that the fft size is greater than the number of ffts to take
if(numfft > inlen){
unsigned int temp = numfft;
numfft = inlen;
inlen = temp;
}
// allocate host memory
time_r_host = (float*) malloc(numfft*nfft*sizeof(float));
time_i_host = (float*) malloc(numfft*nfft*sizeof(float));
time_host = (cufftComplex*)malloc(numfft*nfft*sizeof(cufftComplex));
freq_host = (cufftComplex*)malloc(numfft*nfft*sizeof(cufftComplex));
// copy over host array
memcpy(time_r_host, mxGetPr(prhs[0]), numfft*nfft*sizeof(float));
memcpy(time_i_host, mxGetPi(prhs[0]), numfft*nfft*sizeof(float));
for(llidx=0;llidx<numfft*nfft;llidx++){
time_host[llidx].x = time_r_host[llidx];
time_host[llidx].y = time_i_host[llidx];
}
//allocate device memory
cudaMalloc((void**) &time_gpu, numfft*nfft*sizeof(cufftComplex));
cudaMalloc((void**) &freq_gpu, numfft*nfft*sizeof(cufftComplex));
// check for cuda errors
checkCUDAError("cudaMalloc errors!");
// copy from host to device
cudaMemcpy(time_gpu, time_host, numfft*nfft*sizeof(cufftComplex),cudaMemcpyHostToDevice);
// create fft plan
cufftHandle fft_plan;
cufftPlan1d(&fft_plan, nfft, CUFFT_C2C, numfft);
// make sure prior device ops are finished before beginning device timer
cudaThreadSynchronize();
// record device start time
cudaEventRecord(startcufft, 0);
// execute the fft
cufftExecC2C(fft_plan, time_gpu, freq_gpu, CUFFT_FORWARD);
// block host thread until device finishes executing
cudaThreadSynchronize();
// record device stop time
cudaEventRecord(stopcufft, 0);
// wait till event has actually been recorded
cudaEventSynchronize(stopcufft);
// get elapsed time of device executiom
cudaEventElapsedTime(&timecufft, startcufft, stopcufft);
// destroy event objects
cudaEventDestroy(startcufft);
cudaEventDestroy(stopcufft);
// destroy fft plan
cufftDestroy(fft_plan);
// check for cuda errors
checkCUDAError("cuda cufft event error!");
// copy result to host
cudaMemcpy(freq_host, freq_gpu, nfft*numfft*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
// set-up outputs
// output fft result
const mwSize fftSize[2] = {nfft, numfft};
plhs[0] = mxCreateNumericArray(2, fftSize, mxSINGLE_CLASS, mxCOMPLEX);
freq_r_host = (float*) mxGetPr(plhs[0]);
freq_i_host = (float*) mxGetPi(plhs[0]);
for(llidx=0;llidx<nfft*numfft;llidx++){
freq_r_host[llidx] = freq_host[llidx].x;
freq_i_host[llidx] = freq_host[llidx].y;
}
// output fft execution time
plhs[1] = mxCreateDoubleScalar((double) timecufft);
// free up memory on device
cudaFree(time_gpu);
cudaFree(freq_gpu);
// free up memory on host
free(time_r_host);
free(time_i_host);
free(time_host);
free(freq_host);