Different GPU execution times using cuda events and cudaprof

I’ve been timing some basic GPU code that simply executes an fft using cuda events, and also timing the same code using the cuda profiler. I’m getting very different results for the two methods and I’m wondering what exactly is going on, or what I’m missing. I’m using cuda 2.3 running on Centos5 with a gtx295 and calling my cuda code from a mex-file. below are some example execution times:

using cudaprof, GPU time: 24.16

using cuda events: 133usec

The GPU time column on the cuda profiler is reporting GPU execution time in usec, right?

Also, I’m using cudaThreadSynchronize() before I start the event timer and before I stop the event timer in order to make sure that I’m getting an accurate time using events, so I’m not sure why the results are so different>

Please help.

Thanks in advance.

// check for proper data type

    if(!mxIsSingle(prhs[0])){

        mexErrMsgTxt("Input array should be of type float!");

    }

if(!mxIsUint32(prhs[1])){

        mexErrMsgTxt("FFT size should be of type unsigned int!");

    }

// declare host arrays

    float *time_r_host,*time_i_host;

    float *freq_r_host, *freq_i_host;

    cufftComplex *time_host, *freq_host;

// declare host vars

    unsigned int inlen,nfft,numfft;

    unsigned int llidx;

// declare device arrays

    cufftComplex *time_gpu;

    cufftComplex *freq_gpu;

// declare gpu timing related vars

    cudaEvent_t startcufft, stopcufft;

    float timecufft;

// create event objects for gpu timing

    cudaEventCreate(&startcufft);

    cudaEventCreate(&stopcufft);

// get array dimensions and user inut params

    nfft = (unsigned int)mxGetScalar(prhs[1]);

inlen = (unsigned int)mxGetM(prhs[0]);

numfft = (unsigned int)mxGetN(prhs[0]);    

// assume that the fft size is greater than the number of ffts to take

    if(numfft > inlen){

        unsigned int temp = numfft;

        numfft = inlen;

        inlen = temp;

    }

// allocate host memory

    time_r_host = (float*) malloc(numfft*nfft*sizeof(float));

    time_i_host = (float*) malloc(numfft*nfft*sizeof(float));

    time_host = (cufftComplex*)malloc(numfft*nfft*sizeof(cufftComplex));

    freq_host = (cufftComplex*)malloc(numfft*nfft*sizeof(cufftComplex));

// copy over host array

    memcpy(time_r_host, mxGetPr(prhs[0]), numfft*nfft*sizeof(float));

    memcpy(time_i_host, mxGetPi(prhs[0]), numfft*nfft*sizeof(float));

for(llidx=0;llidx<numfft*nfft;llidx++){

        time_host[llidx].x = time_r_host[llidx];

        time_host[llidx].y = time_i_host[llidx];

    }

//allocate device memory

    cudaMalloc((void**) &time_gpu, numfft*nfft*sizeof(cufftComplex));

    cudaMalloc((void**) &freq_gpu, numfft*nfft*sizeof(cufftComplex));

// check for cuda errors

    checkCUDAError("cudaMalloc errors!");

// copy from host to device

    cudaMemcpy(time_gpu, time_host, numfft*nfft*sizeof(cufftComplex),cudaMemcpyHostToDevice);

// create fft plan

    cufftHandle fft_plan;

cufftPlan1d(&fft_plan, nfft, CUFFT_C2C, numfft);

// make sure prior device ops are finished before beginning device timer

    cudaThreadSynchronize();

// record device start time

    cudaEventRecord(startcufft, 0);

// execute the fft

    cufftExecC2C(fft_plan, time_gpu, freq_gpu, CUFFT_FORWARD);

// block host thread until device finishes executing

    cudaThreadSynchronize();

// record device stop time

    cudaEventRecord(stopcufft, 0);

// wait till event has actually been recorded

    cudaEventSynchronize(stopcufft);

// get elapsed time of device executiom

    cudaEventElapsedTime(&timecufft, startcufft, stopcufft);

// destroy event objects

    cudaEventDestroy(startcufft);

    cudaEventDestroy(stopcufft);

// destroy fft plan

    cufftDestroy(fft_plan);

// check for cuda errors

    checkCUDAError("cuda cufft event error!");

// copy result to host

    cudaMemcpy(freq_host, freq_gpu, nfft*numfft*sizeof(cufftComplex), cudaMemcpyDeviceToHost);

// set-up outputs

// output fft result

    const mwSize fftSize[2] = {nfft, numfft};

    plhs[0] = mxCreateNumericArray(2, fftSize, mxSINGLE_CLASS, mxCOMPLEX);

    freq_r_host = (float*) mxGetPr(plhs[0]);

    freq_i_host = (float*) mxGetPi(plhs[0]);

for(llidx=0;llidx<nfft*numfft;llidx++){

        freq_r_host[llidx] = freq_host[llidx].x;

        freq_i_host[llidx] = freq_host[llidx].y;

    }   

// output fft execution time

    plhs[1] = mxCreateDoubleScalar((double) timecufft);

// free up memory on device

    cudaFree(time_gpu);

    cudaFree(freq_gpu);

// free up memory on host

    free(time_r_host);

    free(time_i_host);

    free(time_host);

    free(freq_host);