Different GPU execution times using cuda events and cudaprof

cudaprog1691 · January 21, 2011, 5:30pm

I’ve been timing some basic GPU code that simply executes an fft using cuda events, and also timing the same code using the cuda profiler. I’m getting very different results for the two methods and I’m wondering what exactly is going on, or what I’m missing. I’m using cuda 2.3 running on Centos5 with a gtx295 and calling my cuda code from a mex-file. below are some example execution times:

using cudaprof, GPU time: 24.16

using cuda events: 133usec

The GPU time column on the cuda profiler is reporting GPU execution time in usec, right?

Also, I’m using cudaThreadSynchronize() before I start the event timer and before I stop the event timer in order to make sure that I’m getting an accurate time using events, so I’m not sure why the results are so different>

Please help.

Thanks in advance.

// check for proper data type

    if(!mxIsSingle(prhs[0])){

        mexErrMsgTxt("Input array should be of type float!");

    }

if(!mxIsUint32(prhs[1])){

        mexErrMsgTxt("FFT size should be of type unsigned int!");

    }

// declare host arrays

    float *time_r_host,*time_i_host;

    float *freq_r_host, *freq_i_host;

    cufftComplex *time_host, *freq_host;

// declare host vars

    unsigned int inlen,nfft,numfft;

    unsigned int llidx;

// declare device arrays

    cufftComplex *time_gpu;

    cufftComplex *freq_gpu;

// declare gpu timing related vars

    cudaEvent_t startcufft, stopcufft;

    float timecufft;

// create event objects for gpu timing

    cudaEventCreate(&startcufft);

    cudaEventCreate(&stopcufft);

// get array dimensions and user inut params

    nfft = (unsigned int)mxGetScalar(prhs[1]);

inlen = (unsigned int)mxGetM(prhs[0]);

numfft = (unsigned int)mxGetN(prhs[0]);    

// assume that the fft size is greater than the number of ffts to take

    if(numfft > inlen){

        unsigned int temp = numfft;

        numfft = inlen;

        inlen = temp;

    }

// allocate host memory

    time_r_host = (float*) malloc(numfft*nfft*sizeof(float));

    time_i_host = (float*) malloc(numfft*nfft*sizeof(float));

    time_host = (cufftComplex*)malloc(numfft*nfft*sizeof(cufftComplex));

    freq_host = (cufftComplex*)malloc(numfft*nfft*sizeof(cufftComplex));

// copy over host array

    memcpy(time_r_host, mxGetPr(prhs[0]), numfft*nfft*sizeof(float));

    memcpy(time_i_host, mxGetPi(prhs[0]), numfft*nfft*sizeof(float));

for(llidx=0;llidx<numfft*nfft;llidx++){

        time_host[llidx].x = time_r_host[llidx];

        time_host[llidx].y = time_i_host[llidx];

    }

//allocate device memory

    cudaMalloc((void**) &time_gpu, numfft*nfft*sizeof(cufftComplex));

    cudaMalloc((void**) &freq_gpu, numfft*nfft*sizeof(cufftComplex));

// check for cuda errors

    checkCUDAError("cudaMalloc errors!");

// copy from host to device

    cudaMemcpy(time_gpu, time_host, numfft*nfft*sizeof(cufftComplex),cudaMemcpyHostToDevice);

// create fft plan

    cufftHandle fft_plan;

cufftPlan1d(&fft_plan, nfft, CUFFT_C2C, numfft);

// make sure prior device ops are finished before beginning device timer

    cudaThreadSynchronize();

// record device start time

    cudaEventRecord(startcufft, 0);

// execute the fft

    cufftExecC2C(fft_plan, time_gpu, freq_gpu, CUFFT_FORWARD);

// block host thread until device finishes executing

    cudaThreadSynchronize();

// record device stop time

    cudaEventRecord(stopcufft, 0);

// wait till event has actually been recorded

    cudaEventSynchronize(stopcufft);

// get elapsed time of device executiom

    cudaEventElapsedTime(&timecufft, startcufft, stopcufft);

// destroy event objects

    cudaEventDestroy(startcufft);

    cudaEventDestroy(stopcufft);

// destroy fft plan

    cufftDestroy(fft_plan);

// check for cuda errors

    checkCUDAError("cuda cufft event error!");

// copy result to host

    cudaMemcpy(freq_host, freq_gpu, nfft*numfft*sizeof(cufftComplex), cudaMemcpyDeviceToHost);

// set-up outputs

// output fft result

    const mwSize fftSize[2] = {nfft, numfft};

    plhs[0] = mxCreateNumericArray(2, fftSize, mxSINGLE_CLASS, mxCOMPLEX);

    freq_r_host = (float*) mxGetPr(plhs[0]);

    freq_i_host = (float*) mxGetPi(plhs[0]);

for(llidx=0;llidx<nfft*numfft;llidx++){

        freq_r_host[llidx] = freq_host[llidx].x;

        freq_i_host[llidx] = freq_host[llidx].y;

    }   

// output fft execution time

    plhs[1] = mxCreateDoubleScalar((double) timecufft);

// free up memory on device

    cudaFree(time_gpu);

    cudaFree(freq_gpu);

// free up memory on host

    free(time_r_host);

    free(time_i_host);

    free(time_host);

    free(freq_host);

Topic		Replies	Views
How to get exact measurement of CPU and GPU running time? CUDA Programming and Performance cuda	2	1820	August 12, 2023
Number of GPU clock cycles CUDA Programming and Performance	15	10408	June 16, 2017
On timing and timer CUDA Programming and Performance	7	4191	July 15, 2009
cuda visual profiler CUDA Programming and Performance	12	8169	July 30, 2008
How to show CuFFT routines show higher performance than normal MATLAB fft() in terms of time taken. CUDA Programming and Performance	13	3171	July 10, 2014
GPU running time is not stable CUDA Programming and Performance	5	3058	April 24, 2010
cufftExecC2C and cudaMemcpyAsync Doing the FFT without 100% CPU usage CUDA Programming and Performance	6	27102	September 5, 2008
CUDA OpenCL comparison CUDA Programming and Performance	9	3403	August 23, 2011
timing performance of kernels how ? cudaprof vs cudaEventRecord vs cutStartTimer CUDA Programming and Performance	3	5300	March 21, 2009
CPU usage at 99% while kernel is running CUDA Programming and Performance	5	4464	September 3, 2008

Different GPU execution times using cuda events and cudaprof

Related topics