kernel launch overhead timing best practices

Hi all,

I was just wondering what’s the best way to time the overhead associated with a kernel launch? For example, I would think that I should use a host-side timer. Is this correct?

Also, I wrote an empty kernel and have been timing it using cudaEvents, which I think is the wrong way to measure kernel launch overhead because events are timed on the host. However, I observed that the times reported from running this empty kernel increase approximately linearly with the number of blocks launched. Is this expected? I would have thought that, since all blocks do the same thing (nothing), the on-GPU time reported by the event timer would be constant for any number of blocks. The code posted below illustrates what I'm doing.

#include "mex.h"

#include "cuda.h"

#include "cuda_runtime.h"

#include "cufft.h"

#include "matrix.h"

// empty kernel

// Kernel with an empty body, used solely to measure kernel launch overhead.
// Note: every launched block must still be scheduled onto an SM, which is
// why the event-timed duration can grow with the block count even though
// the kernel performs no work.
__global__ void empty(){

}

// simple utility function

// Report the most recent CUDA runtime error (if any) to the MATLAB console,
// prefixed with the caller-supplied context message. Deliberately does not
// abort: calling exit() inside a MEX file would terminate MATLAB itself.
void checkCUDAError(const char *msg)
{
	cudaError_t status = cudaGetLastError();

	if (status != cudaSuccess) {
		mexPrintf("Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
		/* exit(EXIT_FAILURE); */  /* this line will cause Matlab to exit as well */
	}
}

// main mex-function

// MEX gateway: stages a complex single-precision signal and a filter on the
// GPU, times the launch of an empty kernel with CUDA events, and returns the
// round-tripped signal plus the measured time in milliseconds.
//
// MATLAB usage:  [out, t_ms] = thismex(input, inlen, filt, filtlen)
//   prhs[0] : complex single vector  (signal samples)
//   prhs[1] : scalar                 (number of samples to use)
//   prhs[2] : single vector          (filter coefficients)
//   prhs[3] : scalar                 (number of filter taps)
//   plhs[0] : complex single vector  (signal copied through the device)
//   plhs[1] : double scalar          (event-timed duration, ms)
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]){

// validate the argument count before indexing into prhs
    if(nrhs < 4){
        mexErrMsgTxt("Four inputs expected: input, inlen, filter coefficients, filtlen!");
    }

// check for proper data type
    if(!mxIsSingle(prhs[0]) || !mxIsSingle(prhs[2])){
        mexErrMsgTxt("Input array and filter coefficients should be of type float!");
    }

// declare host arrays
    float *input_r_host, *input_i_host;
    float *output_r_host, *output_i_host;
    float *filt_host;

// declare host vars
    unsigned int inlen;
    unsigned int filtlen;

// declare device arrays
    float *time_r_gpu, *time_i_gpu;
    float *filt_gpu;

// declare gpu timing related vars
    cudaEvent_t startempty, stopempty;
    float timeempty;

// get array dimensions and user input params
    inlen = (unsigned int) mxGetScalar(prhs[1]);
    filtlen = (unsigned int) mxGetScalar(prhs[3]);

// allocate host memory; fail fast if the C heap is exhausted rather than
// passing NULL into memcpy below
    input_r_host = (float*) malloc(inlen*sizeof(float));
    input_i_host = (float*) malloc(inlen*sizeof(float));
    filt_host = (float*) malloc(filtlen*sizeof(float));
    if(input_r_host == NULL || input_i_host == NULL || filt_host == NULL){
        free(input_r_host);
        free(input_i_host);
        free(filt_host);
        mexErrMsgTxt("Host memory allocation failed!");
    }

// copy over host arrays; mxGetData/mxGetImagData are the documented
// accessors for non-double (here single) numeric arrays, whereas
// mxGetPr/mxGetPi are specified for double data
    memcpy(input_r_host, mxGetData(prhs[0]), inlen*sizeof(float));
    if(mxIsComplex(prhs[0])){
        memcpy(input_i_host, mxGetImagData(prhs[0]), inlen*sizeof(float));
    }else{
        // a purely real input has no imaginary buffer; treat it as zeros
        memset(input_i_host, 0, inlen*sizeof(float));
    }
    memcpy(filt_host, mxGetData(prhs[2]), filtlen*sizeof(float));

// create event objects for gpu timing
    cudaEventCreate(&startempty);
    cudaEventCreate(&stopempty);

//allocate device memory
    cudaMalloc((void**) &time_r_gpu, inlen*sizeof(float));
    cudaMalloc((void**) &time_i_gpu, inlen*sizeof(float));
    cudaMalloc((void**) &filt_gpu, filtlen*sizeof(float));

// check for cuda errors
    checkCUDAError("cudaMalloc errors!");

// copy from host to device
    cudaMemcpy(time_r_gpu, input_r_host, inlen*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(time_i_gpu, input_i_host, inlen*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(filt_gpu, filt_host, filtlen*sizeof(float), cudaMemcpyHostToDevice);

// make sure prior device ops are finished before beginning device timer
// (cudaThreadSynchronize is deprecated; cudaDeviceSynchronize replaces it)
    cudaDeviceSynchronize();

// record device start time
    cudaEventRecord(startempty, 0);

// execute the empty kernel
    dim3 threads(512);
    dim3 blocks((inlen+threads.x-1)/threads.x);
    empty<<<blocks, threads>>>();

// record the stop event immediately after the launch: both events are
// timestamped on the device in stream order, so inserting a host-side
// synchronize before the stop record (as the original code did) only adds
// host latency to the measured interval
    cudaEventRecord(stopempty, 0);

// check for cuda errors (catches launch-configuration failures)
    checkCUDAError("emptyerrors!");

// block the host until the stop event (and hence the kernel) has completed
    cudaEventSynchronize(stopempty);

// get elapsed time of device execution
    cudaEventElapsedTime(&timeempty, startempty, stopempty);

// destroy event objects
    cudaEventDestroy(startempty);
    cudaEventDestroy(stopempty);

// check for cuda errors
    checkCUDAError("cuda empty event error!");

// set-up outputs: a complex single array mirroring the input signal
    const mwSize outSize[2] = {inlen, 1};
    plhs[0] = mxCreateNumericArray(2, outSize, mxSINGLE_CLASS, mxCOMPLEX);
    output_r_host = (float*) mxGetData(plhs[0]);
    output_i_host = (float*) mxGetImagData(plhs[0]);

// copy result to host
    cudaMemcpy(output_r_host, time_r_gpu, inlen*sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(output_i_host, time_i_gpu, inlen*sizeof(float), cudaMemcpyDeviceToHost);

// output the measured kernel time (milliseconds)
    plhs[1] = mxCreateDoubleScalar((double) timeempty);

// free up memory on device
    cudaFree(time_r_gpu);
    cudaFree(time_i_gpu);
    cudaFree(filt_gpu);

// free up memory on host
    free(input_r_host);
    free(input_i_host);
    free(filt_host);

}

I think it would be more surprising if what you are expecting were true. That would imply that the driver or runtime libraries were somehow able to analyse the code being requested for launch, determine that it did nothing before it had been run, and then not run it. That almost sounds like solving a weak version of the halting problem to me.

You’re right. I suppose I should brush up on some computability theory ;).

A thorough discussion on CUDA kernel launch overhead is contained in Section 6.1.1 of the CUDA Handbook by N. Wilt.