Hi all,
I was just wondering what’s the best way to time the overhead associated with a kernel launch? For example, I would think that I should use a host-side timer. Is this correct?
Also, I wrote an empty kernel and have been timing it using cudaEvents, which I think is the wrong way to measure kernel launch overhead because events are timed on the host. However, I observed that the times reported from running this empty kernel increase approximately linearly with the number of blocks launched. Is this expected? I would think that since all blocks do the same thing (nothing), the on-GPU time reported by the event timer would be constant for any number of blocks. The code posted below illustrates what I'm doing.
#include "mex.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "cufft.h"
#include "matrix.h"
// Empty kernel: does no work at all. Launching it and timing the launch
// isolates kernel launch/scheduling overhead from computation cost.
__global__ void empty(){
}
// Report the most recent CUDA runtime error (if any) to the MATLAB console.
// Note: cudaGetLastError() also clears the sticky error state, so later
// checks start clean.
void checkCUDAError(const char *msg)
{
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        return;
    }
    mexPrintf("Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    /* exit(EXIT_FAILURE); */ /* deliberately not called: it would take the
                                 whole Matlab process down with it */
}
// main mex-function
//
// Usage (from MATLAB):
//   [out, t] = thismex(input, inlen, filt, filtlen)
//   prhs[0] : complex single-precision input array
//   prhs[1] : number of input samples (scalar)
//   prhs[2] : single-precision filter coefficients
//   prhs[3] : number of filter taps (scalar)
//   plhs[0] : complex single output (the input round-tripped through the GPU)
//   plhs[1] : elapsed device time around the empty-kernel launch, in
//             milliseconds (cudaEventElapsedTime units)
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]){
    // validate argument count before indexing prhs
    if(nrhs < 4){
        mexErrMsgTxt("Four inputs required: input array, input length, filter coefficients, filter length!");
    }
    // check for proper data type
    if(!mxIsSingle(prhs[0]) || !mxIsSingle(prhs[2])){
        mexErrMsgTxt("Input array and filter coefficients should be of type float!");
    }
    // mxGetPi below returns NULL for a real array, which would crash memcpy
    if(!mxIsComplex(prhs[0])){
        mexErrMsgTxt("Input array must be complex!");
    }
    // host staging buffers
    float *input_r_host, *input_i_host;
    float *output_r_host, *output_i_host;
    float *filt_host;
    // host scalars
    unsigned int inlen;
    unsigned int filtlen;
    // device arrays
    float *time_r_gpu, *time_i_gpu;
    float *filt_gpu;
    // gpu timing related vars
    cudaEvent_t startempty, stopempty;
    float timeempty = 0.0f;
    // create event objects for gpu timing
    cudaEventCreate(&startempty);
    cudaEventCreate(&stopempty);
    // get array dimensions and user input params
    inlen = (unsigned int) mxGetScalar(prhs[1]);
    filtlen = (unsigned int) mxGetScalar(prhs[3]);
    // allocate host memory, bailing out cleanly on failure
    input_r_host = (float*) malloc(inlen*sizeof(float));
    input_i_host = (float*) malloc(inlen*sizeof(float));
    filt_host = (float*) malloc(filtlen*sizeof(float));
    if(input_r_host == NULL || input_i_host == NULL || filt_host == NULL){
        free(input_r_host);
        free(input_i_host);
        free(filt_host);
        mexErrMsgTxt("Host memory allocation failed!");
    }
    // copy MATLAB data into the staging buffers
    memcpy(input_r_host, mxGetPr(prhs[0]), inlen*sizeof(float));
    memcpy(input_i_host, mxGetPi(prhs[0]), inlen*sizeof(float));
    memcpy(filt_host, mxGetData(prhs[2]), filtlen*sizeof(float));
    // allocate device memory
    cudaMalloc((void**) &time_r_gpu, inlen*sizeof(float));
    cudaMalloc((void**) &time_i_gpu, inlen*sizeof(float));
    cudaMalloc((void**) &filt_gpu, filtlen*sizeof(float));
    // check for cuda errors
    checkCUDAError("cudaMalloc errors!");
    // copy from host to device
    cudaMemcpy(time_r_gpu, input_r_host, inlen*sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(time_i_gpu, input_i_host, inlen*sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(filt_gpu, filt_host, filtlen*sizeof(float),cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy H2D errors!");
    // drain prior device work so it is not attributed to the timed region
    // (cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize)
    cudaDeviceSynchronize();
    // launch configuration for the empty kernel
    dim3 threads(512);
    dim3 blocks((inlen+threads.x-1)/threads.x);
    // Time the launch with device events: record start, launch, record stop
    // IMMEDIATELY afterwards. The original code synchronized the host before
    // recording the stop event, which folded host-sync overhead into the
    // measured interval; cudaEventSynchronize(stop) alone is the correct way
    // to wait for the interval to complete.
    cudaEventRecord(startempty, 0);
    empty<<<blocks, threads>>>();
    cudaEventRecord(stopempty, 0);
    // check for launch errors (bad config etc.)
    checkCUDAError("emptyerrors!");
    // wait until the stop event has actually been recorded on the device
    cudaEventSynchronize(stopempty);
    // get elapsed device time in milliseconds
    cudaEventElapsedTime(&timeempty, startempty, stopempty);
    // destroy event objects
    cudaEventDestroy(startempty);
    cudaEventDestroy(stopempty);
    // check for cuda errors
    checkCUDAError("cuda empty event error!");
    // set-up outputs: complex single array of the same length as the input
    const mwSize outSize[2] = {inlen, 1};
    plhs[0] = mxCreateNumericArray(2, outSize, mxSINGLE_CLASS, mxCOMPLEX);
    output_r_host = (float*) mxGetPr(plhs[0]);
    output_i_host = (float*) mxGetPi(plhs[0]);
    // copy result device -> host, straight into the MATLAB output array
    cudaMemcpy(output_r_host, time_r_gpu, inlen*sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(output_i_host, time_i_gpu, inlen*sizeof(float), cudaMemcpyDeviceToHost);
    checkCUDAError("cudaMemcpy D2H errors!");
    // output measured launch time
    plhs[1] = mxCreateDoubleScalar((double) timeempty);
    // free up memory on device
    cudaFree(time_r_gpu);
    cudaFree(time_i_gpu);
    cudaFree(filt_gpu);
    // free up memory on host
    free(input_r_host);
    free(input_i_host);
    free(filt_host);
}