Check out section “3.2.6.2 Event” in the programming guide:
The following code sample creates two events:
// Handles for the two events that bracket the timed region.
cudaEvent_t start;
cudaEvent_t stop;

// Create both events before they are recorded.
cudaEventCreate(&start);
cudaEventCreate(&stop);
These events can be used to time the code sample of the previous section in the following way:
// Record `start` on stream 0 before any of the work is issued.
cudaEventRecord(start, 0);

// Issue the host-to-device copy for each of the two streams.
for (int s = 0; s < 2; ++s) {
    cudaMemcpyAsync(inputDev + s * size, inputHost + s * size, size,
                    cudaMemcpyHostToDevice, stream[s]);
}

// Launch the kernel once per stream (100 blocks of 512 threads, no dynamic
// shared memory).
for (int s = 0; s < 2; ++s) {
    myKernel<<<100, 512, 0, stream[s]>>>(outputDev + s * size,
                                         inputDev + s * size, size);
}

// Issue the device-to-host copy of each stream's results.
for (int s = 0; s < 2; ++s) {
    cudaMemcpyAsync(outputHost + s * size, outputDev + s * size, size,
                    cudaMemcpyDeviceToHost, stream[s]);
}

// Record `stop` on stream 0 after all of the work has been issued, then make
// the host wait until that event has completed.
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

// Query the time elapsed between the two events (reported in milliseconds).
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
They are destroyed this way:
// Release both timing events once they are no longer needed.
cudaEventDestroy(start);
cudaEventDestroy(stop);
It looks like the nbody SDK example project uses these event timers:
// Times `iterations` calls to nbody->update() using the CUDA events
// startEvent/stopEvent and prints the total time plus derived throughput
// figures.
//
// iterations: number of timed update() calls (one extra untimed call is made
//             first as a warm-up).
void runBenchmark(int iterations)
{
    // once without timing to prime the GPU
    nbody->update(activeParams.m_timestep);

    cutilSafeCall(cudaEventRecord(startEvent, 0));

    for (int i = 0; i < iterations; ++i)
    {
        nbody->update(activeParams.m_timestep);
    }

    cutilSafeCall(cudaEventRecord(stopEvent, 0));

    // Wait for stopEvent to complete before reading the elapsed time. The
    // return code is checked like every other runtime call here so that a
    // failed synchronization is reported instead of silently ignored.
    cutilSafeCall(cudaEventSynchronize(stopEvent));

    float milliseconds = 0;
    cutilSafeCall(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent));

    // Presumably filled in by computePerfStats as out-parameters — confirm
    // against its declaration.
    double interactionsPerSecond = 0;
    double gflops = 0;
    computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations);

    printf("%d bodies, total time for %d iterations: %0.3f ms\n",
           numBodies, iterations, milliseconds);
    printf("= %0.3f billion interactions per second\n", interactionsPerSecond);
    printf("= %0.3f GFLOP/s at %d flops per interaction\n", gflops, 20);
}
Keep in mind, however, that these are just device timers: they will only time what’s going on with the hardware, which is perfect if you are just trying to time a kernel.