I am trying to measure the elapsed time of a CUDA kernel on the GPU, but the results confuse me.
for (100){
for (100){
for (10){
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
Call CUDA kernel
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
}
}
This was my first attempt at the measurement. It gives the elapsed time of a single launch of my CUDA kernel, which is around 0.02 ms.
However, the result changes when I move the cudaEvent calls outside the for loops to measure the total elapsed time of all launches, like this:
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for (100){
for (100){
for (10){
Call CUDA kernel
}
}
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
The result is around 200 ms. Why are the results of the two approaches so different? I expected the second elapsed time to be around 100 * 100 * 10 * 0.02 = 2000 ms.
Any explanation is welcome. Thanks.