Why the measure time for second kernel is extremely short?


__global__ void test_kernel(int *ans, int *tmp_buf) {
    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int warp_id = thread_id / warp_size;
    int warp_lane = warp_id & (warps_per_block - 1);
    int thread_lane = thread_id & (warp_size - 1);

    int count = 0;
    int *tmp = tmp_buf + warp_id * 1000;

    for (int i = thread_lane; i < 64; i += warp_size) {
        count += i;
    }
    if (thread_lane == 0) atomicAdd(ans, count);
}

int main() {

    int ans = 0;
    auto elapsed = 0.0f;

    int *d_ans, *tmp_buf;
    cudaMalloc(&d_ans, sizeof(int));
    cudaMemset(d_ans, 0, sizeof(int));
    cudaMalloc(&tmp_buf, warp_count * 1000 * sizeof(int));

    int warmup = 10;
    cudaEvent_t t1, t2;
    cudaEventCreate(&t1);
    cudaEventCreate(&t2);
    cudaEventRecord(t1, 0);
    test_kernel<<<grid_size, block_size>>>(d_ans, tmp_buf);
    
    cudaEventRecord(t2, 0);
    cudaEventSynchronize(t2);
    cudaEventElapsedTime(&elapsed, t1, t2);
    std::cout << "GPU elapsed time: " << elapsed << " ms" << std::endl;
    cudaMemcpy(&ans, d_ans, sizeof(int), cudaMemcpyDeviceToHost);

    cudaMemset(d_ans, 0, sizeof(int));

    cudaEventRecord(t1, 0);
    
    test_kernel<<<grid_size, block_size>>>(d_ans, tmp_buf);
    
    cudaEventRecord(t2, 0);
    cudaEventSynchronize(t2);
    cudaEventElapsedTime(&elapsed, t1, t2);
    std::cout << "GPU elapsed time: " << elapsed << " ms" << std::endl;
    cudaMemcpy(&ans, d_ans, sizeof(int), cudaMemcpyDeviceToHost);


    cudaEventRecord(t1, 0);
    
    test_kernel<<<grid_size, block_size>>>(d_ans, tmp_buf);
    
    cudaEventRecord(t2, 0);
    cudaEventSynchronize(t2);
    cudaEventElapsedTime(&elapsed, t1, t2);
    std::cout << "GPU elapsed time: " << elapsed << " ms" << std::endl;
    cudaMemcpy(&ans, d_ans, sizeof(int), cudaMemcpyDeviceToHost);

    return 0;
}

Hi, I got a strange problem when measuring kernel time with cuda event.

Why the time of first execution is much longer than the later one, I just measure the same kernel?
How to measure it correctly?
Does it mean that I need to run it in several iterations?
Like

cudaEventRecord(t1, 0);
for (int i = 0; i < iterations; ++i) 
      kernel<<<>>>>();
cudaEventRecord(t2, 0);
cudaEventSynchronize(t2);
cudaEventElapsedTime(&elapsed, t1, t2);
auto time =  elapsed / iterations;

Or I need to put the event inside the loop, measuring once for each iteration?

There are warm-up times involved for the first kernel to load and possibly compile it.

See also 1. Introduction — CUDA C++ Programming Guide (lazy loading)

You can also modify the behaviour with an environment variable.

For accurate benchmarking you should always throw away the first result.

I know the lazy loading. However, I use the same query code

assert(CUDA_SUCCESS == cuInit(0));
  assert(CUDA_SUCCESS == cuModuleGetLoadingMode(&mode));

  std::cout << "CUDA Module Loading Mode is " << ((mode == CU_MODULE_LAZY_LOADING) ? "lazy" : "eager") << std::endl;

It shows I am in eager mode.

Do I still need to through away the first result?

There are various other reasons, the first result can be slower, e.g. the GPU reduces the frequency or turns off some SMs, when idle. And it takes a small bit of time to accelerate the frequency again.

Some reasons are GPU-wide (you can warm up with other kernels), some are kernel specific.

Actually I test another my hand-written kernel;

cudaEvent_t t1, t2;
    cudaEventCreate(&t1);
    cudaEventCreate(&t2);
    cudaEventRecord(t1, 0);
    test_kernel2<<<grid_size, block_size>>>(d_ans, tmp_buf);
    
    cudaEventRecord(t2, 0);
    cudaEventSynchronize(t2);
    cudaEventElapsedTime(&elapsed, t1, t2);
    std::cout << "GPU elapsed time: " << elapsed << " ms" << std::endl;
    cudaMemcpy(&ans, d_ans, sizeof(int), cudaMemcpyDeviceToHost);

cudaEventRecord(t1, 0);
    
    test_kernel2<<<grid_size, block_size>>>(d_ans, tmp_buf);
    
    cudaEventRecord(t2, 0);
    cudaEventSynchronize(t2);
    cudaEventElapsedTime(&elapsed, t1, t2);
std::cout << "GPU elapsed time: " << elapsed << " ms" << std::endl;

I run the same kernel twice using cuda event for each kernel;
However, the second time results is always near 0. I don’t know what is the reason?

image

Does your kernels run successfully? Perhaps there was an error (e.g. index overflow) and you have to reset the GPU state after the first kernel run?