__global__ void test_kernel(int *ans, int *tmp_buf) {
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
int warp_id = thread_id / warp_size;
int warp_lane = warp_id & (warps_per_block - 1);
int thread_lane = thread_id & (warp_size - 1);
int count = 0;
int *tmp = tmp_buf + warp_id * 1000;
for (int i = thread_lane; i < 64; i += warp_size) {
count += i;
}
if (thread_lane == 0) atomicAdd(ans, count);
}
int main() {
int ans = 0;
auto elapsed = 0.0f;
int *d_ans, *tmp_buf;
cudaMalloc(&d_ans, sizeof(int));
cudaMemset(d_ans, 0, sizeof(int));
cudaMalloc(&tmp_buf, warp_count * 1000 * sizeof(int));
int warmup = 10;
cudaEvent_t t1, t2;
cudaEventCreate(&t1);
cudaEventCreate(&t2);
cudaEventRecord(t1, 0);
test_kernel<<<grid_size, block_size>>>(d_ans, tmp_buf);
cudaEventRecord(t2, 0);
cudaEventSynchronize(t2);
cudaEventElapsedTime(&elapsed, t1, t2);
std::cout << "GPU elapsed time: " << elapsed << " ms" << std::endl;
cudaMemcpy(&ans, d_ans, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemset(d_ans, 0, sizeof(int));
cudaEventRecord(t1, 0);
test_kernel<<<grid_size, block_size>>>(d_ans, tmp_buf);
cudaEventRecord(t2, 0);
cudaEventSynchronize(t2);
cudaEventElapsedTime(&elapsed, t1, t2);
std::cout << "GPU elapsed time: " << elapsed << " ms" << std::endl;
cudaMemcpy(&ans, d_ans, sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(t1, 0);
test_kernel<<<grid_size, block_size>>>(d_ans, tmp_buf);
cudaEventRecord(t2, 0);
cudaEventSynchronize(t2);
cudaEventElapsedTime(&elapsed, t1, t2);
std::cout << "GPU elapsed time: " << elapsed << " ms" << std::endl;
cudaMemcpy(&ans, d_ans, sizeof(int), cudaMemcpyDeviceToHost);
return 0;
}
Hi, I got a strange problem when measuring kernel time with cuda event.
Why the time of first execution is much longer than the later one, I just measure the same kernel?
How to measure it correctly?
Does it mean that I need to run it in several iterations?
Like
cudaEventRecord(t1, 0);
for (int i = 0; i < iterations; ++i)
kernel<<<>>>>();
cudaEventRecord(t2, 0);
cudaEventSynchronize(t2);
cudaEventElapsedTime(&elapsed, t1, t2);
auto time = elapsed / iterations;
Or I need to put the event inside the loop, measuring once for each iteration?
