I am getting different timings (shown as bold in the results below) of the same kernel when executed twice, any idea why this happens?
Results:
hepta iterations = 172, err = 774.110537
CPU Execution time: 4943.710000 ms
cuda hepta iterations = 172, err = 774.110537
GPU Execution time: 200.432632 ms
Correct!
cuda hepta iterations = 172, err = 774.110537
GPU second Execution time: 170.358490 ms
Correct!
Following is the code snippet for kernel execution:
TYPE *b, *A_hepta, *x_hepta, *x_hepta_cuda;
cudaCheck(cudaMallocManaged(&A_hepta, n * 7 * sizeof(TYPE)), __LINE__);
cudaCheck(cudaMallocManaged(&b, n * sizeof(TYPE)), __LINE__);
x_hepta = (TYPE*) malloc(n * sizeof(TYPE));
cudaCheck(cudaMallocManaged(&x_hepta_cuda, n * sizeof(TYPE)), __LINE__);
init(A_hepta, b, x_hepta, x_hepta_cuda, n);
int deviceID;
cudaGetDevice(&deviceID);
cudaCheck(cudaMemPrefetchAsync(A_hepta, n * 7 * sizeof(TYPE), deviceID));
cudaCheck(cudaMemPrefetchAsync(b, n * sizeof(TYPE), deviceID));
cudaCheck(cudaMemPrefetchAsync(x_hepta_cuda, n * sizeof(TYPE), deviceID));
struct timeval start, end;
nvtxRangePushA("bicgstab_hepta");
gettimeofday(&start, NULL);
bicgstab_hepta(A_hepta, b, x_hepta, n);
gettimeofday(&end, NULL);
double elapsedTime = (end.tv_sec - start.tv_sec) * 1000.0; // sec to ms
elapsedTime += (end.tv_usec - start.tv_usec) / 1000.0; // us to ms
nvtxRangePop();
printf("CPU Execution time: %.6f ms\n", elapsedTime);
cudaEvent_t cuda_start, cuda_stop;
cudaEventCreate(&cuda_start);
cudaEventCreate(&cuda_stop);
float milliseconds = 0;
nvtxRangePushA("bicgstab_hepta_cuda");
cudaEventRecord(cuda_start);
**bicgstab_hepta_cuda(A_hepta, b, x_hepta_cuda, n);**
cudaCheck(cudaDeviceSynchronize(), __LINE__);
cudaEventRecord(cuda_stop);
cudaEventSynchronize(cuda_stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, cuda_start, cuda_stop);
nvtxRangePop();
printf("GPU Execution time: %.6f ms\n", milliseconds);
check_result(x_hepta, x_hepta_cuda, n);
//Reallocate and ReInitialize CUDA Memories
cudaFree(A_hepta);
cudaFree(b);
cudaFree(x_hepta_cuda);
cudaCheck(cudaMallocManaged(&A_hepta, n * 7 * sizeof(TYPE)), __LINE__);
cudaCheck(cudaMallocManaged(&b, n * sizeof(TYPE)), __LINE__);
cudaCheck(cudaMallocManaged(&x_hepta_cuda, n * sizeof(TYPE)), __LINE__);
init(A_hepta, b, x_hepta, x_hepta_cuda, n, GPU_INIT);
cudaCheck(cudaMemPrefetchAsync(A_hepta, n * 7 * sizeof(TYPE), deviceID));
cudaCheck(cudaMemPrefetchAsync(b, n * sizeof(TYPE), deviceID));
cudaCheck(cudaMemPrefetchAsync(x_hepta_cuda, n * sizeof(TYPE), deviceID));
nvtxRangePushA("bicgstab_hepta_second");
cudaEventRecord(cuda_start);
**bicgstab_hepta_cuda(A_hepta, b, x_hepta_cuda, n);**
cudaCheck(cudaDeviceSynchronize(), __LINE__);
cudaEventRecord(cuda_stop);
cudaEventSynchronize(cuda_stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, cuda_start, cuda_stop);
nvtxRangePop();
printf("GPU second Execution time: %.6f ms\n", milliseconds);
cudaEventDestroy(cuda_start);
cudaEventDestroy(cuda_stop);
check_result(x_hepta, x_hepta_cuda, n);
free(x_hepta);
cudaFree(A_hepta);
cudaFree(b);
cudaFree(x_hepta_cuda);