Hello,
I wrote a simple kernel and am trying to measure its execution time. Here is the code:
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <assert.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <curand.h>

using namespace std;

#define cudaErrCheck(stat) { cudaErrCheck_((stat), __FILE__, __LINE__); }
void cudaErrCheck_(cudaError_t stat, const char *file, int line) {
    if (stat != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s %s %d\n", cudaGetErrorString(stat), file, line);
    }
}

__global__ void vecAdd(float *A, float *B, float *C, int num_elements) {
    int i = threadIdx.x;
    if (i < num_elements) {
        //printf("%d: %f, %f\n", i, A[i], B[i]);
        C[i] = A[i] + B[i];
    }
}

void hostVecAdd(float *A, float *B, float *C, int DIM) {
    for (int i = 0; i < DIM; i++) {
        C[i] = A[i] + B[i];
    }
}

int main(int argc, char** argv) {
    int DIM = 32;
    int device = 0;
    cudaErrCheck(cudaSetDevice(device));

    cudaEvent_t start_kernel;
    cudaEvent_t stop_kernel;
    cudaErrCheck(cudaEventCreate(&start_kernel));
    cudaErrCheck(cudaEventCreate(&stop_kernel));
    float time = 0.0f;

    // Allocate A, B, C on host
    float *A, *B;
    float *C;
    A = (float*)malloc(DIM*sizeof(float));
    B = (float*)malloc(DIM*sizeof(float));
    C = (float*)malloc(DIM*sizeof(float));
    for (int i = 0; i < DIM; i++) {
        A[i] = 1.0f;
        B[i] = 1.0f;
        C[i] = 0;
    }

    // Allocate A, B, C on device
    float *d_A, *d_B;
    float *d_C;
    cudaErrCheck(cudaMalloc((void**)&d_A, DIM*sizeof(float)));
    cudaErrCheck(cudaMalloc((void**)&d_B, DIM*sizeof(float)));
    cudaErrCheck(cudaMalloc((void**)&d_C, DIM*sizeof(float)));
    cudaErrCheck(cudaMemcpy(d_A, A, DIM*sizeof(float), cudaMemcpyHostToDevice));
    cudaErrCheck(cudaMemcpy(d_B, B, DIM*sizeof(float), cudaMemcpyHostToDevice));

    cudaErrCheck(cudaEventRecord(start_kernel));
    vecAdd<<<1, DIM>>>(d_A, d_B, d_C, DIM);
    cudaErrCheck(cudaGetLastError());
    cudaErrCheck(cudaEventRecord(stop_kernel));
    //cudaErrCheck(cudaEventSynchronize(stop_kernel));
    cudaErrCheck(cudaEventElapsedTime(&time, start_kernel, stop_kernel));

    cudaErrCheck(cudaMemcpy(C, d_C, DIM*sizeof(float), cudaMemcpyDeviceToHost));
    cout << "Result Device:" << endl;
    for (int i = 0; i < DIM; i++) {
        cout << C[i] << " ";
    }
    cout << endl;
    // cudaEventElapsedTime returns milliseconds; convert to microseconds
    cout << "vecAdd time: " << time * 1000 << " us" << endl;

    /*
    hostVecAdd(A, B, C, DIM);
    cout << "Result Host:" << endl;
    for (int i = 0; i < DIM; i++) {
        cout << C[i];
    }
    cout << endl;
    */

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(A);
    free(B);
    free(C);
}
I am measuring the kernel time with the events start_kernel and stop_kernel and the CUDA API cudaEventElapsedTime. This reports ~21 us.
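To be explicit, this is the timing pattern in isolation (a minimal sketch of the relevant part of the program above; the cudaEventSynchronize line is the one I currently have commented out):

// Minimal sketch of the event-based timing used above.
// In the full program the cudaEventSynchronize() call is commented out,
// so cudaEventElapsedTime() may run before stop_kernel has actually completed.
cudaEventRecord(start_kernel);                            // enqueue start marker on the default stream
vecAdd<<<1, DIM>>>(d_A, d_B, d_C, DIM);                   // enqueue the kernel
cudaEventRecord(stop_kernel);                             // enqueue stop marker
cudaEventSynchronize(stop_kernel);                        // block host until stop_kernel has completed
cudaEventElapsedTime(&time, start_kernel, stop_kernel);   // elapsed time in milliseconds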
When I run it under nsys with the following command
nsys profile --stats=true ./vecAdd
the reported kernel time is 53 us, so nsys clearly adds a large overhead, which is fine. The nsys output looks like the following:
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ------------- ------------- ---------- ------------- --------------- --------------
52.2 1,681,305,041 2 840,652,520.5 840,652,520.5 11,843,901 1,669,461,140 1,172,112,390.3 sem_wait
36.5 1,174,649,734 1,627 721,972.8 75,973.0 464 300,060,770 8,384,690.4 ioctl
9.4 301,427,246 14 21,530,517.6 10,607,628.0 10,976 100,169,539 29,018,294.7 poll
1.0 31,902,710 113 282,324.9 34,526.0 4,523 6,256,381 668,640.8 open64
0.5 16,635,107 1 16,635,107.0 16,635,107.0 16,635,107 16,635,107 0.0 waitpid
0.2 5,892,875 1 5,892,875.0 5,892,875.0 5,892,875 5,892,875 0.0 fork
0.1 2,755,539 10 275,553.9 295,848.5 187,606 424,468 76,770.9 sem_timedwait
0.1 2,551,867 46 55,475.4 14,510.0 2,614 1,217,856 177,593.6 mmap64
0.0 670,803 84 7,985.8 4,434.5 1,346 50,300 8,457.2 fopen
0.0 508,210 4 127,052.5 136,539.5 81,315 153,816 32,756.3 pthread_create
0.0 212,377 77 2,758.1 1,811.0 892 9,691 2,197.5 fclose
0.0 185,260 22 8,420.9 3,910.5 535 65,188 13,857.5 mmap
0.0 93,500 73 1,280.8 68.0 53 45,071 5,831.4 fgets
0.0 76,905 119 646.3 457.0 201 2,242 489.8 fcntl
0.0 76,618 15 5,107.9 6,119.0 1,142 6,825 1,927.8 read
0.0 63,431 5 12,686.2 7,272.0 2,306 32,633 12,220.1 open
0.0 59,103 5 11,820.6 11,184.0 3,539 21,894 7,130.1 munmap
0.0 44,676 11 4,061.5 4,315.0 1,320 7,208 2,000.5 write
0.0 20,312 2 10,156.0 10,156.0 9,722 10,590 613.8 socket
0.0 13,970 1 13,970.0 13,970.0 13,970 13,970 0.0 connect
0.0 13,696 1 13,696.0 13,696.0 13,696 13,696 0.0 pipe2
0.0 12,702 68 186.8 54.0 51 7,923 954.5 fwrite
0.0 12,304 3 4,101.3 1,753.0 643 9,908 5,059.3 putc
0.0 11,512 1 11,512.0 11,512.0 11,512 11,512 0.0 fread
0.0 5,008 1 5,008.0 5,008.0 5,008 5,008 0.0 bind
0.0 4,382 1 4,382.0 4,382.0 4,382 4,382 0.0 fopen64
0.0 4,329 8 541.1 478.0 243 1,032 232.9 dup
0.0 1,652 1 1,652.0 1,652.0 1,652 1,652 0.0 listen
0.0 1,563 19 82.3 52.0 49 485 100.2 fflush
[5/8] Executing 'cuda_api_sum' stats report
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- --------- --------- -------- --------- ----------- ----------------------
43.7 1,033,295 3 344,431.7 10,778.0 5,579 1,016,938 582,413.4 cudaFree
37.4 884,739 3 294,913.0 6,941.0 5,277 872,521 500,223.9 cudaMalloc
12.2 289,214 1 289,214.0 289,214.0 289,214 289,214 0.0 cuLibraryLoadData
2.9 67,450 3 22,483.3 24,828.0 8,436 34,186 13,034.1 cudaMemcpy
1.8 43,523 1 43,523.0 43,523.0 43,523 43,523 0.0 cudaLaunchKernel
1.2 27,975 2 13,987.5 13,987.5 4,309 23,666 13,687.5 cudaEventRecord
0.7 15,632 2 7,816.0 7,816.0 1,608 14,024 8,779.4 cudaEventCreate
0.2 4,054 1 4,054.0 4,054.0 4,054 4,054 0.0 cuModuleGetLoadingMode
[6/8] Executing 'cuda_gpu_kern_sum' stats report
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- -------- -------- -------- -------- ----------- --------------------------------------
100.0 1,761 1 1,761.0 1,761.0 1,761 1,761 0.0 vecAdd(float *, float *, float *, int)
[7/8] Executing 'cuda_gpu_mem_time_sum' stats report
Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation
-------- --------------- ----- -------- -------- -------- -------- ----------- ------------------
61.2 2,880 1 2,880.0 2,880.0 2,880 2,880 0.0 [CUDA memcpy DtoH]
38.8 1,824 2 912.0 912.0 800 1,024 158.4 [CUDA memcpy HtoD]
[8/8] Executing 'cuda_gpu_mem_size_sum' stats report
Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation
---------- ----- -------- -------- -------- -------- ----------- ------------------
0.000 2 0.000 0.000 0.000 0.000 0.000 [CUDA memcpy HtoD]
0.000 1 0.000 0.000 0.000 0.000 0.000 [CUDA memcpy DtoH]
Section [6/8] (cuda_gpu_kern_sum) shows the kernel time for vecAdd, and it is just ~1.7 us.
So my question is: why does the CUDA API cudaEventElapsedTime report 21 us while the nsys profile shows 1.7 us? And how can I measure an accurate kernel time without nsys?
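In case it helps, this is the kind of measurement loop I was considering instead (a rough sketch only; the iteration count iters is made up): one warm-up launch, then averaging the event-timed duration over many launches. I am not sure whether this is the recommended approach.

// Sketch: warm up once, then average the event-timed duration over many launches.
const int iters = 1000;                                   // arbitrary iteration count
vecAdd<<<1, DIM>>>(d_A, d_B, d_C, DIM);                   // warm-up launch
cudaErrCheck(cudaDeviceSynchronize());

cudaErrCheck(cudaEventRecord(start_kernel));
for (int it = 0; it < iters; it++) {
    vecAdd<<<1, DIM>>>(d_A, d_B, d_C, DIM);
}
cudaErrCheck(cudaEventRecord(stop_kernel));
cudaErrCheck(cudaEventSynchronize(stop_kernel));
cudaErrCheck(cudaEventElapsedTime(&time, start_kernel, stop_kernel));

// time is in milliseconds for all iters launches; report the per-launch average in us
cout << "avg vecAdd time: " << (time * 1000.0f) / iters << " us" << endl;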