I tried to analyze CUDA runtime API calls with nsys, but I found that some of the runtime API calls I make do not appear in the cuda_api_sum report.
My code is as follows:
// Element type shared by the host buffer and both device matrices.
using mt = float;

// Minimal reproducer for profiling CUDA runtime API calls:
// selects device 0, allocates two sz x sz device matrices, copies one host
// buffer to the first matrix, then releases everything.
//
// Returns 0 when every call succeeds and 1 if the host allocation or any
// CUDA runtime call fails. On the success path the sequence of CUDA API
// calls is: cudaSetDevice, cudaMalloc x2, cudaMemcpy, cudaFree x2.
int main() {
    const size_t sz = 4096;
    // Size in bytes of one sz x sz matrix; computed once so the host and
    // device allocations and the copy cannot disagree.
    const size_t bytes = sizeof(mt) * sz * sz;

    // The host buffer is intentionally left uninitialized: only the
    // transfer itself is of interest here, not the data being moved.
    mt *host_a = (mt *)malloc(bytes);
    if (host_a == nullptr) {
        return 1;
    }

    mt *d_MatA = nullptr;
    mt *d_MatB = nullptr;
    int rc = 0;

    // Short-circuit evaluation stops at the first failing call, so later
    // calls never run against a device/allocation that was not set up.
    if (cudaSetDevice(0) != cudaSuccess ||
        cudaMalloc(&d_MatA, bytes) != cudaSuccess ||
        cudaMalloc(&d_MatB, bytes) != cudaSuccess ||
        cudaMemcpy(d_MatA, host_a, bytes, cudaMemcpyDefault) != cudaSuccess) {
        rc = 1;
    }

    // Pointers that were never allocated are still nullptr here, and
    // cudaFree on a null pointer performs no operation, so cleanup is
    // unconditional.
    if (cudaFree(d_MatA) != cudaSuccess) {
        rc = 1;
    }
    if (cudaFree(d_MatB) != cudaSuccess) {
        rc = 1;
    }
    free(host_a);
    return rc;
}
I compile and profile it with the following commands:
nvcc -arch=sm_80 res.cu ; nsys profile --stats=true a.out
The output is as follows:
[5/8] Executing 'cuda_api_sum' stats report
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ---------- ---------- -------- -------- ----------- ----------------------
97.4 14039502 1 14039502.0 14039502.0 14039502 14039502 0.0 cudaMemcpy
1.5 215939 2 107969.5 107969.5 67436 148503 57323.0 cudaFree
1.1 153313 2 76656.5 76656.5 50035 103278 37648.5 cudaMalloc
0.0 2004 1 2004.0 2004.0 2004 2004 0.0 cuModuleGetLoadingMode
I call cudaSetDevice(0) in my code, so why doesn't it show up in the nsys output?