I noticed that cudaMalloc affects the CPU-side latency of the cudaLaunchKernel API call that follows it.
Scenario 1: a separate cudaMalloc immediately before each computation.
In the second loop, the first cudaLaunchKernel API call took about 14us of CPU time, and the remaining launches took about 5us each.
The code looks like this:
// Scenario 1: allocate each matrix pair just before its own compute loop.
// n is the matrix dimension (n = 1024); each device buffer holds n*n floats.
cudaMalloc(&d_x1, sizeof(float)* n * n);
cudaMalloc(&d_y1, sizeof(float)* n * n);
// Blocking host-to-device copies of the source matrices x and y.
cudaMemcpy(d_x1, x, sizeof(float)* n * n, cudaMemcpyHostToDevice);
cudaMemcpy(d_y1, y, sizeof(float)* n * n, cudaMemcpyHostToDevice);
// The first launch loop.
for (int j=0;j<KERNEL_NUMS;j++){
// Observed: in this loop the first kernel launch takes ~30us of CPU time, the rest ~5us.
matMultCUDA <<< blocks_num, THREAD_NUM, 0, stream1 >>>(d_x1 , d_y1 , d_z , n );
}
cudaStreamSynchronize(stream1);
// Allocate the second matrix pair only AFTER the first loop has finished --
// this placement is the variable under test.
cudaMalloc(&d_x2, sizeof(float)* n * n);
cudaMalloc(&d_y2, sizeof(float)* n * n);
cudaMemcpy(d_x2, x, sizeof(float)* n * n, cudaMemcpyHostToDevice);
cudaMemcpy(d_y2, y, sizeof(float)* n * n, cudaMemcpyHostToDevice);
// The second launch loop.
for (int j=0;j<KERNEL_NUMS;j++){
// Observed: the first cudaLaunchKernel in this loop takes ~14us on the CPU; the rest ~5us.
matMultCUDA <<< blocks_num, THREAD_NUM, 0, stream1 >>>(d_x2 , d_y2 , d_z , n );
}
cudaStreamSynchronize(stream1);
Scenario 2: allocate all device buffers before any kernel execution.
Here the first cudaLaunchKernel API call in the second loop took only about 5us.
// Scenario 2: allocate and copy ALL device buffers up front, before any launch.
// n is the matrix dimension (n = 1024); each device buffer holds n*n floats.
cudaMalloc(&d_x1, sizeof(float)* n * n);
cudaMalloc(&d_y1, sizeof(float)* n * n);
// Blocking host-to-device copies of the source matrices x and y.
cudaMemcpy(d_x1, x, sizeof(float)* n * n, cudaMemcpyHostToDevice);
cudaMemcpy(d_y1, y, sizeof(float)* n * n, cudaMemcpyHostToDevice);
// Second pair allocated here as well -- no cudaMalloc between the two loops.
cudaMalloc(&d_x2, sizeof(float)* n * n);
cudaMalloc(&d_y2, sizeof(float)* n * n);
cudaMemcpy(d_x2, x, sizeof(float)* n * n, cudaMemcpyHostToDevice);
cudaMemcpy(d_y2, y, sizeof(float)* n * n, cudaMemcpyHostToDevice);
// The first launch loop.
for (int j=0;j<KERNEL_NUMS;j++){
// Observed: first kernel launch ~30us of CPU time, the rest ~5us (same as scenario 1).
matMultCUDA <<< blocks_num, THREAD_NUM, 0, stream1 >>>(d_x1 , d_y1 , d_z , n );
}
cudaStreamSynchronize(stream1);
// The second launch loop.
for (int j=0;j<KERNEL_NUMS;j++){
// Observed: every launch in this loop is ~5us -- unlike scenario 1, there is no 14us first launch.
matMultCUDA <<< blocks_num, THREAD_NUM, 0, stream1 >>>(d_x2 , d_y2 , d_z , n );
}
cudaStreamSynchronize(stream1);
I don't understand why the placement of cudaMalloc affects the CPU time consumed by the kernel-launch API. Is there some implicit synchronization involved?
I also observed that the CPU time of the first launch following a cudaMalloc increases in proportion to the size of the allocation.
ENV: PCIE A100, cuda 11.4
Profiling tool: Nsight Systems.