My test code (adapted from: cudaLaunchHostFunc blocking work on Linux - #3 by brian.budge):
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
const unsigned long long my_delay = 1000000ULL;
// Returns wall-clock microseconds elapsed since `start`; call with the
// default (0) to obtain an absolute timestamp in microseconds.
unsigned long long dtime_usec(unsigned long long start=0){
    timeval now;
    gettimeofday(&now, 0);
    unsigned long long abs_us = (unsigned long long)now.tv_sec * USECPSEC + now.tv_usec;
    return abs_us - start;
}
// Busy-wait kernel: spins until `delay` device clock ticks have elapsed.
// Launched <<<1,1>>> in this program; used purely as a controllable,
// fixed-duration workload on a stream.
__global__ void k(unsigned long long delay){
    const unsigned long long deadline = clock64() + delay;
    while (clock64() < deadline) { /* spin */ }
}
// Payload for the dt_cpu host callback.
struct d{
unsigned long long start; // host timestamp (us) captured just before the timed kernel is enqueued
} my_parms; // single global instance, reused across test() iterations
// Host callback injected via cudaLaunchHostFunc purely to measure the
// overhead of callback injection on a stream.
// NOTE(review): `args` carries a size_t BY VALUE (cast to void*), so the
// increment below touches a local copy only and has no observable effect;
// to actually count invocations, pass a size_t* and increment through it.
void testHostCallbackPerformance(void *args) {
    size_t invocations = (size_t) args;
    ++invocations;
    (void)invocations; // intentional no-op, mirrors the kernel variant below
}
// Minimal device-side counterpart of the host callback: increments a
// by-value parameter (no observable effect), so the injected work is
// comparable to testHostCallbackPerformance. Launched <<<1,1>>>.
__global__ void testKernelFuncPerformance(size_t counter){
counter++;
}
// Host callback (enqueued via cudaLaunchHostFunc) that prints the
// wall-clock microseconds elapsed since the timestamp stored in the
// d struct pointed to by `parms`.
void dt_cpu(void *parms){
    const d *timing = (const d *) parms;
    const unsigned long long elapsed = dtime_usec(timing->start);
    std::cout << "CB Duration: " << elapsed << "us" << std::endl;
}
// Launches a fixed-delay kernel on a private stream and measures its
// duration using one of three compile-time-selected methods:
//   -DUSE_EVENT : cudaEvent pair bracketing the kernel
//   -DUSE_CB    : host callback printing a wall-clock delta
//   (default)   : gettimeofday around a full device synchronize
// Optionally injects extra work after the timed kernel:
//   -DINJECT_HOST_CALLBACK : a cudaLaunchHostFunc host callback
//   -DINJECT_KERNEL        : a trivial <<<1,1>>> kernel
void test(){
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaStream_t s;
    cudaStreamCreate(&s);
    k<<<1,1, 0, s>>>(my_delay); // warm-up launch to exclude one-time init cost
    cudaDeviceSynchronize();
#if defined USE_EVENT
    cudaEventRecord(start, s);
#elif defined USE_CB
    my_parms.start = dtime_usec(0);
#else
    unsigned long long scpu = dtime_usec(0);
#endif
    k<<<1,1, 0, s>>>(my_delay);
    // Fix: the original left `counter` uninitialized, then read it via the
    // casts below — undefined behavior. Zero-initialize it.
    size_t counter = 0;
#if defined INJECT_HOST_CALLBACK
    // NOTE(review): counter is passed BY VALUE disguised as a pointer, so the
    // callback's increment is not observable here; this only measures the
    // injection overhead, which is the point of the experiment.
    cudaLaunchHostFunc(s, testHostCallbackPerformance, (void*)counter);
#elif defined INJECT_KERNEL
    testKernelFuncPerformance<<<1,1, 0, s>>>(counter);
#endif
#if defined USE_EVENT
    cudaEventRecord(stop, s);
    cudaEventSynchronize(stop);
    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    std::cout << "EV Duration: " << (ms*1000) << "us" << std::endl;
#elif defined USE_CB
    cudaLaunchHostFunc(s, dt_cpu, &my_parms);
#else
    cudaDeviceSynchronize();
    std::cout << "CPU Duration: " << dtime_usec(scpu) << "us" << std::endl;
#endif
    cudaDeviceSynchronize();
    // Fix: the original leaked both events on every call; test() runs in a
    // loop from main(), so the leaked handles accumulated.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaStreamDestroy(s);
}
// Runs the measurement nine times so steady-state numbers can be observed
// after the first (typically noisier) iteration.
int main() {
    const int kIterations = 9;
    for (int iter = 0; iter < kIterations; ++iter)
        test();
    return 0;
}
Build and run the test program:
nvcc ./cudaLaunchHostFunc_test.cu -DUSE_EVENT
[root@ /home/admin/nccl]
#./a.out
EV Duration: 511.456us
EV Duration: 510.176us
EV Duration: 510.016us
EV Duration: 509.92us
EV Duration: 509.792us
EV Duration: 509.888us
EV Duration: 509.888us
EV Duration: 510.848us
EV Duration: 510.592us
[root@ /home/admin/nccl]
#./a.out
EV Duration: 510.848us
EV Duration: 509.728us
EV Duration: 509.664us
EV Duration: 509.696us
EV Duration: 509.728us
EV Duration: 509.632us
EV Duration: 509.664us
EV Duration: 510.048us
EV Duration: 509.888us
[root@gpulingjun033184120168.sa127 /home/admin/nccl]
#./a.out
EV Duration: 511.296us
EV Duration: 510.016us
EV Duration: 509.6us
EV Duration: 509.408us
EV Duration: 509.536us
EV Duration: 509.696us
EV Duration: 509.408us
EV Duration: 510.048us
EV Duration: 509.44us
Inserting a host callback function after the current kernel results in a delay increase of about 50 microseconds.
nvcc ./cudaLaunchHostFunc_test.cu -DUSE_EVENT -DINJECT_HOST_CALLBACK
[root@ /home/admin/nccl]
#./a.out
EV Duration: 583.328us
EV Duration: 561.792us
EV Duration: 561.664us
EV Duration: 561.632us
EV Duration: 562.048us
EV Duration: 561.76us
EV Duration: 561.984us
EV Duration: 561.952us
EV Duration: 561.248us
[root@ /home/admin/nccl]
#./a.out
EV Duration: 604.288us
EV Duration: 571.872us
EV Duration: 561.184us
EV Duration: 551.808us
EV Duration: 551.584us
EV Duration: 551.52us
EV Duration: 551.328us
EV Duration: 551.904us
EV Duration: 543.36us
[root@ /home/admin/nccl]
#./a.out
EV Duration: 562.912us
EV Duration: 538.944us
EV Duration: 556.928us
EV Duration: 556.352us
EV Duration: 556.672us
EV Duration: 556.8us
EV Duration: 557.12us
EV Duration: 557.152us
EV Duration: 557.184us
If a very lightweight kernel function is inserted after the current kernel, the latency only increases by about 1 microsecond.
nvcc ./cudaLaunchHostFunc_test.cu -DUSE_EVENT -DINJECT_KERNEL
[root@ /home/admin/nccl]
#./a.out
EV Duration: 518.272us
EV Duration: 511.232us
EV Duration: 511.232us
EV Duration: 511.136us
EV Duration: 511.424us
EV Duration: 511.776us
EV Duration: 511.808us
EV Duration: 511.712us
EV Duration: 511.552us
[root@ /home/admin/nccl]
#./a.out
EV Duration: 517.696us
EV Duration: 511.232us
EV Duration: 511.168us
EV Duration: 511.072us
EV Duration: 511.072us
EV Duration: 511.36us
EV Duration: 511.296us
EV Duration: 511.328us
EV Duration: 511.584us
[root@ /home/admin/nccl]
#./a.out
EV Duration: 518.208us
EV Duration: 511.392us
EV Duration: 510.944us
EV Duration: 511.136us
EV Duration: 511.456us
EV Duration: 511.648us
EV Duration: 511.488us
EV Duration: 511.68us
EV Duration: 511.456us
my test environment:
Thu Sep 12 11:12:00 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA H20 On | 00000000:08:00.0 Off | 0 |
| N/A 32C P0 73W / 500W | 0MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA H20 On | 00000000:7E:00.0 Off | 0 |
| N/A 30C P0 72W / 500W | 0MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 2 NVIDIA H20 On | 00000000:A2:00.0 Off | 0 |
| N/A 34C P0 72W / 500W | 0MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 3 NVIDIA H20 On | 00000000:C6:00.0 Off | 0 |
| N/A 32C P0 74W / 500W | 0MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 4 NVIDIA H20 On | 00000001:09:00.0 Off | 0 |
| N/A 30C P0 73W / 500W | 0MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 5 NVIDIA H20 On | 00000001:7F:00.0 Off | 0 |
| N/A 32C P0 73W / 500W | 0MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 6 NVIDIA H20 On | 00000001:A3:00.0 Off | 0 |
| N/A 34C P0 71W / 500W | 0MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 7 NVIDIA H20 On | 00000001:C7:00.0 Off | 0 |
| N/A 34C P0 72W / 500W | 0MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
In the training of large models, it is common to encounter a slowdown in training iteration speed. To investigate whether the issue is related to slower collective communication, I added a host callback function after the NCCL collective communication kernel function and printed the execution time of the NCCL kernel in the callback function. In our company’s scenario, we need to regularly log the execution time of this kernel. After adding this feature, I noticed a significant decrease in performance when using nccl-test to observe bus bandwidth performance.
Is there a better performance solution available?
Performance comparison after adding host callback function in NCCL: