Why is the performance of cudaLaunchHostFunc low, and what are the optimization strategies or alternative solutions?

My test code (adapted from: cudaLaunchHostFunc blocking work on Linux - #3 by brian.budge):

#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

const unsigned long long my_delay = 1000000ULL;

unsigned long long dtime_usec(unsigned long long start=0){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

__global__ void k(unsigned long long delay){

  unsigned long long start = clock64();
  unsigned long long end = start+delay;
  while (clock64() < end);
}

struct d{
  unsigned long long start;
} my_parms;

// Deliberately trivial host callback: only its dispatch overhead matters for this test.
void testHostCallbackPerformance(void *args) {
    size_t counter = (size_t) args;
    counter++;
    return;
}

// Deliberately trivial kernel, used as the comparison point for the host callback.
__global__ void testKernelFuncPerformance(size_t counter){
    counter++;
}

void dt_cpu(void *parms){
  d *p = (d *) parms;
  std::cout << "CB Duration: " << dtime_usec(p->start) << "us" << std::endl;
}

void test(){
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaStream_t s;
  cudaStreamCreate(&s);
  k<<<1,1, 0, s>>>(my_delay); // warm-up
  cudaDeviceSynchronize();
#if defined USE_EVENT
  cudaEventRecord(start, s);
#elif defined USE_CB
  my_parms.start = dtime_usec(0);
#else
  unsigned long long scpu = dtime_usec(0);
#endif
  k<<<1,1, 0, s>>>(my_delay);
  size_t counter = 0; // initialize to avoid passing an indeterminate value below
#if defined INJECT_HOST_CALLBACK
    cudaLaunchHostFunc(s, testHostCallbackPerformance, (void*)counter);
#elif defined INJECT_KERNEL
    testKernelFuncPerformance<<<1,1, 0, s>>>(counter);
#endif
  
#if defined USE_EVENT
  cudaEventRecord(stop, s);
  cudaEventSynchronize(stop);
  float ms;
  cudaEventElapsedTime(&ms, start, stop);
  std::cout << "EV Duration: " << (ms*1000) << "us" << std::endl;
#elif defined USE_CB
  cudaLaunchHostFunc(s, dt_cpu, &my_parms);
#else
  cudaDeviceSynchronize();
  std::cout << "CPU Duration: " << dtime_usec(scpu) << "us" << std::endl;
#endif
  cudaDeviceSynchronize();
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaStreamDestroy(s);
}

int main() {
    for (int i=1;i<10;i++) {
        test();
    }
    return 0;
}

Build and run the test program:

nvcc ./cudaLaunchHostFunc_test.cu -DUSE_EVENT

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 511.456us
EV Duration: 510.176us
EV Duration: 510.016us
EV Duration: 509.92us
EV Duration: 509.792us
EV Duration: 509.888us
EV Duration: 509.888us
EV Duration: 510.848us
EV Duration: 510.592us

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 510.848us
EV Duration: 509.728us
EV Duration: 509.664us
EV Duration: 509.696us
EV Duration: 509.728us
EV Duration: 509.632us
EV Duration: 509.664us
EV Duration: 510.048us
EV Duration: 509.888us

[root@gpulingjun033184120168.sa127 /home/admin/nccl]
#./a.out 
EV Duration: 511.296us
EV Duration: 510.016us
EV Duration: 509.6us
EV Duration: 509.408us
EV Duration: 509.536us
EV Duration: 509.696us
EV Duration: 509.408us
EV Duration: 510.048us
EV Duration: 509.44us

Inserting a host callback function after the kernel increases the measured latency by roughly 50 microseconds:

nvcc ./cudaLaunchHostFunc_test.cu -DUSE_EVENT -DINJECT_HOST_CALLBACK

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 583.328us
EV Duration: 561.792us
EV Duration: 561.664us
EV Duration: 561.632us
EV Duration: 562.048us
EV Duration: 561.76us
EV Duration: 561.984us
EV Duration: 561.952us
EV Duration: 561.248us

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 604.288us
EV Duration: 571.872us
EV Duration: 561.184us
EV Duration: 551.808us
EV Duration: 551.584us
EV Duration: 551.52us
EV Duration: 551.328us
EV Duration: 551.904us
EV Duration: 543.36us

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 562.912us
EV Duration: 538.944us
EV Duration: 556.928us
EV Duration: 556.352us
EV Duration: 556.672us
EV Duration: 556.8us
EV Duration: 557.12us
EV Duration: 557.152us
EV Duration: 557.184us

If a very lightweight kernel is inserted after the current kernel instead, the latency increases by only about 1 microsecond:

nvcc ./cudaLaunchHostFunc_test.cu -DUSE_EVENT -DINJECT_KERNEL

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 518.272us
EV Duration: 511.232us
EV Duration: 511.232us
EV Duration: 511.136us
EV Duration: 511.424us
EV Duration: 511.776us
EV Duration: 511.808us
EV Duration: 511.712us
EV Duration: 511.552us

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 517.696us
EV Duration: 511.232us
EV Duration: 511.168us
EV Duration: 511.072us
EV Duration: 511.072us
EV Duration: 511.36us
EV Duration: 511.296us
EV Duration: 511.328us
EV Duration: 511.584us

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 518.208us
EV Duration: 511.392us
EV Duration: 510.944us
EV Duration: 511.136us
EV Duration: 511.456us
EV Duration: 511.648us
EV Duration: 511.488us
EV Duration: 511.68us
EV Duration: 511.456us

My test environment:

Thu Sep 12 11:12:00 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA H20                     On  | 00000000:08:00.0 Off |                    0 |
| N/A   32C    P0              73W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA H20                     On  | 00000000:7E:00.0 Off |                    0 |
| N/A   30C    P0              72W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   2  NVIDIA H20                     On  | 00000000:A2:00.0 Off |                    0 |
| N/A   34C    P0              72W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   3  NVIDIA H20                     On  | 00000000:C6:00.0 Off |                    0 |
| N/A   32C    P0              74W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   4  NVIDIA H20                     On  | 00000001:09:00.0 Off |                    0 |
| N/A   30C    P0              73W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   5  NVIDIA H20                     On  | 00000001:7F:00.0 Off |                    0 |
| N/A   32C    P0              73W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   6  NVIDIA H20                     On  | 00000001:A3:00.0 Off |                    0 |
| N/A   34C    P0              71W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   7  NVIDIA H20                     On  | 00000001:C7:00.0 Off |                    0 |
| N/A   34C    P0              72W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+

During large-model training it is common to see the iteration speed slow down. To investigate whether the slowdown is related to collective communication, I added a host callback after the NCCL collective-communication kernel and printed the kernel's execution time inside the callback. In our company's scenario we need to log this kernel's execution time regularly. After adding this feature, I observed a significant drop in bus bandwidth when measuring with nccl-tests.
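For reference, here is a minimal sketch of the pattern I am describing; the struct, the helper function, and the specific ncclAllReduce call are placeholders rather than our production code (dtime_usec is the helper defined in the test program above):

#include <cstdio>
#include <nccl.h>

struct CommTimestamps {
  unsigned long long start_us;   // host timestamp taken just before enqueueing the collective
};

void logCommDuration(void *userData) {
  // Runs on a CUDA-internal host thread once all prior work in the stream has finished.
  CommTimestamps *ts = (CommTimestamps *)userData;
  printf("collective took %llu us\n", dtime_usec(ts->start_us));
}

void allReduceWithLogging(const void *sendbuf, void *recvbuf, size_t count,
                          ncclComm_t comm, cudaStream_t stream, CommTimestamps *ts) {
  ts->start_us = dtime_usec(0);
  ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum, comm, stream);
  // The callback fires after the collective's kernel(s) complete in this stream.
  cudaLaunchHostFunc(stream, logCommDuration, ts);
}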

Is there a better-performing solution available?

Performance comparison after adding the host callback function in NCCL:

In your posted code, measuring only the kernel will always give a shorter time than measuring kernel execution + the delay until the host callback starts + the callback execution itself. That should be no surprise.

Why do you need a host callback to print the kernel timings in the real application?

In large-model training it is common to encounter hangs and slowdowns, which generally fall into two categories: computation slowdowns and collective-communication slowdowns. To improve the observability of NCCL, we plan to log timestamps at the beginning and end of each collective and print the time it consumed. cudaLaunchHostFunc currently seems like a relatively convenient way to do this, but it does not appear to be very efficient; are there any better alternatives?

Let me rephrase my question.
Is it not possible to record timestamps and/or CUDA events from the same thread that performs the NCCL calls?

I don’t know if it’s a “better” alternative, because that depends on your usage needs. There are other possibilities, but they do not emulate exactly what the callback methodology provides.

Recent versions of NCCL have NVTX markers built in. In addition, recent versions of Nsight Systems can automatically trace NCCL activity as a feature.
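For example, something along these lines captures CUDA and NVTX activity (the exact set of available trace options depends on your Nsight Systems version, and ./my_nccl_app is a placeholder for your application):

nsys profile --trace=cuda,nvtx -o nccl_trace ./my_nccl_app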

Yes. Because the thread submitting the NCCL kernel is the user’s own thread, we cannot block it.

I don’t know much about Nsight, but my understanding is that it is not suited to being a regular monitoring tool and that its resource consumption is quite high. In my situation, I need continuous monitoring during large-model training without consuming too many resources, rather than root-causing issues only after a slowdown or other anomaly has already been noticed.

Okay, so I assume you are using asynchronous NCCL collectives and that is why you don’t want to block the thread.

You could use a pool of cudaEvent_t pairs. Get a pair from the pool, record one event before and one after the NCCL call, then pass the pair to a separate thread (not a CUDA callback thread). In that thread, call cudaEventSynchronize and cudaEventElapsedTime and print the time, then return the event pair to the pool.
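A minimal sketch of that idea might look like this (the class and method names are made up for illustration; error checking and robustness are omitted):

#include <cstdio>
#include <condition_variable>
#include <mutex>
#include <queue>
#include <thread>
#include <cuda_runtime.h>

struct EventPair { cudaEvent_t start, stop; };

class EventTimer {
public:
  explicit EventTimer(int poolSize) : worker_([this]{ drain(); }) {
    std::lock_guard<std::mutex> lk(m_);
    for (int i = 0; i < poolSize; ++i) {
      EventPair p;
      cudaEventCreate(&p.start);
      cudaEventCreate(&p.stop);
      pool_.push(p);
    }
  }

  ~EventTimer() {
    { std::lock_guard<std::mutex> lk(m_); done_ = true; }
    cv_.notify_all();
    worker_.join();
    while (!pool_.empty()) {
      cudaEventDestroy(pool_.front().start);
      cudaEventDestroy(pool_.front().stop);
      pool_.pop();
    }
  }

  // Called from the thread that issues NCCL work, just before the collective.
  EventPair begin(cudaStream_t s) {
    std::unique_lock<std::mutex> lk(m_);
    cv_.wait(lk, [this]{ return !pool_.empty(); });
    EventPair p = pool_.front();
    pool_.pop();
    lk.unlock();
    cudaEventRecord(p.start, s);
    return p;
  }

  // Called right after the collective; hands the pair off to the monitoring thread.
  void end(EventPair p, cudaStream_t s) {
    cudaEventRecord(p.stop, s);
    { std::lock_guard<std::mutex> lk(m_); inflight_.push(p); }
    cv_.notify_all();
  }

private:
  void drain() {
    for (;;) {
      EventPair p;
      {
        std::unique_lock<std::mutex> lk(m_);
        cv_.wait(lk, [this]{ return done_ || !inflight_.empty(); });
        if (inflight_.empty()) return;          // shutting down, nothing left to report
        p = inflight_.front();
        inflight_.pop();
      }
      cudaEventSynchronize(p.stop);             // blocks only this monitoring thread
      float ms = 0.f;
      cudaEventElapsedTime(&ms, p.start, p.stop);
      printf("collective took %.3f ms\n", ms);
      { std::lock_guard<std::mutex> lk(m_); pool_.push(p); }
      cv_.notify_all();
    }
  }

  std::queue<EventPair> pool_, inflight_;
  std::mutex m_;
  std::condition_variable cv_;
  bool done_ = false;
  std::thread worker_;
};

Usage on the NCCL-issuing thread would then be: EventPair p = timer.begin(stream); enqueue the collective on the same stream; timer.end(p, stream). The blocking cudaEventSynchronize and the printing happen only on the monitoring thread, so the thread issuing NCCL calls never blocks.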

Thank you for the suggestion. I will test this approach and reply here once I have results.