Why is the performance of cudaLaunchHostFunc low, and what are the optimization strategies or alternative solutions?

My test code (adapted from: cudaLaunchHostFunc blocking work on Linux - #3 by brian.budge):

#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

const unsigned long long my_delay = 1000000ULL;

unsigned long long dtime_usec(unsigned long long start=0){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

__global__ void k(unsigned long long delay){

  unsigned long long start = clock64();
  unsigned long long end = start+delay;
  while (clock64() < end);
}

struct d{
  unsigned long long start;
} my_parms;

// Deliberately trivial host callback: only its dispatch overhead matters for this test.
void testHostCallbackPerformance(void *args) {
    size_t counter = (size_t) args;
    counter++;
    return;
}

// Deliberately trivial kernel, used as the comparison point for the host callback.
__global__ void testKernelFuncPerformance(size_t counter){
    counter++;
}

void dt_cpu(void *parms){
  d *p = (d *) parms;
  std::cout << "CB Duration: " << dtime_usec(p->start) << "us" << std::endl;
}

void test(){
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaStream_t s;
  cudaStreamCreate(&s);
  k<<<1,1, 0, s>>>(my_delay); // warm-up
  cudaDeviceSynchronize();
#if defined USE_EVENT
  cudaEventRecord(start, s);
#elif defined USE_CB
  my_parms.start = dtime_usec(0);
#else
  unsigned long long scpu = dtime_usec(0);
#endif
  k<<<1,1, 0, s>>>(my_delay);
  size_t counter = 0; // initialize to avoid passing an indeterminate value below
#if defined INJECT_HOST_CALLBACK
    cudaLaunchHostFunc(s, testHostCallbackPerformance, (void*)counter);
#elif defined INJECT_KERNEL
    testKernelFuncPerformance<<<1,1, 0, s>>>(counter);
#endif
  
#if defined USE_EVENT
  cudaEventRecord(stop, s);
  cudaEventSynchronize(stop);
  float ms;
  cudaEventElapsedTime(&ms, start, stop);
  std::cout << "EV Duration: " << (ms*1000) << "us" << std::endl;
#elif defined USE_CB
  cudaLaunchHostFunc(s, dt_cpu, &my_parms);
#else
  cudaDeviceSynchronize();
  std::cout << "CPU Duration: " << dtime_usec(scpu) << "us" << std::endl;
#endif
  cudaDeviceSynchronize();
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaStreamDestroy(s);
}

int main() {
    for (int i=1;i<10;i++) {
        test();
    }
    return 0;
}

Build and run the test program:

nvcc ./cudaLaunchHostFunc_test.cu -DUSE_EVENT

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 511.456us
EV Duration: 510.176us
EV Duration: 510.016us
EV Duration: 509.92us
EV Duration: 509.792us
EV Duration: 509.888us
EV Duration: 509.888us
EV Duration: 510.848us
EV Duration: 510.592us

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 510.848us
EV Duration: 509.728us
EV Duration: 509.664us
EV Duration: 509.696us
EV Duration: 509.728us
EV Duration: 509.632us
EV Duration: 509.664us
EV Duration: 510.048us
EV Duration: 509.888us

[root@gpulingjun033184120168.sa127 /home/admin/nccl]
#./a.out 
EV Duration: 511.296us
EV Duration: 510.016us
EV Duration: 509.6us
EV Duration: 509.408us
EV Duration: 509.536us
EV Duration: 509.696us
EV Duration: 509.408us
EV Duration: 510.048us
EV Duration: 509.44us

Inserting a host callback function after the kernel increases the measured latency by roughly 50 microseconds:

nvcc ./cudaLaunchHostFunc_test.cu -DUSE_EVENT -DINJECT_HOST_CALLBACK

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 583.328us
EV Duration: 561.792us
EV Duration: 561.664us
EV Duration: 561.632us
EV Duration: 562.048us
EV Duration: 561.76us
EV Duration: 561.984us
EV Duration: 561.952us
EV Duration: 561.248us

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 604.288us
EV Duration: 571.872us
EV Duration: 561.184us
EV Duration: 551.808us
EV Duration: 551.584us
EV Duration: 551.52us
EV Duration: 551.328us
EV Duration: 551.904us
EV Duration: 543.36us

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 562.912us
EV Duration: 538.944us
EV Duration: 556.928us
EV Duration: 556.352us
EV Duration: 556.672us
EV Duration: 556.8us
EV Duration: 557.12us
EV Duration: 557.152us
EV Duration: 557.184us

If a very lightweight kernel is inserted after the current kernel instead, the latency increases by only about 1 microsecond:

nvcc ./cudaLaunchHostFunc_test.cu -DUSE_EVENT -DINJECT_KERNEL

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 518.272us
EV Duration: 511.232us
EV Duration: 511.232us
EV Duration: 511.136us
EV Duration: 511.424us
EV Duration: 511.776us
EV Duration: 511.808us
EV Duration: 511.712us
EV Duration: 511.552us

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 517.696us
EV Duration: 511.232us
EV Duration: 511.168us
EV Duration: 511.072us
EV Duration: 511.072us
EV Duration: 511.36us
EV Duration: 511.296us
EV Duration: 511.328us
EV Duration: 511.584us

[root@ /home/admin/nccl]
#./a.out 
EV Duration: 518.208us
EV Duration: 511.392us
EV Duration: 510.944us
EV Duration: 511.136us
EV Duration: 511.456us
EV Duration: 511.648us
EV Duration: 511.488us
EV Duration: 511.68us
EV Duration: 511.456us

My test environment:

Thu Sep 12 11:12:00 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA H20                     On  | 00000000:08:00.0 Off |                    0 |
| N/A   32C    P0              73W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA H20                     On  | 00000000:7E:00.0 Off |                    0 |
| N/A   30C    P0              72W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   2  NVIDIA H20                     On  | 00000000:A2:00.0 Off |                    0 |
| N/A   34C    P0              72W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   3  NVIDIA H20                     On  | 00000000:C6:00.0 Off |                    0 |
| N/A   32C    P0              74W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   4  NVIDIA H20                     On  | 00000001:09:00.0 Off |                    0 |
| N/A   30C    P0              73W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   5  NVIDIA H20                     On  | 00000001:7F:00.0 Off |                    0 |
| N/A   32C    P0              73W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   6  NVIDIA H20                     On  | 00000001:A3:00.0 Off |                    0 |
| N/A   34C    P0              71W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   7  NVIDIA H20                     On  | 00000001:C7:00.0 Off |                    0 |
| N/A   34C    P0              72W / 500W |      0MiB / 97871MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+

During large-model training it is common to see the iteration speed slow down. To investigate whether the slowdown is related to collective communication, I added a host callback after the NCCL collective-communication kernel and printed the kernel's execution time inside the callback. In our company's scenario we need to log this kernel's execution time regularly. After adding this feature, I observed a significant drop in bus bandwidth when measuring with nccl-tests.
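For reference, here is a minimal sketch of the pattern I am describing; the struct, the helper function, and the specific ncclAllReduce call are placeholders rather than our production code (dtime_usec is the helper defined in the test program above):

#include <cstdio>
#include <nccl.h>

struct CommTimestamps {
  unsigned long long start_us;   // host timestamp taken just before enqueueing the collective
};

void logCommDuration(void *userData) {
  // Runs on a CUDA-internal host thread once all prior work in the stream has finished.
  CommTimestamps *ts = (CommTimestamps *)userData;
  printf("collective took %llu us\n", dtime_usec(ts->start_us));
}

void allReduceWithLogging(const void *sendbuf, void *recvbuf, size_t count,
                          ncclComm_t comm, cudaStream_t stream, CommTimestamps *ts) {
  ts->start_us = dtime_usec(0);
  ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum, comm, stream);
  // The callback fires after the collective's kernel(s) complete in this stream.
  cudaLaunchHostFunc(stream, logCommDuration, ts);
}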

Is there a better-performing solution available?

Performance comparison after adding the host callback function in NCCL:

In your posted code, measuring only the kernel will always give a shorter time than measuring kernel execution + the delay until the host callback starts + the callback execution itself. That should be no surprise.

Why do you need a host callback to print the kernel timings in the real application?

In large-model training it is common to encounter hangs and slowdowns, which generally fall into two categories: computation slowdowns and collective-communication slowdowns. To improve the observability of NCCL, we plan to log timestamps at the beginning and end of each collective and print the time it consumed. cudaLaunchHostFunc currently seems like a relatively convenient way to do this, but it does not appear to be very efficient; are there any better alternatives?

Let me rephrase my question.
Is it not possible to record timestamps and/or CUDA events from the same thread that performs the NCCL calls?

I don’t know if it’s a “better” alternative, because that depends on your usage needs. There are other possibilities, but they do not emulate exactly what the callback methodology provides.

Recent versions of NCCL have NVTX markers built in. In addition, recent versions of Nsight Systems can automatically trace NCCL activity as a feature.
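For example, something along these lines captures CUDA and NVTX activity (the exact set of available trace options depends on your Nsight Systems version, and ./my_nccl_app is a placeholder for your application):

nsys profile --trace=cuda,nvtx -o nccl_trace ./my_nccl_app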

Yes. Because the thread submitting the NCCL kernel is the user’s own thread, we cannot block it.

I don’t know much about Nsight, but my understanding is that it is not suited to being a regular monitoring tool and that its resource consumption is quite high. In my situation, I need continuous monitoring during large-model training without consuming too many resources, rather than root-causing issues only after a slowdown or other anomaly has already been noticed.

Okay, so I assume you are using asynchronous NCCL collectives and that is why you don’t want to block the thread.

You could use a pool of cudaEvent_t pairs. Get a pair from the pool, record one event before and one after the NCCL call, then pass the pair to a separate thread (not a CUDA callback thread). In that thread, call cudaEventSynchronize and cudaEventElapsedTime and print the time, then return the event pair to the pool.
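A minimal sketch of that idea might look like this (the class and method names are made up for illustration; error checking and robustness are omitted):

#include <cstdio>
#include <condition_variable>
#include <mutex>
#include <queue>
#include <thread>
#include <cuda_runtime.h>

struct EventPair { cudaEvent_t start, stop; };

class EventTimer {
public:
  explicit EventTimer(int poolSize) : worker_([this]{ drain(); }) {
    std::lock_guard<std::mutex> lk(m_);
    for (int i = 0; i < poolSize; ++i) {
      EventPair p;
      cudaEventCreate(&p.start);
      cudaEventCreate(&p.stop);
      pool_.push(p);
    }
  }

  ~EventTimer() {
    { std::lock_guard<std::mutex> lk(m_); done_ = true; }
    cv_.notify_all();
    worker_.join();
    while (!pool_.empty()) {
      cudaEventDestroy(pool_.front().start);
      cudaEventDestroy(pool_.front().stop);
      pool_.pop();
    }
  }

  // Called from the thread that issues NCCL work, just before the collective.
  EventPair begin(cudaStream_t s) {
    std::unique_lock<std::mutex> lk(m_);
    cv_.wait(lk, [this]{ return !pool_.empty(); });
    EventPair p = pool_.front();
    pool_.pop();
    lk.unlock();
    cudaEventRecord(p.start, s);
    return p;
  }

  // Called right after the collective; hands the pair off to the monitoring thread.
  void end(EventPair p, cudaStream_t s) {
    cudaEventRecord(p.stop, s);
    { std::lock_guard<std::mutex> lk(m_); inflight_.push(p); }
    cv_.notify_all();
  }

private:
  void drain() {
    for (;;) {
      EventPair p;
      {
        std::unique_lock<std::mutex> lk(m_);
        cv_.wait(lk, [this]{ return done_ || !inflight_.empty(); });
        if (inflight_.empty()) return;          // shutting down, nothing left to report
        p = inflight_.front();
        inflight_.pop();
      }
      cudaEventSynchronize(p.stop);             // blocks only this monitoring thread
      float ms = 0.f;
      cudaEventElapsedTime(&ms, p.start, p.stop);
      printf("collective took %.3f ms\n", ms);
      { std::lock_guard<std::mutex> lk(m_); pool_.push(p); }
      cv_.notify_all();
    }
  }

  std::queue<EventPair> pool_, inflight_;
  std::mutex m_;
  std::condition_variable cv_;
  bool done_ = false;
  std::thread worker_;
};

Usage on the NCCL-issuing thread would then be: EventPair p = timer.begin(stream); enqueue the collective on the same stream; timer.end(p, stream). The blocking cudaEventSynchronize and the printing happen only on the monitoring thread, so the thread issuing NCCL calls never blocks.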

Thank you for the suggestion. I will test this approach and reply here once I have results.