cudaMemcpy latency between sequential calls

Can someone help explain why, when I profile my implementation, there is latency between my memory copy calls? The largest latency is when I call cudaMemcpyToSymbol after a cudaMemcpy call…
What you see below is executed by these lines:

 // Allocate GPU memory
    cudaMalloc(&d_prior_odds, num_errors * sizeof(T));
    cudaMalloc(&d_check_to_error, num_edges * sizeof(T));
    cudaMalloc(&d_error_to_check, num_edges * sizeof(T));
    cudaMalloc(&d_i_from_j, num_edges * sizeof(int));
    cudaMalloc(&d_first_j_from_e, num_errors * sizeof(int));
    cudaMalloc(&d_error_degrees, num_errors * sizeof(int)); // error degrees
    cudaMalloc(&d_posterior_odds, num_errors * sizeof(T));

    cudaMemcpy(d_check_to_error, check_to_error, num_edges * sizeof(T), 
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_i_from_j, i_from_j, num_edges * sizeof(int),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_first_j_from_e, first_j_from_e, num_errors * sizeof(int),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_error_degrees, error_degrees, num_errors * sizeof(int),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_prior_odds, prior_odds, num_errors * sizeof(T),
               cudaMemcpyHostToDevice);

    cudaMemcpyToSymbol(d_first_i_from_c_const, first_i_from_c,
                       num_checks * sizeof(int));
    cudaMemcpyToSymbol(d_check_degrees_const, check_degrees,
                       num_checks * sizeof(int));

I don’t have an exact answer, but I think we can state or demonstrate a few things.

  1. There is, or can be, significant CPU/host activity associated with a runtime API call.
  2. cudaMemcpy is a synchronizing activity on the device side and a blocking activity on the host side: the host CPU thread cannot proceed past the cudaMemcpy call until the API function (i.e. the transfer) has completed.
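
To make point 2 concrete, here is a minimal sketch (my addition, separate from the test case below) that times the host-side duration of each call; it uses pinned memory so the comparison isolates the blocking behavior itself. The blocking cudaMemcpy should not return until the transfer completes, whereas cudaMemcpyAsync returns almost immediately:

#include <chrono>
#include <cstdio>

int main(){
    // Sketch only: buffer name and size are illustrative
    const size_t N = 1048576;   // 4MB of float, similar to the sizes below
    float *h_buf, *d_buf;
    cudaHostAlloc(&h_buf, N*sizeof(float), cudaHostAllocDefault); // pinned
    cudaMalloc(&d_buf, N*sizeof(float));

    auto t0 = std::chrono::high_resolution_clock::now();
    cudaMemcpy(d_buf, h_buf, N*sizeof(float), cudaMemcpyHostToDevice);      // blocks until the copy is done
    auto t1 = std::chrono::high_resolution_clock::now();
    cudaMemcpyAsync(d_buf, h_buf, N*sizeof(float), cudaMemcpyHostToDevice); // returns right away
    auto t2 = std::chrono::high_resolution_clock::now();
    cudaDeviceSynchronize();    // wait for the async copy before exiting

    printf("blocking call: %lld us, async call: %lld us\n",
           (long long)std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(),
           (long long)std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count());
}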

Taking those things into account, if we could get the work issued asynchronously, we might see several performance-related benefits. To get to asynchronous work delivery, we will need pinned memory and async-style APIs. Here is a test case:

# cat t376.cu
#include <iostream>

using T = float;
const int num_checks = 4096;
__constant__ int d_first_i_from_c_const[num_checks];
__constant__ int d_check_degrees_const[num_checks];

int main(){
    T *d_prior_odds, *d_check_to_error, *d_error_to_check, *d_posterior_odds;
    int *d_i_from_j, *d_first_j_from_e, *d_error_degrees;
    int num_errors = 1048576;
    int num_edges = num_errors;
#ifndef USE_PINNED  // default: pageable host allocations
    T *check_to_error = new T[num_edges];
    int *i_from_j = new int[num_edges];
    int *first_j_from_e = new int[num_errors];
    int *error_degrees = new int[num_errors];
    T *prior_odds = new T[num_errors];
    int *first_i_from_c = new int[num_checks];
    int *check_degrees = new int[num_checks];
#else  // USE_PINNED: pinned host allocations
    T *check_to_error, *prior_odds;
    int *i_from_j, *first_j_from_e, *error_degrees, *first_i_from_c, *check_degrees;
    cudaHostAlloc(&check_to_error, num_edges*sizeof(T), cudaHostAllocDefault);
    cudaHostAlloc(&i_from_j, num_edges*sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc(&first_j_from_e, num_errors*sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc(&error_degrees, num_errors*sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc(&prior_odds, num_errors*sizeof(T), cudaHostAllocDefault);
    cudaHostAlloc(&first_i_from_c, num_checks*sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc(&check_degrees, num_checks*sizeof(int), cudaHostAllocDefault);
#endif
 // Allocate GPU memory
    cudaMalloc(&d_prior_odds, num_errors * sizeof(T));
    cudaMalloc(&d_check_to_error, num_edges * sizeof(T));
    cudaMalloc(&d_error_to_check, num_edges * sizeof(T));
    cudaMalloc(&d_i_from_j, num_edges * sizeof(int));
    cudaMalloc(&d_first_j_from_e, num_errors * sizeof(int));
    cudaMalloc(&d_error_degrees, num_errors * sizeof(int)); // error degrees
    cudaMalloc(&d_posterior_odds, num_errors * sizeof(T));
#ifndef USE_ASYNC  // default: blocking copies
    cudaMemcpy(d_check_to_error, check_to_error, num_edges * sizeof(T),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_i_from_j, i_from_j, num_edges * sizeof(int),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_first_j_from_e, first_j_from_e, num_errors * sizeof(int),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_error_degrees, error_degrees, num_errors * sizeof(int),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_prior_odds, prior_odds, num_errors * sizeof(T),
               cudaMemcpyHostToDevice);

    cudaMemcpyToSymbol(d_first_i_from_c_const, first_i_from_c,
                       num_checks * sizeof(int));
    cudaMemcpyToSymbol(d_check_degrees_const, check_degrees,
                       num_checks * sizeof(int));
#else  // USE_ASYNC: asynchronous copies
    cudaMemcpyAsync(d_check_to_error, check_to_error, num_edges * sizeof(T),
               cudaMemcpyHostToDevice);
    cudaMemcpyAsync(d_i_from_j, i_from_j, num_edges * sizeof(int),
               cudaMemcpyHostToDevice);
    cudaMemcpyAsync(d_first_j_from_e, first_j_from_e, num_errors * sizeof(int),
               cudaMemcpyHostToDevice);
    cudaMemcpyAsync(d_error_degrees, error_degrees, num_errors * sizeof(int),
               cudaMemcpyHostToDevice);
    cudaMemcpyAsync(d_prior_odds, prior_odds, num_errors * sizeof(T),
               cudaMemcpyHostToDevice);

    cudaMemcpyToSymbolAsync(d_first_i_from_c_const, first_i_from_c,
                       num_checks * sizeof(int));
    cudaMemcpyToSymbolAsync(d_check_degrees_const, check_degrees,
                       num_checks * sizeof(int));
#endif
}
# nvcc -o t376 t376.cu
# nsys nvprof --print-gpu-trace ./t376
WARNING: t376 and any of its children processes will be profiled.

Generating '/tmp/nsys-report-1652.qdstrm'
[1/3] [========================100%] report60.nsys-rep
[2/3] [========================100%] report60.sqlite
[3/3] Executing 'cuda_gpu_trace' stats report

 Start (ns)   Duration (ns)  CorrId  GrdX  GrdY  GrdZ  BlkX  BlkY  BlkZ  Reg/Trd  StcSMem (MB)  DymSMem (MB)  Bytes (MB)  Throughput (MBps)  SrcMemKd  DstMemKd     Device      Ctx  Strm         Name
 -----------  -------------  ------  ----  ----  ----  ----  ----  ----  -------  ------------  ------------  ----------  -----------------  --------  --------  -------------  ---  ----  ------------------
 659,017,735      1,398,721     125                                                                                4.194          2,994.733  Pageable  Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 660,843,944      1,392,738     126                                                                                4.194          3,011.510  Pageable  Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 662,661,578      1,402,338     127                                                                                4.194          2,990.539  Pageable  Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 664,490,476      1,392,961     128                                                                                4.194          3,007.316  Pageable  Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 666,308,334      1,414,337     129                                                                                4.194          2,965.373  Pageable  Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 668,020,815          1,920     130                                                                                0.016          8,533.328  Pageable  Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 668,040,495          2,048     131                                                                                0.016          7,999.996  Pageable  Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]

Generated:
    /root/bobc/report60.nsys-rep
    /root/bobc/report60.sqlite
# nvcc -o t376 t376.cu -DUSE_PINNED
# nsys nvprof --print-gpu-trace ./t376
WARNING: t376 and any of its children processes will be profiled.

Generating '/tmp/nsys-report-3047.qdstrm'
[1/3] [========================100%] report61.nsys-rep
[2/3] [========================100%] report61.sqlite
[3/3] Executing 'cuda_gpu_trace' stats report

 Start (ns)   Duration (ns)  CorrId  GrdX  GrdY  GrdZ  BlkX  BlkY  BlkZ  Reg/Trd  StcSMem (MB)  DymSMem (MB)  Bytes (MB)  Throughput (MBps)  SrcMemKd  DstMemKd     Device      Ctx  Strm         Name
 -----------  -------------  ------  ----  ----  ----  ----  ----  ----  -------  ------------  ------------  ----------  -----------------  --------  --------  -------------  ---  ----  ------------------
 634,959,942        489,504     132                                                                                4.194          8,564.769  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 635,468,070        486,368     133                                                                                4.194          8,623.489  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 635,964,230        479,809     134                                                                                4.194          8,740.930  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 636,453,127        477,696     135                                                                                4.194          8,778.678  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 636,939,527        477,857     136                                                                                4.194          8,774.484  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 637,873,928          1,920     137                                                                                0.016          8,533.328  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 637,895,080          1,920     138                                                                                0.016          8,533.328  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]

Generated:
    /root/bobc/report61.nsys-rep
    /root/bobc/report61.sqlite
# nvcc -o t376 t376.cu -DUSE_PINNED -DUSE_ASYNC
# nsys nvprof --print-gpu-trace ./t376
WARNING: t376 and any of its children processes will be profiled.

Generating '/tmp/nsys-report-d9e9.qdstrm'
[1/3] [========================100%] report62.nsys-rep
[2/3] [========================100%] report62.sqlite
[3/3] Executing 'cuda_gpu_trace' stats report

 Start (ns)   Duration (ns)  CorrId  GrdX  GrdY  GrdZ  BlkX  BlkY  BlkZ  Reg/Trd  StcSMem (MB)  DymSMem (MB)  Bytes (MB)  Throughput (MBps)  SrcMemKd  DstMemKd     Device      Ctx  Strm         Name
 -----------  -------------  ------  ----  ----  ----  ----  ----  ----  -------  ------------  ------------  ----------  -----------------  --------  --------  -------------  ---  ----  ------------------
 738,039,940        341,600     132                                                                                4.194         12,276.728  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 738,382,693        340,736     133                                                                                4.194         12,306.088  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 738,724,549        340,608     134                                                                                4.194         12,310.282  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 739,066,277        344,288     135                                                                                4.194         12,180.259  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 739,411,557        340,481     136                                                                                4.194         12,318.671  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 739,829,318          1,632     137                                                                                0.016         10,039.214  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]
 739,841,126          1,632     138                                                                                0.016         10,039.214  Pinned    Device    NVIDIA L4 (0)    1     7  [CUDA memcpy HtoD]

Generated:
    /root/bobc/report62.nsys-rep
    /root/bobc/report62.sqlite
#

(CUDA 12.2, L4 GPU)

The first test, with no compile switches, is roughly the case you provided. Sure enough, we see “large” gaps between the actual transfer activity associated with the API calls. At the moment we don’t have many clues about those gaps.

The second test introduces pinned memory. This shortens the duration of the transfers and, for a number of them, significantly reduces the gap from the end of one transfer to the start of the next. But the gap from the last cudaMemcpy to the first cudaMemcpyToSymbol is still quite long.
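
As an aside (my addition, not something the test case does): if the host buffers had already been allocated with new or malloc, cudaHostRegister could pin them in place, avoiding a restructuring of the allocation code. A sketch, reusing the names from the test case:

    cudaHostRegister(check_to_error, num_edges * sizeof(T), cudaHostRegisterDefault); // pin in place
    // ... transfers from check_to_error now behave as pinned ...
    cudaHostUnregister(check_to_error); // unpin before delete[]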

The third test introduces async calls. This means the CPU thread no longer has to wait on a particular API call for the transfer to complete; it can “race ahead” and issue the work, perhaps getting some of the overhead associated with launching these operations “out of the way”, or amortized, during the idle time previously spent waiting for a particular transfer to complete. In this case we see all gaps reduced compared to the previous cases.
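
One practical note (my addition, not shown in t376.cu): because the host thread races ahead, with the async variants it must synchronize before it modifies or frees any of the source buffers, or before it depends on the results. The pattern, reusing the names from the test case, is a single synchronization after all the work is issued:

    cudaMemcpyAsync(d_check_to_error, check_to_error, num_edges * sizeof(T),
                    cudaMemcpyHostToDevice); // enqueued on the default stream, returns at once
    cudaMemcpyAsync(d_prior_odds, prior_odds, num_errors * sizeof(T),
                    cudaMemcpyHostToDevice);
    // ... remaining async copies ...
    cudaDeviceSynchronize(); // pay the wait once, after everything is issued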

Here are the tabulated results (gap = nanoseconds from the end of the previous transfer to the start of the next):

Case 1 - original:

  Start (ns)   Duration (ns)      End (ns)    Gap (ns)
 659,017,735       1,398,721   660,416,456
 660,843,944       1,392,738   662,236,682     427,488
 662,661,578       1,402,338   664,063,916     424,896
 664,490,476       1,392,961   665,883,437     426,560
 666,308,334       1,414,337   667,722,671     424,897
 668,020,815           1,920   668,022,735     298,144
 668,040,495           2,048   668,042,543      17,760

Case 2 - with pinned memory:

  Start (ns)   Duration (ns)      End (ns)    Gap (ns)
 634,959,942         489,504   635,449,446
 635,468,070         486,368   635,954,438      18,624
 635,964,230         479,809   636,444,039       9,792
 636,453,127         477,696   636,930,823       9,088
 636,939,527         477,857   637,417,384       8,704
 637,873,928           1,920   637,875,848     456,544
 637,895,080           1,920   637,897,000      19,232

Case 3 - with pinned memory and async:

  Start (ns)   Duration (ns)      End (ns)    Gap (ns)
 738,039,940         341,600   738,381,540
 738,382,693         340,736   738,723,429       1,153
 738,724,549         340,608   739,065,157       1,120
 739,066,277         344,288   739,410,565       1,120
 739,411,557         340,481   739,752,038         992
 739,829,318           1,632   739,830,950      77,280
 739,841,126           1,632   739,842,758      10,176

Here are some additional questions I’m not able to answer:

  1. Why does the transition from cudaMemcpyAsync to cudaMemcpyToSymbolAsync still show a relatively large gap compared to the other transfers, even in the third test case?
  2. When using pinned memory, why does switching from non-async API variants to async API variants result in faster transfer rate (shorter duration)?

To go back to the original question: it seems that there is something about pageable transfers, and also something about the blocking nature of the APIs used, that results in increased gaps between the actual transfer activity when cudaMemcpy-style calls are issued “back to back”.

It also stands to reason that if the host CPU “matters”, then:

  • a faster host CPU may help
  • a heavily loaded host CPU may degrade the results

In my test case, I can state that I am the only user of the machine under test, so I believe the host CPU is not heavily loaded by unrelated tasks. It’s not a particularly fast CPU (Xeon E5-2630L v3, 1.80GHz), however. This is also a rather old server, so I believe it is only PCIe Gen3 capable; don’t read too much into the indicated transfer rates, as I don’t view them as the focus of the original question. (The L4 GPU I’m using is Gen4 capable, but the server is not.)

One of the things mentioned about pageable transfers is that they generally proceed by copying the data chunk-by-chunk into a pinned staging buffer, and then executing the actual transfer as a (potential) sequence of transfers from that pinned buffer. I believe this mechanism is part of the explanation for the lower transfer rate observed with pageable transfers vs. pinned transfers, but we could also speculate that the additional buffer handling impacts latency, and therefore the “gap” between one pageable transfer and the next; the sketch below illustrates the general idea. This article mentions this mechanism and may also be a good resource for the general topic here.
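
Here is a conceptual sketch of that staging mechanism (my rough approximation only, not the actual CUDA runtime implementation; the real runtime pipelines the staging through multiple buffers so the host copy and the DMA can overlap):

#include <cstring>

// Conceptual only: a pageable HtoD copy staged chunk-by-chunk through a
// pinned bounce buffer. The function and parameter names are invented.
void staged_pageable_copy(char *d_dst, const char *h_src_pageable, size_t bytes,
                          char *pinned_staging, size_t staging_bytes)
{
    while (bytes > 0) {
        size_t chunk = bytes < staging_bytes ? bytes : staging_bytes;
        memcpy(pinned_staging, h_src_pageable, chunk); // host copy into the pinned buffer
        cudaMemcpy(d_dst, pinned_staging, chunk,
                   cudaMemcpyHostToDevice);            // DMA from the pinned buffer
        h_src_pageable += chunk;
        d_dst += chunk;
        bytes -= chunk;
    }
}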
