I don’t have an exact answer, but I think we can state or demonstrate a few things:
- There is, or can be, significant CPU/host activity associated with a runtime API call.
- The nature of cudaMemcpy is that it is a synchronizing activity on the device side and a blocking activity on the host side. The host CPU thread cannot proceed beyond the point of the cudaMemcpy call until the API function (i.e. the transfer) has completed. (See the short timing sketch after this list.)
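To illustrate the blocking point concretely, here is a minimal standalone sketch (separate from the test case below; the buffer size and names are arbitrary) that times the host-side duration of a blocking copy vs. an async copy from pinned memory:

#include <iostream>
#include <chrono>

int main(){
  const size_t N = 256 * 1048576; // 256MB, arbitrary size for illustration
  char *h, *d;
  cudaHostAlloc(&h, N, cudaHostAllocDefault); // pinned, so cudaMemcpyAsync is truly asynchronous
  cudaMalloc(&d, N);
  auto t0 = std::chrono::steady_clock::now();
  cudaMemcpy(d, h, N, cudaMemcpyHostToDevice);      // host thread blocks until the transfer finishes
  auto t1 = std::chrono::steady_clock::now();
  cudaMemcpyAsync(d, h, N, cudaMemcpyHostToDevice); // host thread returns almost immediately
  auto t2 = std::chrono::steady_clock::now();
  cudaDeviceSynchronize();                          // wait for the async transfer before exiting
  std::cout << "blocking call: "
            << std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count()
            << "us, async call: "
            << std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count()
            << "us" << std::endl;
}

On most systems I would expect the second number to be much smaller than the first.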
Taking those things into account, if we could get the work issued asynchronously, we might see several performance-related benefits. To get to asynchronous work delivery, we will need to use pinned memory and async-style APIs. Here is a test case:
# cat t376.cu
#include <iostream>

using T = float;
const int num_checks = 4096;
__constant__ int d_first_i_from_c_const[num_checks];
__constant__ int d_check_degrees_const[num_checks];

int main(){
  T *d_prior_odds, *d_check_to_error, *d_error_to_check, *d_posterior_odds;
  int *d_i_from_j, *d_first_j_from_e, *d_error_degrees;
  int num_errors = 1048576;
  int num_edges = num_errors;
#ifndef USE_PINNED
  // pageable host allocations
  T *check_to_error = new T[num_edges];
  int *i_from_j = new int[num_edges];
  int *first_j_from_e = new int[num_errors];
  int *error_degrees = new int[num_errors];
  T *prior_odds = new T[num_errors];
  int *first_i_from_c = new int[num_checks];
  int *check_degrees = new int[num_checks];
#else
  // pinned host allocations
  T *check_to_error, *prior_odds;
  int *i_from_j, *first_j_from_e, *error_degrees, *first_i_from_c, *check_degrees;
  cudaHostAlloc(&check_to_error, num_edges*sizeof(T), cudaHostAllocDefault);
  cudaHostAlloc(&i_from_j, num_edges*sizeof(int), cudaHostAllocDefault);
  cudaHostAlloc(&first_j_from_e, num_errors*sizeof(int), cudaHostAllocDefault);
  cudaHostAlloc(&error_degrees, num_errors*sizeof(int), cudaHostAllocDefault);
  cudaHostAlloc(&prior_odds, num_errors*sizeof(T), cudaHostAllocDefault);
  cudaHostAlloc(&first_i_from_c, num_checks*sizeof(int), cudaHostAllocDefault);
  cudaHostAlloc(&check_degrees, num_checks*sizeof(int), cudaHostAllocDefault);
#endif
  // Allocate GPU memory
  cudaMalloc(&d_prior_odds, num_errors * sizeof(T));
  cudaMalloc(&d_check_to_error, num_edges * sizeof(T));
  cudaMalloc(&d_error_to_check, num_edges * sizeof(T));
  cudaMalloc(&d_i_from_j, num_edges * sizeof(int));
  cudaMalloc(&d_first_j_from_e, num_errors * sizeof(int));
  cudaMalloc(&d_error_degrees, num_errors * sizeof(int)); // error degrees
  cudaMalloc(&d_posterior_odds, num_errors * sizeof(T));
#ifndef USE_ASYNC
  // blocking copies
  cudaMemcpy(d_check_to_error, check_to_error, num_edges * sizeof(T), cudaMemcpyHostToDevice);
  cudaMemcpy(d_i_from_j, i_from_j, num_edges * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_first_j_from_e, first_j_from_e, num_errors * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_error_degrees, error_degrees, num_errors * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_prior_odds, prior_odds, num_errors * sizeof(T), cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_first_i_from_c_const, first_i_from_c, num_checks * sizeof(int));
  cudaMemcpyToSymbol(d_check_degrees_const, check_degrees, num_checks * sizeof(int));
#else
  // async copies, issued into the default stream
  cudaMemcpyAsync(d_check_to_error, check_to_error, num_edges * sizeof(T), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(d_i_from_j, i_from_j, num_edges * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(d_first_j_from_e, first_j_from_e, num_errors * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(d_error_degrees, error_degrees, num_errors * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(d_prior_odds, prior_odds, num_errors * sizeof(T), cudaMemcpyHostToDevice);
  cudaMemcpyToSymbolAsync(d_first_i_from_c_const, first_i_from_c, num_checks * sizeof(int));
  cudaMemcpyToSymbolAsync(d_check_degrees_const, check_degrees, num_checks * sizeof(int));
#endif
}
# nvcc -o t376 t376.cu
# nsys nvprof --print-gpu-trace ./t376
WARNING: t376 and any of its children processes will be profiled.
Generating '/tmp/nsys-report-1652.qdstrm'
[1/3] [========================100%] report60.nsys-rep
[2/3] [========================100%] report60.sqlite
[3/3] Executing 'cuda_gpu_trace' stats report
Start (ns) Duration (ns) CorrId GrdX GrdY GrdZ BlkX BlkY BlkZ Reg/Trd StcSMem (MB) DymSMem (MB) Bytes (MB) Throughput (MBps) SrcMemKd DstMemKd Device Ctx Strm Name
----------- ------------- ------ ---- ---- ---- ---- ---- ---- ------- ------------ ------------ ---------- ----------------- -------- -------- ------------- --- ---- ------------------
659,017,735 1,398,721 125 4.194 2,994.733 Pageable Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
660,843,944 1,392,738 126 4.194 3,011.510 Pageable Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
662,661,578 1,402,338 127 4.194 2,990.539 Pageable Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
664,490,476 1,392,961 128 4.194 3,007.316 Pageable Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
666,308,334 1,414,337 129 4.194 2,965.373 Pageable Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
668,020,815 1,920 130 0.016 8,533.328 Pageable Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
668,040,495 2,048 131 0.016 7,999.996 Pageable Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
Generated:
/root/bobc/report60.nsys-rep
/root/bobc/report60.sqlite
# nvcc -o t376 t376.cu -DUSE_PINNED
# nsys nvprof --print-gpu-trace ./t376
WARNING: t376 and any of its children processes will be profiled.
Generating '/tmp/nsys-report-3047.qdstrm'
[1/3] [========================100%] report61.nsys-rep
[2/3] [========================100%] report61.sqlite
[3/3] Executing 'cuda_gpu_trace' stats report
Start (ns) Duration (ns) CorrId GrdX GrdY GrdZ BlkX BlkY BlkZ Reg/Trd StcSMem (MB) DymSMem (MB) Bytes (MB) Throughput (MBps) SrcMemKd DstMemKd Device Ctx Strm Name
----------- ------------- ------ ---- ---- ---- ---- ---- ---- ------- ------------ ------------ ---------- ----------------- -------- -------- ------------- --- ---- ------------------
634,959,942 489,504 132 4.194 8,564.769 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
635,468,070 486,368 133 4.194 8,623.489 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
635,964,230 479,809 134 4.194 8,740.930 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
636,453,127 477,696 135 4.194 8,778.678 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
636,939,527 477,857 136 4.194 8,774.484 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
637,873,928 1,920 137 0.016 8,533.328 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
637,895,080 1,920 138 0.016 8,533.328 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
Generated:
/root/bobc/report61.nsys-rep
/root/bobc/report61.sqlite
# nvcc -o t376 t376.cu -DUSE_PINNED -DUSE_ASYNC
# nsys nvprof --print-gpu-trace ./t376
WARNING: t376 and any of its children processes will be profiled.
Generating '/tmp/nsys-report-d9e9.qdstrm'
[1/3] [========================100%] report62.nsys-rep
[2/3] [========================100%] report62.sqlite
[3/3] Executing 'cuda_gpu_trace' stats report
Start (ns) Duration (ns) CorrId GrdX GrdY GrdZ BlkX BlkY BlkZ Reg/Trd StcSMem (MB) DymSMem (MB) Bytes (MB) Throughput (MBps) SrcMemKd DstMemKd Device Ctx Strm Name
----------- ------------- ------ ---- ---- ---- ---- ---- ---- ------- ------------ ------------ ---------- ----------------- -------- -------- ------------- --- ---- ------------------
738,039,940 341,600 132 4.194 12,276.728 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
738,382,693 340,736 133 4.194 12,306.088 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
738,724,549 340,608 134 4.194 12,310.282 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
739,066,277 344,288 135 4.194 12,180.259 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
739,411,557 340,481 136 4.194 12,318.671 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
739,829,318 1,632 137 0.016 10,039.214 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
739,841,126 1,632 138 0.016 10,039.214 Pinned Device NVIDIA L4 (0) 1 7 [CUDA memcpy HtoD]
Generated:
/root/bobc/report62.nsys-rep
/root/bobc/report62.sqlite
#
(CUDA 12.2, L4 GPU)
The first test, with no compile switches, is roughly the case you provided. Sure enough we see that there are “large” gaps in between the actual transfer activity associated with the API calls. At the moment we don’t have many clues about those gaps.
The second test introduces pinned memory. This shortens the duration of the transfers, and for a number of the transfers it also significantly reduces the gap from the end of one transfer to the start of the next. But the gap from the last cudaMemcpy to the first cudaMemcpyToSymbol is still quite long.
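(As an aside: if restructuring existing allocations around cudaHostAlloc is inconvenient, a pageable allocation can also be pinned after the fact with cudaHostRegister. A minimal sketch of that alternative, reusing T and num_edges from the test case above; it is not something the test case itself does:)

  T *check_to_error = new T[num_edges];
  // pin the existing pageable allocation in place
  cudaHostRegister(check_to_error, num_edges * sizeof(T), cudaHostRegisterDefault);
  // ... use it as pinned memory for transfers ...
  cudaHostUnregister(check_to_error); // unpin before releasing
  delete [] check_to_error;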
The third test introduces async calls. This means the CPU thread no longer has to wait on a particular API call for the transfer to complete; it can “race ahead” and issue the remaining work, allowing some of the overhead associated with launching these operations to be amortized during time the host thread previously spent idle, waiting for a transfer to complete. In this case we see all gaps reduced compared to the previous cases.
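The same pattern extends naturally to explicit streams. Here is a minimal standalone sketch of the issue-everything-then-synchronize idiom (hypothetical buffers, not part of the test case):

#include <iostream>

int main(){
  const size_t bytes = 1048576 * sizeof(float);
  float *h_a, *h_b, *d_a, *d_b;
  cudaHostAlloc(&h_a, bytes, cudaHostAllocDefault); // pinned, required for true async behavior
  cudaHostAlloc(&h_b, bytes, cudaHostAllocDefault);
  cudaMalloc(&d_a, bytes);
  cudaMalloc(&d_b, bytes);
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // issue both transfers back to back; neither call blocks the host thread
  cudaMemcpyAsync(d_a, h_a, bytes, cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, stream);
  // the host thread is free to do unrelated setup work here
  cudaStreamSynchronize(stream); // wait only when the data is actually needed
  std::cout << "transfers complete" << std::endl;
}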
Here are the tabulated results (all times in ns; gap is measured from the end of the previous transfer to the start of the current one):

Case 1 - Original

| start | duration | end | gap |
| --- | --- | --- | --- |
| 659,017,735 | 1,398,721 | 660,416,456 | |
| 660,843,944 | 1,392,738 | 662,236,682 | 427,488 |
| 662,661,578 | 1,402,338 | 664,063,916 | 424,896 |
| 664,490,476 | 1,392,961 | 665,883,437 | 426,560 |
| 666,308,334 | 1,414,337 | 667,722,671 | 424,897 |
| 668,020,815 | 1,920 | 668,022,735 | 298,144 |
| 668,040,495 | 2,048 | 668,042,543 | 17,760 |

Case 2 - with pinned memory

| start | duration | end | gap |
| --- | --- | --- | --- |
| 634,959,942 | 489,504 | 635,449,446 | |
| 635,468,070 | 486,368 | 635,954,438 | 18,624 |
| 635,964,230 | 479,809 | 636,444,039 | 9,792 |
| 636,453,127 | 477,696 | 636,930,823 | 9,088 |
| 636,939,527 | 477,857 | 637,417,384 | 8,704 |
| 637,873,928 | 1,920 | 637,875,848 | 456,544 |
| 637,895,080 | 1,920 | 637,897,000 | 19,232 |

Case 3 - with pinned memory and async

| start | duration | end | gap |
| --- | --- | --- | --- |
| 738,039,940 | 341,600 | 738,381,540 | |
| 738,382,693 | 340,736 | 738,723,429 | 1,153 |
| 738,724,549 | 340,608 | 739,065,157 | 1,120 |
| 739,066,277 | 344,288 | 739,410,565 | 1,120 |
| 739,411,557 | 340,481 | 739,752,038 | 992 |
| 739,829,318 | 1,632 | 739,830,950 | 77,280 |
| 739,841,126 | 1,632 | 739,842,758 | 10,176 |
Here are some additional questions I’m not able to answer:
- Why does the transition from cudaMemcpyAsync to cudaMemcpyToSymbolAsync still show a relatively large gap compared to the other transfers, even in the third test case?
- When using pinned memory, why does switching from non-async API variants to async API variants result in a faster transfer rate (shorter duration)?
To go back to the original question: it seems that there is something about pageable transfers, and also something about the blocking nature of the API being used, that results in increased gaps between actual transfer activity when cudaMemcpy-style calls are issued “back to back”.
It also stands to reason that if the host CPU “matters”, then:
- a faster host CPU may help
- a heavily loaded host CPU may degrade the results
In my test case, I can state that I am the only user of the machine under test, so I believe the host CPU is not heavily loaded by unrelated tasks. It’s not a particularly fast CPU (Xeon E5-2630L v3, 1.80GHz), however. This is also a rather ancient server, which I believe is only PCIe Gen3 capable, so don’t read too much into the indicated transfer rates; I don’t view that as the focus of the original question. (The L4 GPU I’m using is a Gen4-capable GPU, but the server is not.)
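For reference, the negotiated vs. maximum PCIe link generation can be queried programmatically via NVML (link against nvidia-ml); a minimal sketch:

#include <cstdio>
#include <nvml.h>

int main(){
  nvmlInit();
  nvmlDevice_t dev;
  nvmlDeviceGetHandleByIndex(0, &dev);
  unsigned int cur = 0, max = 0;
  nvmlDeviceGetCurrPcieLinkGeneration(dev, &cur); // link generation currently negotiated
  nvmlDeviceGetMaxPcieLinkGeneration(dev, &max);  // max generation for this device/system combination
  printf("PCIe link generation: current %u, max %u\n", cur, max);
  nvmlShutdown();
}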
One of the things that is mentioned about pageable transfers is that they generally proceed by copying the data chunk-by-chunk into a pinned buffer, and then executing the transfer as a sequence of transfers out of that pinned buffer. I believe this general mechanism is part of the explanation for the lower transfer rate observed with pageable transfers vs. pinned transfers, but we could also speculate that the additional buffer handling could impact latency, and therefore the “gap” between one pageable transfer and the next. This article mentions this mechanism and may also be a good resource for the general topic here.
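To make the staging idea concrete, here is a rough sketch of what a pageable transfer might look like internally; the chunk size and the single (rather than double) staging buffer are my assumptions for illustration, not documented runtime behavior:

#include <cstring>

// illustrative only: copy pageable host data to the device via a pinned staging buffer
void staged_copy(void *dst_dev, const char *src_pageable, size_t n){
  const size_t CHUNK = 1048576; // assumed 1MB staging chunk
  char *staging;
  cudaHostAlloc(&staging, CHUNK, cudaHostAllocDefault); // pinned staging buffer
  for (size_t off = 0; off < n; off += CHUNK){
    size_t len = (n - off) < CHUNK ? (n - off) : CHUNK;
    memcpy(staging, src_pageable + off, len);       // CPU copies the chunk into the pinned buffer
    cudaMemcpy((char *)dst_dev + off, staging, len,
               cudaMemcpyHostToDevice);             // DMA from the pinned buffer to the device
  }
  cudaFreeHost(staging);
}

The real runtime presumably double-buffers so that the CPU copy of one chunk can overlap the DMA of the previous one, but even so, the extra per-transfer CPU work is consistent with both the lower throughput and the added latency observed above.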