concurrent D2H+H2D transfers?

Hi! I am trying to find out how to accomplish concurrent H2D + D2H transfers. This is with CUDA 7.5 (and CUDA 7.0) on a Xeon E5-2667 v3 based server with an Intel C610 chipset, with two TITAN X cards and one Tesla K40m installed. The value of cudaDeviceProp.asyncEngineCount indicates these cards have two DMA engines. Advertising material suggests that two asynchronous engines should allow simultaneous read and write transfers across the PCI Express interface, i.e. concurrent bidirectional (full-duplex) PCIe transfers. For example, the CUDA bandwidthTest example reports separate speeds of ~10 GB/s for reads and ~10 GB/s for writes, so I'd expect concurrent read+write to reach around ~20 GB/s aggregate, or somewhat less if host memory bandwidth becomes the limit.

According to a previous thread, H2D + D2H transfers become concurrent when they are initiated from separate streams (https://devtalk.nvidia.com/default/topic/813603/cuda-combining-h2d-and-d2h-memory-transfer-operations/)
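
If I understood that thread correctly, the recommended pattern boils down to pinned host buffers plus two non-default streams, roughly like the following sketch (placeholder buffer names, error checks omitted):

// Sketch: one H2D and one D2H copy issued into separate streams.
// Host buffers must be pinned (cudaHostAlloc / cudaMallocHost) for the
// copies to be truly asynchronous.
cudaStream_t s_h2d, s_d2h;
cudaStreamCreate(&s_h2d);
cudaStreamCreate(&s_d2h);
cudaMemcpyAsync(d_in,  h_in,  nbyte, cudaMemcpyHostToDevice, s_h2d); // H2D
cudaMemcpyAsync(h_out, d_out, nbyte, cudaMemcpyDeviceToHost, s_d2h); // D2H
cudaDeviceSynchronize();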

I wrote a test program, see below. The result was a bit surprising; it is also given below. My questions are:

  1. Any ideas why the K40m does not overlap the transfers? What is necessary in a CUDA 7.x program to make the K40m do transfers concurrently?

  2. Concurrent transfers on the TITAN X apparently work, but only at roughly half speed, meaning that in total they are not much faster than serial transfers. What might be causing this? Is it a TITAN X limitation? Or does it have something to do with the PCIe bus of the host…?

Results for H2D+D2H (near-)concurrent transfer on a TITAN X card:

==108594== NVPROF is profiling process 108594, command: ./membw2 0
CUDA Device #0 : GeForce GTX TITAN X, Compute Capability 5.2, data transfer concurrency, 2 DMA engines
 4 x 390.6 MiB : 0.1320 s GPU : 0.1321s CPU : 11.556 GB/s
==108594== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
3.37848s  63.582ms                    -               -         -         -         -  390.63MB  5.9996GB/s  GeForce GTX TIT         1        13  [CUDA memcpy HtoD]
3.38164s  65.698ms                    -               -         -         -         -  390.63MB  5.8064GB/s  GeForce GTX TIT         1        14  [CUDA memcpy HtoD]
3.44207s  65.696ms                    -               -         -         -         -  390.63MB  5.8066GB/s  GeForce GTX TIT         1        13  [CUDA memcpy HtoD]
3.44734s  63.147ms                    -               -         -         -         -  390.63MB  6.0410GB/s  GeForce GTX TIT         1        14  [CUDA memcpy HtoD]

When the same transfers are run serially on TITAN X:

==108516== NVPROF is profiling process 108516, command: ./membw2 2 1
CUDA Device #2 : Tesla K40m, Compute Capability 3.5, data transfer concurrency, 2 DMA engines
Changed Stream IDs to 0 to force running H2D, D2H xfers serially.
 4 x 390.6 MiB : 0.1553 s GPU : 0.1553s CPU : 9.828 GB/s
==108516== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
3.37517s  38.834ms                    -               -         -         -         -  390.63MB  9.8232GB/s   Tesla K40m (2)         1         7  [CUDA memcpy HtoD]
3.41400s  38.779ms                    -               -         -         -         -  390.63MB  9.8369GB/s   Tesla K40m (2)         1         7  [CUDA memcpy HtoD]
3.45279s  38.834ms                    -               -         -         -         -  390.63MB  9.8231GB/s   Tesla K40m (2)         1         7  [CUDA memcpy HtoD]
3.49162s  38.779ms                    -               -         -         -         -  390.63MB  9.8369GB/s   Tesla K40m (2)         1         7  [CUDA memcpy HtoD]

Concurrent transfers on K40m do not actually get carried out concurrently, at least judging from nvprof timestamps:

==109753== NVPROF is profiling process 109753, command: ./membw2 2
CUDA Device #2 : Tesla K40m, Compute Capability 3.5, data transfer concurrency, 2 DMA engines
 4 x 390.6 MiB : 0.1553 s GPU : 0.1553s CPU : 9.827 GB/s
==109753== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
3.38293s  38.837ms                    -               -         -         -         -  390.63MB  9.8223GB/s   Tesla K40m (2)         1        13  [CUDA memcpy HtoD]
3.42177s  38.782ms                    -               -         -         -         -  390.63MB  9.8362GB/s   Tesla K40m (2)         1        14  [CUDA memcpy HtoD]
3.46055s  38.837ms                    -               -         -         -         -  390.63MB  9.8224GB/s   Tesla K40m (2)         1        14  [CUDA memcpy HtoD]
3.49939s  38.782ms                    -               -         -         -         -  390.63MB  9.8362GB/s   Tesla K40m (2)         1        13  [CUDA memcpy HtoD]

(By the way, yes, nvprof reports four HtoD copies, even though the source code contains two cudaMemcpyDeviceToHost and two cudaMemcpyHostToDevice calls. Is that a bug in my program?)

// File: membw2.cu
// Attempts concurrent D2H+H2D memory transfer.
// $ nvprof  --profile-api-trace none --print-gpu-trace ./membw2

#include <cuda.h>
#include <helper_cuda.h>

#include <stdio.h>
#include <math.h>
#include <stdint.h>
#include <inttypes.h>
#include <sys/time.h>

int main(int argc, char** argv)
{
    int device = 0, run_serially = 0;
    cudaDeviceProp cuDev;
    size_t nbyte = 409600 * 1000;

    cudaEvent_t tstart, tstop;
    struct timeval tv_start, tv_stop;
    float dt_msec, dt_cpu;

    float *d_idata, *h_idata;
    float *d_odata, *h_odata;
    cudaStream_t s[2];

    if (argc > 1) {
        device = atoi(argv[1]);
    }
    if (argc > 2) {
        run_serially = atoi(argv[2]);
    }

    checkCudaErrors( cudaSetDevice(device) );
    checkCudaErrors( cudaEventCreate( &tstart ) );
    checkCudaErrors( cudaEventCreate( &tstop ) );
    checkCudaErrors( cudaGetDeviceProperties(&cuDev, device) );
    printf("CUDA Device #%d : %s, Compute Capability %d.%d, %s, %d DMA engines\n",
        device, cuDev.name, cuDev.major, cuDev.minor,
        cuDev.deviceOverlap ? "data transfer concurrency" : "no data transfer concurrency",
        cuDev.deviceOverlap ? cuDev.asyncEngineCount : 1
    );

    checkCudaErrors( cudaStreamCreate(&s[0]) );
    checkCudaErrors( cudaStreamCreate(&s[1]) );
    checkCudaErrors( cudaMalloc( (void **)&d_idata, nbyte ) );
    checkCudaErrors( cudaMalloc( (void **)&d_odata, nbyte ) );
    checkCudaErrors( cudaHostAlloc( (void **)&h_idata, nbyte, cudaHostAllocDefault ) );
    checkCudaErrors( cudaHostAlloc( (void **)&h_odata, nbyte, cudaHostAllocDefault ) );

    if (run_serially) {
        s[0] = 0; s[1] = 0;
        printf("Changed Stream IDs to 0 to force running H2D, D2H xfers serially.\n");
    }

    gettimeofday(&tv_start, NULL);
    checkCudaErrors( cudaEventRecord(tstart) );
    // H2D+D2H
    checkCudaErrors( cudaMemcpyAsync(d_idata, h_idata, nbyte, cudaMemcpyHostToDevice, s[0]) );
    checkCudaErrors( cudaMemcpyAsync(d_odata, h_odata, nbyte, cudaMemcpyDeviceToHost, s[1]) );
    // H2D+D2H once more
    checkCudaErrors( cudaMemcpyAsync(d_idata, h_idata, nbyte, cudaMemcpyHostToDevice, s[0]) );
    checkCudaErrors( cudaMemcpyAsync(d_odata, h_odata, nbyte, cudaMemcpyDeviceToHost, s[1]) );
    checkCudaErrors( cudaEventRecord(tstop) );
    checkCudaErrors( cudaEventSynchronize(tstop) );
    checkCudaErrors( cudaEventElapsedTime(&dt_msec, tstart, tstop) );

    gettimeofday(&tv_stop, NULL);
    dt_cpu = (tv_stop.tv_sec - tv_start.tv_sec) + 1e-6*(tv_stop.tv_usec - tv_start.tv_usec);

    double_t L_mib = nbyte/1048576.0;
    double R_total = 4 * nbyte/(dt_msec*1e-3*1024.0*1048576.0); // 4 x nbyte were copied
    printf(" 4 x %.1f MiB : %.4f s GPU : %.4fs CPU : %.3f GB/s\n",
         L_mib, dt_msec*1e-3, dt_cpu, R_total
    );

    return 0;
}

One editing mistake: the second result above (TITAN X, H2D and D2H run non-concurrently) should have been:

==121147== NVPROF is profiling process 121147, command: ./membw2 0 1
CUDA Device #0 : GeForce GTX TITAN X, Compute Capability 5.2, data transfer concurrency, 2 DMA engines
Changed Stream IDs to 0 to force running H2D, D2H xfers serially.
 4 x 390.6 MiB : 0.1317 s GPU : 0.1317s CPU : 11.589 GB/s
==121147== Profiling application: ./membw2 0 1
==121147== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
3.37377s  32.910ms                    -               -         -         -         -  390.63MB  11.591GB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
3.40669s  32.909ms                    -               -         -         -         -  390.63MB  11.592GB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
3.43960s  32.909ms                    -               -         -         -         -  390.63MB  11.592GB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
3.47251s  32.909ms                    -               -         -         -         -  390.63MB  11.592GB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]

The outcome is the same either way: the TITAN X concurrent transfer (~11.6 GB/s total, as in post #1) is roughly as fast as the non-concurrent transfers (~11.6 GB/s total, as above). Now, how to achieve ~20 GB/s concurrently…?

Your system memory should be fast enough to deal with PCIe gen3 full duplex speeds. I don’t see any obvious bugs in your code, but then I am generally bad at spotting mistakes in other people’s code just by reading it. Some thoughts:

(1) What operating system is running?

(2) How are the three GPUs physically mapped to the available PCIe lanes? Is the link you are testing actually operating as PCIe gen3 x16 while the app is running? nvidia-smi should show the link state.

(3) Is this a dual-socket system? If so, make sure to control CPU and memory affinity so that the GPU communicates with the "near" CPU and its attached "near" memory.

(4) I would suggest running such benchmarks without the profiler, to avoid possible interference from profiler hooks.

(5) It is customary for bandwidth tests to report the fastest transfer from among ten measurements (see the STREAM benchmark methodology). This smooths out performance variations that are normal in memory subsystems (akin to thermal noise).

(6) Instead of measuring the speed across just one pair of transfers, I would suggest setting up a loop that gets the transfers into a "steady state" and measuring the throughput there; 100 transfers total is a good starting point. This approach eliminates the impact of start-up overheads (see the sketch after this list).

(7) Prior to the start of measurements, ensure the GPU is idle, e.g. by a call to cudaDeviceSynchronize().
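
Putting (5) through (7) together, the timing loop could look roughly like the sketch below; do_transfer_pair() is just a placeholder for whatever H2D/D2H calls are being measured, and error checks are omitted:

cudaEvent_t t0, t1;
cudaEventCreate(&t0);
cudaEventCreate(&t1);
cudaDeviceSynchronize();                 // (7) start from an idle GPU
float best_ms = 1e30f;
for (int run = 0; run < 10; run++) {     // (5) keep the fastest of several runs
    cudaEventRecord(t0);
    for (int k = 0; k < 100; k++) {      // (6) enough repetitions for a steady state
        do_transfer_pair();              // placeholder: issues one H2D + one D2H copy
    }
    cudaEventRecord(t1);
    cudaEventSynchronize(t1);
    float ms;
    cudaEventElapsedTime(&ms, t0, t1);
    ms /= 100.0f;                        // average time per transfer pair in this run
    if (ms < best_ms) best_ms = ms;
}
// reported bandwidth = 2 * nbyte / (best_ms * 1e-3) bytes per second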

You have defects in your code. Given that you appear to be doing error checking, I suspect a problem with your error checking method. The defects are as follows:

checkCudaErrors( cudaMalloc( (void **)&d_idata, nbyte ) );
    checkCudaErrors( cudaMalloc( (void **)&d_odata, nbyte ) );

As we can see, the pointers d_idata and d_odata are device pointers; likewise, your corresponding h_ pointers are host pointers. That is all well and good, no problem yet.

// H2D+D2H
    checkCudaErrors( cudaMemcpyAsync(d_idata, h_idata, nbyte, cudaMemcpyHostToDevice, s[0]) );
*** checkCudaErrors( cudaMemcpyAsync(d_odata, h_odata, nbyte, cudaMemcpyDeviceToHost, s[1]) );
                                       ^         ^
                                       |         supposed to be a device pointer
                                       supposed to be a host pointer
    // H2D+D2H once more
    checkCudaErrors( cudaMemcpyAsync(d_idata, h_idata, nbyte, cudaMemcpyHostToDevice, s[0]) );
*** checkCudaErrors( cudaMemcpyAsync(d_odata, h_odata, nbyte, cudaMemcpyDeviceToHost, s[1]) );

In terms of the order of the first two parameters, cudaMemcpyAsync is just like cudaMemcpy, which is just like memcpy: the first parameter is the destination pointer and the second parameter is the source pointer. For cudaMemcpy operations, the order of these parameters must match the stated direction. But in the (***) cases above, it does not.
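
With your variable names and the stated directions, the calls would have to look like this (sketch):

// destination first, source second; the kind argument then matches
cudaMemcpyAsync(d_idata, h_idata, nbyte, cudaMemcpyHostToDevice, s[0]); // H2D: device dst, host src
cudaMemcpyAsync(h_odata, d_odata, nbyte, cudaMemcpyDeviceToHost, s[1]); // D2H: host dst, device src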

CUDA should throw a runtime error in that scenario. The fact that you are not getting a runtime error (or at least haven’t mentioned it) leads me to believe there is a flaw in your error checking process, or else this is not the code you are running.
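
For reference, a minimal error-checking macro along the lines of what helper_cuda.h's checkCudaErrors does would be something like this (sketch only):

#include <cstdio>
#include <cstdlib>

// Abort with a message if a runtime API call does not return cudaSuccess.
#define CHECK_CUDA(call)                                                \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);      \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)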

Thanks to both of you for your replies! Sorry to check back so late. This is under 64-bit CentOS 7. The output of nvidia-smi lists "PCIe Generation : Current: 3" and gives "Link Width : Current : 16x". Good catch on my messing up the pointers in cudaMemcpyAsync!! Interestingly, both the wrong and the correct calls,

cudaError_t rc1, rc2;
cudaMalloc((void **)&d_odata, nbyte);
cudaHostAlloc((void **)&h_odata, nbyte, cudaHostAllocDefault);
rc1 = cudaMemcpyAsync(d_odata, h_odata, nbyte, cudaMemcpyDeviceToHost, s[1]); // wrong args
rc2 = cudaMemcpyAsync(h_odata, d_odata, nbyte, cudaMemcpyDeviceToHost, s[1]); // correct args
printf("cudaMemcpyAsync(devptr=%016p,   hptr=%016p, dir=d->h) : %d\n", d_odata, h_odata, rc1);
printf("cudaMemcpyAsync(hptr  =%016p, devptr=%016p, dir=d->h) : %d\n", h_odata, d_odata, rc2);

both return cudaSuccess; the printout is

cudaMemcpyAsync(devptr=0x0000231f320000,   hptr=0x0000021cba0000, dir=d->h) : 0
cudaMemcpyAsync(hptr  =0x0000021cba0000, devptr=0x0000231f320000, dir=d->h) : 0

And in nvprof (I've removed some columns):

==21674== Profiling result:
   Start  Duration         Size  Throughput           Device   Context    Stream  Name
2.80189s  22.830us     390.63MB   2e+04GB/s  GeForce GTX TIT         1        14  [CUDA memcpy HtoD]
2.80192s  6.0750us     390.63MB   6e+04GB/s  GeForce GTX TIT         1        14  [CUDA memcpy DtoH]

From that it seems cudaMemcpyAsync() in CUDA 7.5 might ignore the direction argument and instead determine the direction from the passed pointers. On the other hand, the documentation says that passing "pointers that do not match the direction of the copy results in an undefined behavior".
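
(For what it's worth, passing cudaMemcpyDefault instead of an explicit direction is the documented way to let the runtime deduce the direction from the pointer values under unified virtual addressing, roughly:)

// With UVA, cudaMemcpyDefault tells the runtime to infer the copy direction
// from the pointer values, avoiding a possibly mismatched kind argument.
cudaMemcpyAsync(d_idata, h_idata, nbyte, cudaMemcpyDefault, s[0]); // resolves to H2D
cudaMemcpyAsync(h_odata, d_odata, nbyte, cudaMemcpyDefault, s[1]); // resolves to D2H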

Lastly, I fixed the pointers in the original benchmark code as txbob indicated, implemented most of njuffa's remarks, and re-ran the benchmark. Unfortunately we had changed the server since the original benchmark run, so the results are not directly comparable. On the new server (a 1U-high Supermicro 1028GQ with four TITAN X cards) the system architecture is a bit different. Anyway, with the wrong pointer order in cudaMemcpyAsync() the mean throughput is similar to before, and it is much better when the correct pointer order is used:

serial       : 2 x 390.6 MiB : 0.0639 s GPU : 0.0639s CPU : 11.936 GB/s
wrong ptrs   : 2 x 390.6 MiB : 0.0660 s GPU : 0.0660s CPU : 11.560 GB/s
correct ptrs : 2 x 390.6 MiB : 0.0370 s GPU : 0.0370s CPU : 20.639 GB/s

Thanks for your help!

(Perhaps it is worth providing the corrected and updated code here…)

// File: membw2.cu
// Attempts concurrent D2H+H2D memory transfer.
// $ nvprof  --profile-api-trace none --print-gpu-trace ./membw2  <device> <0=parallel|1=serial>

#include <cuda.h>
#include <helper_cuda.h>

#include <stdio.h>
#include <math.h>
#include <stdint.h>
#include <inttypes.h>
#include <sys/time.h>

int main(int argc, char** argv)
{
    int device = 0, run_serially = 0, max_iter = 100;
    cudaDeviceProp cuDev;
    size_t nbyte = 409600 * 1000;

    cudaEvent_t tstart, tstop;
    cudaEvent_t tstart_sub, tstop_sub;
    struct timeval tv_start, tv_stop;
    float dt_msec, dt_msec_sub = 1e7, dt_cpu;

    float *d_idata, *h_idata;
    float *d_odata, *h_odata;
    cudaStream_t s[2];

    if (argc > 1) {
        device = atoi(argv[1]);
    }
    if (argc > 2) {
        run_serially = atoi(argv[2]);
    }

    checkCudaErrors( cudaSetDevice(device) );
    checkCudaErrors( cudaEventCreate( &tstart ) );
    checkCudaErrors( cudaEventCreate( &tstop ) );
    checkCudaErrors( cudaEventCreate( &tstart_sub ) );
    checkCudaErrors( cudaEventCreate( &tstop_sub ) );
    checkCudaErrors( cudaGetDeviceProperties(&cuDev, device) );
    printf("CUDA Device #%d : %s, Compute Capability %d.%d, %s, %d DMA engines\n",
        device, cuDev.name, cuDev.major, cuDev.minor,
        cuDev.deviceOverlap ? "data transfer concurrency" : "no data transfer concurrency",
        cuDev.deviceOverlap ? cuDev.asyncEngineCount : 1
    );

    checkCudaErrors( cudaStreamCreate(&s[0]) );
    checkCudaErrors( cudaStreamCreate(&s[1]) );
    checkCudaErrors( cudaMalloc( (void **)&d_idata, nbyte ) );
    checkCudaErrors( cudaMalloc( (void **)&d_odata, nbyte ) );
    checkCudaErrors( cudaHostAlloc( (void **)&h_idata, nbyte, cudaHostAllocDefault ) ); // host->dev
    checkCudaErrors( cudaHostAlloc( (void **)&h_odata, nbyte, cudaHostAllocDefault ) ); // dev->host

    double_t L_mib = nbyte/1048576.0;

    if (run_serially) {
        s[0] = 0; s[1] = 0;
        printf("Changed Stream IDs to 0 to force running H2D, D2H xfers serially.\n");
    }

    gettimeofday(&tv_start, NULL);
    checkCudaErrors( cudaDeviceSynchronize() );
    checkCudaErrors( cudaEventRecord(tstart) );

    for (int i = 0; i < max_iter; i+=2) {
        float dt_msec_tmp;
        // We make four calls for transfers here so that the code can be easily edited
        // and the order/selection of CUDA streams can be changed!
        // H2D+D2H : first time
        checkCudaErrors( cudaEventRecord(tstart_sub) );
        checkCudaErrors( cudaMemcpyAsync(d_idata, h_idata, nbyte, cudaMemcpyHostToDevice, s[0]) );
        checkCudaErrors( cudaMemcpyAsync(h_odata, d_odata, nbyte, cudaMemcpyDeviceToHost, s[1]) );
        // H2D+D2H : second time
        checkCudaErrors( cudaMemcpyAsync(d_idata, h_idata, nbyte, cudaMemcpyHostToDevice, s[0]) );
        checkCudaErrors( cudaMemcpyAsync(h_odata, d_odata, nbyte, cudaMemcpyDeviceToHost, s[1]) );
        checkCudaErrors( cudaEventRecord(tstop_sub) );
        checkCudaErrors( cudaEventSynchronize(tstop_sub) );
        checkCudaErrors( cudaEventElapsedTime(&dt_msec_tmp, tstart_sub, tstop_sub) );
        dt_msec_tmp /= 2.0f;
        if (dt_msec_tmp < dt_msec_sub) {
            dt_msec_sub = dt_msec_tmp;
        }
        printf(
            "%d/%d : %.4f ms : best so far %.4f ms per transfer pair = %.3f GB/s\n",
            i, max_iter, dt_msec_tmp, dt_msec_sub, 2*nbyte/(dt_msec_sub*1e-3*1024.0*1048576.0)
        );
    }

    checkCudaErrors( cudaEventRecord(tstop) );
    checkCudaErrors( cudaEventSynchronize(tstop) );
    checkCudaErrors( cudaEventElapsedTime(&dt_msec, tstart, tstop) );
    dt_msec /= max_iter;

    gettimeofday(&tv_stop, NULL);
    dt_cpu = (tv_stop.tv_sec - tv_start.tv_sec) + 1e-6*(tv_stop.tv_usec - tv_start.tv_usec);
    dt_cpu /= max_iter;

    double R_total = 2*nbyte/(dt_msec*1e-3*1024.0*1048576.0); // 2 x nbyte were copied per H2D+D2H pair
    printf(" 2 x %.1f MiB : %.4f s GPU : %.4fs CPU : %.3f GB/s\n",
         L_mib, dt_msec*1e-3, dt_cpu, R_total
    );

    return 0;
}