cudaMemcpy latency unusually high on some machines

Hello,

I have been noticing that the latency associated with cudaMemcpy calls can be very different from one machine to another. To investigate this, I created the following example, which evolves 2D heat diffusion on a square grid of a given edge size for a given number of steps. It has only one simple kernel, and optionally I can introduce a (synchronous) cudaMemcpy from device to host and/or from host to device at each time step.

#include <stdio.h>
#include <time.h>
#include <cuda.h>
#include <omp.h>
#include <windows.h>

#ifdef DP
    #define REAL double
#else
    #define REAL float
#endif

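// Jacobi-style averaging stencil: each interior cell of the nEdge x nEdge grid
// is replaced by the mean of its four neighbours (boundary rows/columns are untouched).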
__global__ void avg1(
    const int nEdge,
    const REAL *const yin,
    REAL *const yout)
{
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    const int j = blockIdx.y*blockDim.y + threadIdx.y;

    if(   i > 0 && i < nEdge - 1
       && j > 0 && j < nEdge - 1)
    {
        const int icc = i*nEdge + j;

        const REAL yn1 = yin[icc - nEdge];
        const REAL yn2 = yin[icc + nEdge];
        const REAL yn3 = yin[icc - 1];
        const REAL yn4 = yin[icc + 1];

        yout[icc] = (yn1 + yn2 + yn3 + yn4)/4.;
    }
}

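// 1D copy kernel: copies n elements from src to dest; used for the halo exchange
// when direct peer-to-peer access between devices is enabled (peerToPeerType 2).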
__global__ void kernelCpy(
    REAL *const dest,
    REAL *const src,
    const int n)
{
    const int i = blockIdx.x*blockDim.x + threadIdx.x;

    if(i < n)
        dest[i] = src[i];
}

void swap(
    REAL*& d_y1,
    REAL*& d_y2)
{
    REAL *const tmp = d_y1;
    d_y1 = d_y2;
    d_y2 = tmp;
}

void printUsage()
{
    printf("Usage: GPUPerformanceXP.exe peerToPeerType memCpy nEdge nSteps devId0 devId1 ...\n");
    printf("       peerToPeerType 0 = data transfer via CPU\n");
    printf("       peerToPeerType 1 = data transfer using cudaMemcpyDeviceToDevice\n");
    printf("       peerToPeerType 2 = data transfer using kernels where peerToPeer access is enabled\n");
    printf("       memCpy 0 = no additional CPU-GPU memory copies\n");
    printf("       memCpy 1 = one cudaMemcpyDeviceToHost per step\n");
    printf("       memCpy 2 = one cudaMemcpyHostToDevice per step\n");
    printf("       memCpy 3 = both 1 and 2\n");
}

#define MAXDEV 16

int main(
    const int argc,
    const char *const argv[])
{
    int   peerToPeerType;
    int   memCopy;
    int   nEdge;
    int   nSteps;
    int   nDevices;
    int   devIds[MAXDEV];
    REAL* y[MAXDEV];
    REAL* d_y1[MAXDEV];
    REAL* d_y2[MAXDEV];
//    cudaError_t err;
    dim3 nThreadsPerBlock1D(32);
    dim3 nBlocks1D;
    dim3 nThreadsPerBlock2D(16, 16);
    dim3 nBlocks2D;

    SYSTEMTIME sysTime1, sysTime2;

    nDevices = argc - 5;
    
    if(nDevices < 1)
    {
        printUsage();
        return 0;
    }

    if(nDevices > MAXDEV)
    {
        printf("ERROR: Currently this test only supports up to %i devices\n", MAXDEV);
        return 0;
    }

    if(sscanf(argv[1], "%i", &peerToPeerType) != 1)
    {
        printf("ERROR: Expecting peerToPeerType to be an integer\n");
        return 0;
    }

    if(sscanf(argv[2], "%i", &memCopy) != 1)
    {
        printf("ERROR: Expecting memCopy to be an integer\n");
        return 0;
    }

    if(sscanf(argv[3], "%i", &nEdge) != 1)
    {
        printf("ERROR: Expecting nEdge to be an integer\n");
        return 0;
    }

    if(sscanf(argv[4], "%i", &nSteps) != 1)
    {
        printf("ERROR: Expecting nSteps to be an integer\n");
        return 0;
    }

    for(int i = 0; i < nDevices; i++)
        if(sscanf(argv[i + 5], "%i", devIds + i) != 1)
        {
            printf("ERROR: Expecting devIds to be integers\n");
            return 0;
        }

    // cap at 32768 so that nEdge*nEdge and the flattened indices in the kernels
    // still fit in a signed 32-bit int
    if(nEdge < 16 || nEdge > 32768)
    {
        printf("ERROR: Expecting nEdge to be in range 16 ... 32768\n");
        return 0;
    }

    if(nSteps < 1)
    {
        printf("ERROR: Expecting nSteps to be > 0\n");
        return 0;
    }

    nBlocks1D.x = (nEdge + nThreadsPerBlock1D.x - 1)/nThreadsPerBlock1D.x;

    nBlocks2D.x = (nEdge + nThreadsPerBlock2D.x - 1)/nThreadsPerBlock2D.x;
    nBlocks2D.y = (nEdge + nThreadsPerBlock2D.y - 1)/nThreadsPerBlock2D.y;

    {
        int count;
        if(cudaGetDeviceCount(&count) != cudaSuccess)
        {
            printf("ERROR: Unable to determine number of cuda devices - check GPU driver\n");
            return 0;
        }

        printf("%i cuda devices found\n", count);

        for(int i = 0; i < nDevices; i++)
        {
            if(devIds[i] < 0 || devIds[i] >= count)
            {
                printf("ERROR: devId[%i] must be in range 0 ... %i\n", i, count - 1);
                return 0;
            }
        }
    }

    int* peerToPeerAccess = new int[nDevices*nDevices];
    for(int i = 0; i < nDevices*nDevices; i++)
        peerToPeerAccess[i] = 0;

    if(peerToPeerType > 0)
    {
        for(int i = 0; i < nDevices; i++)
        {
            int res;

            if(i > 0)
            {
                const int j = i - 1;
                int canAccessPeer = 0;

                res = cudaDeviceCanAccessPeer(&canAccessPeer, devIds[i], devIds[j]);
                if(res != cudaSuccess)
                    printf("cudaDeviceCanAccessPeer %i to %i failed with return code %i\n", devIds[i], devIds[j], res);

                if(canAccessPeer)
                {
                    printf("enabling peer to peer access dev %i to dev %i\n", devIds[i], devIds[j]);
                    cudaSetDevice(devIds[i]);
                    res = cudaDeviceEnablePeerAccess(devIds[j], 0);
                    if(res == cudaSuccess)
                        peerToPeerAccess[i*nDevices + j] = canAccessPeer;
                    else
                        printf("cudaDeviceEnablePeerAccess failed with return code %i\n", res);
                }
                else
                    printf("peer to peer access dev %i to dev %i not possible\n", devIds[i], devIds[j]);
            }

            if(i < nDevices - 1)
            {
                const int j = i + 1;
                int canAccessPeer = 0;

                res = cudaDeviceCanAccessPeer(&canAccessPeer, devIds[i], devIds[j]);
                if(res != cudaSuccess)
                    printf("cudaDeviceCanAccessPeer %i to %i failed with return code %i\n", devIds[i], devIds[j], res);

                if(canAccessPeer)
                {
                    printf("enabling peer to peer access dev %i to dev %i\n", devIds[i], devIds[j]);
                    cudaSetDevice(devIds[i]);
                    res = cudaDeviceEnablePeerAccess(devIds[j], 0);
                    if(res == cudaSuccess)
                        peerToPeerAccess[i*nDevices + j] = canAccessPeer;
                    else
                        printf("cudaDeviceEnablePeerAccess failed with return code %i\n", res);
                }
                else
                    printf("peer to peer access dev %i to dev %i not possible\n", devIds[i], devIds[j]);
            }
        }
    }

    printf("peerToPeerAccess:\n");
    for(int i = 0; i < nDevices; i++)
    {
        printf("%i", peerToPeerAccess[i*nDevices + 0]);
        for(int j = 1; j < nDevices; j++)
            printf(" %i", peerToPeerAccess[i*nDevices + j]);
        printf("\n");
    }
    
    for(int tid = 0; tid < nDevices; tid++)
    {
        if(cudaSetDevice(devIds[tid]) != cudaSuccess)
        {
            printf("ERROR: Unable to select device %i\n", devIds[tid]);
            return 0;
        }

        cudaMallocHost(y + tid, nEdge*nEdge*sizeof(REAL)); 

        cudaMalloc(d_y1 + tid, nEdge*nEdge*sizeof(REAL));
        cudaMalloc(d_y2 + tid, nEdge*nEdge*sizeof(REAL));
        if(d_y1[tid] == NULL || d_y2[tid] == NULL)
        {
            printf("ERROR: Unable to allocate memory on device %i\n", devIds[tid]);
            return 0;
        }
    }

    omp_set_num_threads(nDevices);

    #pragma omp parallel
    {
        const int tid = omp_get_thread_num();

        cudaSetDevice(devIds[tid]);

        printf("Initialising host memory for thread %i\n", tid);
        memset(y[tid], 0, nEdge*nEdge*sizeof(REAL));
        
        if(tid == nDevices - 1)
            for(int j = 0; j < nEdge; j++)
                y[tid][(nEdge - 1)*nEdge + j] = 1.0;

        printf("Copying data to GPU %i\n", devIds[tid]);
        cudaMemcpy(d_y2[tid], y[tid], nEdge*nEdge*sizeof(REAL), cudaMemcpyHostToDevice);
        cudaMemcpy(d_y1[tid], y[tid], nEdge*nEdge*sizeof(REAL), cudaMemcpyHostToDevice);
    }

    GetSystemTime(&sysTime1);

    #pragma omp parallel
    {
        const int tid = omp_get_thread_num();

        for(int iStep = 0; iStep < nSteps; iStep++)
        {
            avg1<<<nBlocks2D, nThreadsPerBlock2D>>>(nEdge, d_y1[tid], d_y2[tid]);
            cudaStreamSynchronize(NULL);

            #pragma omp barrier

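            // Exchange halo rows with the neighbouring devices using the selected method.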
            switch(peerToPeerType)
            {
                case 0:
                    // copy via host RAM
                    if(tid < nDevices - 1)
                        cudaMemcpy(y[tid] + (nEdge - 2)*nEdge, d_y2[tid] + (nEdge - 2)*nEdge, nEdge*sizeof(REAL), cudaMemcpyDeviceToHost);
                    if(tid > 0)
                        cudaMemcpy(y[tid] +          1 *nEdge, d_y2[tid] +          1 *nEdge, nEdge*sizeof(REAL), cudaMemcpyDeviceToHost);

                    #pragma omp barrier

                    if(tid < nDevices - 1)
                        cudaMemcpy(d_y2[tid] + (nEdge - 1)*nEdge, y[tid + 1] +          1 *nEdge, nEdge*sizeof(REAL), cudaMemcpyHostToDevice);
                    if(tid > 0)
                        cudaMemcpy(d_y2[tid] +          0 *nEdge, y[tid - 1] + (nEdge - 2)*nEdge, nEdge*sizeof(REAL), cudaMemcpyHostToDevice);
                    break;
                case 1:
                    // cudaMemcpyDeviceToDevice
                    if(tid < nDevices - 1)
                        cudaMemcpy(d_y2[tid] + (nEdge - 1)*nEdge, d_y2[tid + 1] +          1 *nEdge, nEdge*sizeof(REAL), cudaMemcpyDeviceToDevice);
                    if(tid > 0)
                        cudaMemcpy(d_y2[tid] +          0 *nEdge, d_y2[tid - 1] + (nEdge - 2)*nEdge, nEdge*sizeof(REAL), cudaMemcpyDeviceToDevice);
                    break;
                case 2:
                    // use kernel to copy
                    if(tid < nDevices - 1)
                    {
                        if(peerToPeerAccess[tid*nDevices + tid + 1])
                            kernelCpy<<<nBlocks1D, nThreadsPerBlock1D>>>(d_y2[tid] + (nEdge - 1)*nEdge, d_y2[tid + 1] +          1 *nEdge, nEdge);
                        else
                            cudaMemcpy(d_y2[tid] + (nEdge - 1)*nEdge, d_y2[tid + 1] +          1 *nEdge, nEdge*sizeof(REAL), cudaMemcpyDeviceToDevice);
                    }
                    if(tid > 0)
                    {
                        if(peerToPeerAccess[tid*nDevices + tid - 1])
                            kernelCpy<<<nBlocks1D, nThreadsPerBlock1D>>>(d_y2[tid] +          0 *nEdge, d_y2[tid - 1] + (nEdge - 2)*nEdge, nEdge);
                        else
                            cudaMemcpy(d_y2[tid] +          0 *nEdge, d_y2[tid - 1] + (nEdge - 2)*nEdge, nEdge*sizeof(REAL), cudaMemcpyDeviceToDevice);
                    }
                    break;
                default:
                    break;
            }

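            // Optional synchronous per-step copies (a single REAL each) used to
            // measure the latency of cudaMemcpy, independent of bandwidth.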
            if((memCopy & 0x00000001) > 0)
                cudaMemcpy(   y[tid], d_y2[tid], sizeof(REAL), cudaMemcpyDeviceToHost);
            if((memCopy & 0x00000002) > 0)
                cudaMemcpy(d_y1[tid],    y[tid], sizeof(REAL), cudaMemcpyHostToDevice);

            swap(d_y1[tid], d_y2[tid]);
        }

        cudaMemcpy(y[tid], d_y1[tid], nEdge*nEdge*sizeof(REAL), cudaMemcpyDeviceToHost);
    }

    GetSystemTime(&sysTime2);
    
    {
        int wallTime = 3600000*(int)(sysTime2.wHour         - sysTime1.wHour);
        wallTime    +=   60000*(int)(sysTime2.wMinute       - sysTime1.wMinute);
        wallTime    +=    1000*(int)(sysTime2.wSecond       - sysTime1.wSecond);
        wallTime    +=         (int)(sysTime2.wMilliseconds - sysTime1.wMilliseconds);

        printf("Wall time = %i ms\n", wallTime);
    }

    printf("\n");
    for(int tid = 0; tid < nDevices; tid++)
    {
        cudaSetDevice(devIds[tid]);

        cudaFree(d_y1[tid]);
        cudaFree(d_y2[tid]);

        cudaFreeHost(y[tid]);
    }

    return 0;
}

I run the model with different grid sizes and measure the wall time. Plotting time vs number of cells produces a near-linear plot, where the y-intercept can be considered the latency associated with kernel launches. When adding in the synchronous cudaMemcpys, the y-intercept increases as expected, with the delta being the latency associated with the copy. Note that the copy only moves one 4-byte float, so bandwidth is unimportant - the code is only testing latency.
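For example, a single-GPU run on device 0 with a 4096 x 4096 grid (size chosen just for illustration), 10,000 steps, and both per-step copies enabled (memCpy 3) is launched as:

GPUPerformanceXP.exe 0 3 4096 10000 0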

Now the interesting thing is when I run this test on some different machines and determine the delta associated with introducing the cudaMemcpys, I get:


The best results are from the two Linux machines. The six Windows machines are not so good, probably because of the WDDM driver model (and I can’t put those cards into TCC). But two of the Windows machines are showing latencies that are 10x those of the Linux machines, and 3x worse than some of the other Windows machines. This is hurting me for small models and for multi-GPU models with halo exchanges over the PCIe bus. What factors or settings could be causing this?

Greg.

In all these systems, is the GPU attached via a PCIe gen 4 x16 link? Older PCIe generations and narrower links will have a negative performance impact.

If any of these machines have dual CPU sockets, or use a CPU made up from multiple chiplets internally, make sure to configure processor and memory affinity such that each GPU communicates with the “near” CPU cores and their attached “near” system memory. You would use a tool like numactl to do so.

Having to route memory traffic between GPU and system memory through an inter-socket or inter-chiplet communication path can have significant negative impact on transfer latency and transfer throughput. The “Threadrippers” on your list jump out as processors comprising multiple internal chiplets. You can look up the CPUs in their respective manufacturer databases or the third-party TechPowerUp database.

Copy performance between the host’s system memory and the GPU’s memory could be impacted by low system memory bandwidth. This is less likely on modern systems with DDR4 memory and multiple memory channels, but worth a sanity check, in particular if the system memory is also serving a high traffic load from software running on the host (that could be your application, or some other application), or traffic from other GPUs, or high-speed network controllers.

System memory should be constructed from the highest speed grade of DDR4 or DDR5 memory that the system supports, and all available memory channels should be populated. Typically there would be between 4 and 12 memory channels per socket in an HPC system, depending on processor generation and whether it was designed as a workstation or a server platform.

Ideally you should be using cudaMemcpyAsync, as that involves a simple DMA transfer between pinned host memory and GPU memory. By contrast, cudaMemcpy involves a pinned host memory buffer maintained by the CUDA driver, with DMA transfers between the GPU and this pinned buffer, and system memory copies between the buffer and your application’s data. This drives up system memory bandwidth usage (two transfers instead of one) and adds latency.
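As a minimal sketch, reusing the y[tid] / d_y2[tid] buffers from your code (which are already pinned via cudaMallocHost) and assuming a user-created stream:

cudaStream_t stream;
cudaStreamCreate(&stream);

// Because y[tid] is pinned, this is a direct DMA transfer into your buffer,
// with no intermediate staging copy by the driver.
cudaMemcpyAsync(y[tid], d_y2[tid], sizeof(REAL), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);   // block only when the host actually needs the value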

Thanks @njuffa for your response. I will dig a little more into the PCIe config, but I can confirm that the machines are all single-socket CPUs and were not running other jobs at the time of testing. I also repeat that the data in the table is the y-intercept when plotting execution time vs model size (plot below), so it really is just the latencies associated with the cudaMemcpy operations. The volume of data for the copies was 1 float. I assume this would be residing in cache, so I am not sure the RAM config would be having much effect. Also, the two worst machines in the list are the newest. They were not cheap. I’ll confirm the PCIe generation tomorrow, but I expect it is gen 5 with lots of lanes - but again it is the latencies that are hurting, not bandwidth. I was hoping NVIDIA might provide some form of low-level diagnostic tool that we could use to profile the system and give a performance report.


Greg.

When people are fiddling with WDDM performance, I often suggest trying both settings of Hardware-Accelerated GPU Scheduling (HAGS); see here for a recent example. I don’t know if it would have any effect here.

The low-level performance tool I would suggest for getting more insight would be Nsight Systems.

Understood, but the two are not entirely uncorrelated. A higher-throughput pipe accomplishes the transfer of a given amount of data in less time, and this reduction in latency will be particularly noticeable for large transfers. Conversely, high latency will reduce throughput when there are many small transfers.

Those would appear to be the Ryzen Threadripper PRO 5955WX and the AMD Ryzen 9 7950X? These are CPUs constructed from chiplets (with 8 cores each, I think), incur communication overhead for data transfers between them, and therefore have (usually mild) NUMA characteristics. They are like multi-socket systems within one physical package. I would suggest trying the Windows equivalent of numactl to configure processor and memory affinity, such that inter-chiplet communication is avoided for transfers between system memory and GPU.
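I am not deeply familiar with the Windows-side tooling, but as a rough sketch (the helper function and the node index below are illustrative; you would pick the NUMA node nearest the GPU), the Win32 API exposes this directly:

#include <windows.h>
#include <stdio.h>

// Rough sketch: bind the calling thread to the processors of one NUMA node so
// that the GPU-facing thread and its host buffers stay on the "near" chiplet.
// The node index passed in is an assumption; query the system topology to find
// the node closest to the GPU in question.
void bindToNumaNode(USHORT node)
{
    GROUP_AFFINITY affinity;

    if(!GetNumaNodeProcessorMaskEx(node, &affinity))
    {
        printf("GetNumaNodeProcessorMaskEx failed (%lu)\n", GetLastError());
        return;
    }

    if(!SetThreadGroupAffinity(GetCurrentThread(), &affinity, NULL))
        printf("SetThreadGroupAffinity failed (%lu)\n", GetLastError());
}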

Thank you both for your help. We’ve done some more profiling, with some of the machines tested with HAGS off and with HAGS on, and have found:


Turning HAGS on appears to significantly reduce the latency, but even so these results are still well behind what is achievable on Linux systems. Other than HAGS and the OS, there is no clear factor. With GPUs becoming so powerful, latencies that were almost invisible in the past are becoming very visible now. Food for thought with regard to code design.

Thanks for your feedback. Greg.

@Robert_Crovella @njuffa I’ve just run the same profiling on the bmt-ws2400 machine with Ubuntu installed, and the latencies are now down to around 90 ms for 10,000 steps of the there-and-back copy. So it looks like the Threadripper architecture is not that much of an issue. It appears to be something about Windows (and/or NVIDIA’s Windows GPU driver) and how it interacts with the architecture. I would be keen to hear from NVIDIA’s driver developers as to what might be causing this variability in latency on Windows machines.

Robert Crovella may be able to provide deeper insights. From a regular user perspective I would say the best way to “hear from NVIDIA’s driver developers” is to file a bug, pointing out the significant performance differential.

My cynical perspective as a user of both Linux and Windows is that there is likely little left to be done by the driver engineers. The WDDM driver model is designed to put the OS firmly in charge of the GPU to ensure stability of the GUI; the NVIDIA driver has to actively work around that to mitigate negative performance impacts this has. It seems unlikely that NVIDIA’s driver engineers have not maxed out possible workarounds over the past dozen years of WDDM’s existence.

NVIDIA’s TCC driver avoids these issues and in my observation delivers performance on Windows platforms that is similar to Linux platforms. It is, however, restricted to certain GPUs that one might want to collectively call the professional line. This is likely a marketing issue rather than an engineering issue.

I may be wrong in my understanding of the market, but as far as I know HPC activities of all kinds are largely focused on Linux platforms, and presumably this is where NVIDIA therefore also concentrates its HPC-related efforts.

With a TCC or Linux driver model, the CUDA compute subsystem can talk more-or-less directly to the GPU.

CUDA --> GPU Driver

With a WDDM driver model, things are different. The CUDA compute system, like every other consumer of a windows GPU, must talk to a Microsoft API (WDDM).

CUDA --> WDDM --> GPU Driver

This means WDDM is involved in work issuance to the device, and there are other ramifications, such as the fact that WDDM manages GPU memory via a virtual memory system unique to WDDM. WDDM is documented by Microsoft.

So, among other differences, it seems fairly self-evident to me that latencies could not possibly be “the same”. There are other differences as well, such as in memory management, work issuance order, and perhaps others. You can find forum threads on nearly all these topics, with a bit of searching.

While it’s certainly not an exhaustive description or list of differences, this TCC page points out that:

The driver … reduces the CUDA kernel launch overhead on Windows.

To get the best CUDA performance on windows, the TCC driver is recommended, with all that that implies. The TCC driver is not supported on every CUDA GPU however. If you want best CUDA performance on windows, choose a GPU that can be placed into TCC mode. If you must be in a WDDM environment, there are inevitable architectural considerations that prevent the WDDM driver from achieving the same performance as the TCC driver.

I recognize that probably your aim with this exercise is to measure or highlight latency. I’d like to agree with a statement you made already in this regard:

A larger GPU generally requires more (exposed, parallel) work to achieve the same level of efficiency (delivered throughput divided by peak theoretical throughput). This is not new; it has been true since day 1 of CUDA. Newer GPUs are not perfectly correlated with “larger”, but there is some correlation there; newer GPUs are, on average, getting larger. So from a code design perspective, it is a good practice to expose as much work as possible. If we look at the numbers you are posting, such as 54 milliseconds for 10,000 loops, we can quickly deduce that each loop contributes about 5 microseconds to the total. A single kernel launch, even on Linux, could easily have a 5 microsecond overhead or launch latency. Therefore we could surmise that the actual work you are issuing, per loop, is nearly zero for this test, or at any rate is not sufficient to be visible against a nominal latency expectation. This is a tiny amount of work, and generally would never have been a good idea on a GPU, for the reasons discussed here: latency becomes a dominating factor in performance.

So it is still the case, and has always been the case, that you must issue enough work to a GPU to make the latency a small overall factor, or your performance will probably be disappointing. It’s true on Linux, and it’s true on Windows. In the case of WDDM, the bar is at a different place, because of WDDM effects.

For repetitive workloads where latency is a significant or dominating factor, CUDA graphs are sometimes suggested; I don’t know if they would be applicable or useful here, and probably not specifically for the cudaMemcpy H<->D copies. At the moment, all your work is launched into the null stream. If you create a stream and pin the host memory, you should be able to issue those operations as cudaMemcpyAsync, which would be graph-able. A possible first step might be to investigate the effect of getting everything stream-able.

I’m not certain stream or graph refactoring would help; I haven’t studied your code in enough detail to reach that conclusion.
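Purely for illustration, a single-device version of your inner loop could be restructured roughly like this (a sketch, not a drop-in replacement; it omits the multi-GPU halo exchange and the OpenMP barriers):

cudaStream_t stream;
cudaStreamCreate(&stream);

for(int iStep = 0; iStep < nSteps; iStep++)
{
    avg1<<<nBlocks2D, nThreadsPerBlock2D, 0, stream>>>(nEdge, d_y1[tid], d_y2[tid]);

    // y[tid] is already pinned (cudaMallocHost), so these copies can be issued
    // asynchronously; stream ordering guarantees they run after the kernel.
    if((memCopy & 0x00000001) > 0)
        cudaMemcpyAsync(   y[tid], d_y2[tid], sizeof(REAL), cudaMemcpyDeviceToHost, stream);
    if((memCopy & 0x00000002) > 0)
        cudaMemcpyAsync(d_y1[tid],    y[tid], sizeof(REAL), cudaMemcpyHostToDevice, stream);

    swap(d_y1[tid], d_y2[tid]);
}

cudaStreamSynchronize(stream);

Once everything in the loop body is asynchronous and in a non-default stream, it could in principle be captured with cudaStreamBeginCapture / cudaStreamEndCapture and replayed with cudaGraphLaunch, but as noted above I have not verified that this would pay off here.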

@Robert_Crovella Thanks for another great reply. I did know that the WDDM model had greater latencies than the TCC or Linux driver, but I had been surprised that with some CPU architectures (and HAGS off) they could be 5-10x as much. Even within the results for WDDM, the range in latencies is quite large (in part due to some machines having HAGS on and some having HAGS off), hence the title for this post.

The results posted above have been from a testing code and method designed to measure latency only and remove compute performance and transfer bandwidth from the results.

When computing solutions to PDEs on grids or arbitrary meshes, it is essential to perform halo transfers between GPUs. When the halo transfers involve multiple components of data per cell, and the computation of each component in each cell involves all of the other components in that cell and its neighbours, then the halo copies become essentially mutually exclusive with the compute - there is no way to hide the latencies by overlapping them with kernel work.

With cudaMallocManaged it is possible to leave the memory transfers completely up to the driver - only moving blocks of memory as and when needed, and hopefully hiding the transfer time by switching between warps on the SMs while threads are waiting for memory to arrive. This works very well on Linux, but the last time I tested it on Windows I ran into problems with it.
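For reference, the managed-memory variant I have in mind looks roughly like this (a sketch with illustrative variable names; error checking omitted, and the prefetch calls are optional):

REAL *y1_m, *y2_m;

// Single allocations visible from both host and device; the driver migrates
// pages on demand instead of requiring explicit cudaMemcpy calls.
cudaMallocManaged(&y1_m, nEdge*nEdge*sizeof(REAL));
cudaMallocManaged(&y2_m, nEdge*nEdge*sizeof(REAL));

memset(y1_m, 0, nEdge*nEdge*sizeof(REAL));   // first touched on the host

// Optional: prefetch to the GPU so the first kernel does not fault page by page.
cudaMemPrefetchAsync(y1_m, nEdge*nEdge*sizeof(REAL), devIds[0], 0);
cudaMemPrefetchAsync(y2_m, nEdge*nEdge*sizeof(REAL), devIds[0], 0);

avg1<<<nBlocks2D, nThreadsPerBlock2D>>>(nEdge, y1_m, y2_m);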

I think we are just going to have to bite the bullet and produce a Linux build of our software for our power users.