cudaMemcpy2DAsync a lot slower than plain cudaMemcpy

Hi, I tried to use cudaMemcpy2DAsync for the HostToDevice memcpy, but it somehow made my program a lot slower. I’m already using cudaMemcpy2DAsync for the DeviceToHost memcpy to get the results back, and it runs just as fast as a normal cudaMemcpy. But when I add the async HostToDevice part at the beginning, the whole program is a lot slower.
Basically I’m reading 2 large, wide images (40960x4096, for example) and “cropping” them horizontally into smaller tiles, so that each iteration reads and processes one portion (4096x4096) of the images. “overlap” loads some extra data on the right side so the edge cases in the kernels come out correct. pixelMap has 2 values for each position, which is why its offsets and widths are multiplied by 2.
I timed the HostToDevice cudaMemcpy2DAsync and cudaMemcpy calls on their own, and the async version is about 20 ms slower. Did I do something wrong? Below is my cudaMemcpy code, followed by the Async code:

    const int numStreams = 3;
    cudaStream_t streams[numStreams];
    for ( int i = 0; i < numStreams; ++i )
    {
        cudaStreamCreate( &streams[i] );
    }
    int curStream = 0;

    auto pin1 = std::chrono::high_resolution_clock::now();

    cudaMemcpy( d_image1, image1.data, imageSize * sizeof( uint8_t ), cudaMemcpyHostToDevice );
    cudaMemcpy( d_image2, image2.data, imageSize * sizeof( uint8_t ), cudaMemcpyHostToDevice );
    cudaMemcpy( d_pixelMap, pixelMap, mapSize * sizeof( int ), cudaMemcpyHostToDevice);
    
    cudaDeviceSynchronize();

    auto pin2 = std::chrono::high_resolution_clock::now();

    for(int startX = 0; startX < imageWidth; startX += tileWidth )
    {
        int endX = min( startX + tileWidth, imageWidth );

        Kernel1<<< ..., streams[curStream] >>>( ..., startX, endX );

        Kernel2<<< ..., streams[curStream] >>>( ..., startX, endX );

        Kernel3<<< ..., streams[curStream] >>>( ..., startX, endX );

        ...;

        size_t widthInBytes = (endX - startX) * sizeof(uint8_t);
        cudaMemcpy2DAsync(resultImage1.data + startX, imageWidth * sizeof(uint8_t),
                          d_result1 + startX, imageWidth * sizeof(uint8_t),
                          widthInBytes, imageHeight,
                          cudaMemcpyDeviceToHost, streams[curStream]
        );

        cudaMemcpy2DAsync(resultImage2.data + startX, imageWidth * sizeof(uint8_t),
                          d_result2 + startX, imageWidth * sizeof(uint8_t),
                          widthInBytes, imageHeight,
                          cudaMemcpyDeviceToHost, streams[curStream]
        );
        
        curStream = ( curStream + 1 ) % numStreams;
    }

    for( int i = 0; i < numStreams; ++i)
    {
        cudaStreamSynchronize(streams[i]);
    }

Async HostToDevice:

    for(int startX = 0; startX < imageWidth; startX += tileWidth )
    {
        int endX = min( startX + tileWidth, imageWidth );
        int memcpyWidth = endX + overlap > imageWidth ? imageWidth - (startX + overlap) : tileWidth;
        int memcpyStart = startX + overlap;
        // for the first tile, start at 0 and copy extra memory for the overlap part
        if(startX == 0)
        {
            memcpyWidth += overlap;
            memcpyStart = 0;
        }

        cudaMemcpy2DAsync( d_image1 + memcpyStart, imageWidth * sizeof(uint8_t),
                        image1.data + memcpyStart, imageWidth * sizeof(uint8_t), 
                        memcpyWidth * sizeof(uint8_t), imageHeight, 
                        cudaMemcpyHostToDevice, streams[curStream] );
        cudaMemcpy2DAsync( d_image2 + memcpyStart, imageWidth * sizeof(uint8_t), 
                        image2.data + memcpyStart, imageWidth * sizeof(uint8_t),
                        memcpyWidth * sizeof(uint8_t), imageHeight, 
                        cudaMemcpyHostToDevice, streams[curStream] );
        cudaMemcpy2DAsync( d_pixelMap + memcpyStart * 2, imageWidth * 2 * sizeof(int),
                        pixelMap + memcpyStart * 2, imageWidth * 2 * sizeof(int),
                        memcpyWidth * 2 * sizeof( int ), imageHeight,
                        cudaMemcpyHostToDevice, streams[curStream] );

        Kernel1<<< ..., streams[curStream] >>>( ..., startX, endX );

        Kernel2<<< ..., streams[curStream] >>>( ..., startX, endX );

        Kernel3<<< ..., streams[curStream] >>>( ..., startX, endX );

        ...;

        size_t widthInBytes = (endX - startX) * sizeof(uint8_t);
        cudaMemcpy2DAsync(resultImage1.data + startX, imageWidth * sizeof(uint8_t),
                          d_result1 + startX, imageWidth * sizeof(uint8_t),
                          widthInBytes, imageHeight,
                          cudaMemcpyDeviceToHost, streams[curStream]
        );

        cudaMemcpy2DAsync(resultImage2.data + startX, imageWidth * sizeof(uint8_t),
                          d_result2 + startX, imageWidth * sizeof(uint8_t),
                          widthInBytes, imageHeight,
                          cudaMemcpyDeviceToHost, streams[curStream]
        );
        
        curStream = ( curStream + 1 ) % numStreams;
    }
    
    for( int i = 0; i < numStreams; ++i)
    {
        cudaStreamSynchronize(streams[i]);
    }

For the same amount of data moved, a cudaMemcpy2D operation will generally be slower than a cudaMemcpy operation. The exact amount will vary based on the specific dimensions, but it could easily be 2x slower.

A profiler (Nsight Systems, with the GUI timeline) will likely aid your understanding of what exactly is contributing to the slowdown.
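For example, something along these lines (the executable name is just a placeholder) captures a timeline report you can then open in the GUI:

    # capture a timeline report of your program for the Nsight Systems GUI
    # nsys profile -o report ./your_app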

Thank you for your reply! But the cudaMemcpy2DAsync DeviceToHost copies at the end seem to be as fast as cudaMemcpy if I do them outside the for loop after synchronizing the streams (kernels + HostToDevice memcpy: 70 ms total with 2DAsync, versus 53 ms + 16 ms when separated). Their dimensions are basically the same as the HostToDevice ones I added. I guess I need to look into profilers to figure out the exact reasons, but are there any general reasons why the cudaMemcpy2DAsync HostToDevice copies are so much slower?

It seems that for my particular test case, in the pinned and strided configuration, the time is unusually long for the D->H transfer. I could not explain that, nor did I see anything corresponding to a long H->D transfer:

# cat t249.cu
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start=0){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

const int h = 16384;
const int w = 4096;
const int s = h*w;

int main(){

  int *d, *hd;
#ifndef USE_PINNED
  hd = new int[s];
#else
  cudaHostAlloc(&hd, s*sizeof(hd[0]), cudaHostAllocDefault);
#endif
  cudaMalloc(&d, s*sizeof(d[0]));
  // monolithic h->d copy of s/1024 elements
  cudaMemcpy(d, hd, (s/1024)*sizeof(d[0]),cudaMemcpyHostToDevice);// warm-up
  unsigned long long dt = dtime_usec(0);
  cudaMemcpy(d, hd, (s/1024)*sizeof(d[0]),cudaMemcpyHostToDevice);
  dt = dtime_usec(dt);
  std::cout << "monolithic H->D: " << dt/(float)USECPSEC << "s" << std::endl;
  // monolithic d->h copy of s/1024 elements
  cudaMemcpy(hd, d, (s/1024)*sizeof(d[0]),cudaMemcpyDeviceToHost); // warm-up
  dt = dtime_usec(0);
  cudaMemcpy(hd, d, (s/1024)*sizeof(d[0]),cudaMemcpyDeviceToHost);
  dt = dtime_usec(dt);
  std::cout << "monolithic D->H: " << dt/(float)USECPSEC << "s" << std::endl;
  // strided h->d
  cudaMemcpy2DAsync(d, w*sizeof(hd[0]),  hd, w*sizeof(d[0]),  (w/1024)*sizeof(d[0]), h,cudaMemcpyHostToDevice); // warm-up
  cudaDeviceSynchronize();
  dt = dtime_usec(0);
  cudaMemcpy2DAsync(d, w*sizeof(hd[0]),  hd, w*sizeof(d[0]),  (w/1024)*sizeof(d[0]), h,cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();
  dt = dtime_usec(dt);
  std::cout << "strided H->D: " << dt/(float)USECPSEC << "s" << std::endl;
  // strided d->h
  cudaMemcpy2DAsync(hd, w*sizeof(hd[0]),  d, w*sizeof(d[0]),  (w/1024)*sizeof(d[0]), h,cudaMemcpyDeviceToHost); // warm-up
  cudaDeviceSynchronize();
  dt = dtime_usec(0);
  cudaMemcpy2DAsync(hd, w*sizeof(hd[0]),  d, w*sizeof(d[0]),  (w/1024)*sizeof(d[0]), h,cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();
  dt = dtime_usec(dt);
  std::cout << "strided D->H: " << dt/(float)USECPSEC << "s" << std::endl;
}

# nvcc -o t249 t249.cu
# ./t249
monolithic H->D: 6.1e-05s
monolithic D->H: 9e-05s
strided H->D: 0.000342s
strided D->H: 0.000592s
# nvcc -o t249 t249.cu -DUSE_PINNED
# ./t249
monolithic H->D: 3.9e-05s
monolithic D->H: 2.7e-05s
strided H->D: 0.000206s
strided D->H: 0.006365s
#

I generally expect pinned transfers to be faster than non-pinned, and that is true above except in the strided D->H case (which I can’t explain). I also generally expect certain strided transfers to be noticeably slower than equivalent-size non-strided transfers, and that is true above.
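
As an aside: if the host buffers in your tiled loop (image1.data, image2.data, and the result images) are ordinary pageable allocations, the async copies will be staged through the pageable path. One way to try the pinned path without changing how those buffers are allocated is cudaHostRegister. A minimal sketch, reusing the buffer and size names from your post (error checking omitted):

    // Pin the existing host allocations so the copies can use the
    // page-locked path, then unregister them when the work is done.
    // (Sketch only -- sizes and names taken from the original post.)
    cudaHostRegister(image1.data,       imageSize * sizeof(uint8_t), cudaHostRegisterDefault);
    cudaHostRegister(image2.data,       imageSize * sizeof(uint8_t), cudaHostRegisterDefault);
    cudaHostRegister(resultImage1.data, imageSize * sizeof(uint8_t), cudaHostRegisterDefault);
    cudaHostRegister(resultImage2.data, imageSize * sizeof(uint8_t), cudaHostRegisterDefault);

    // ... tiled H->D copies, kernels, D->H copies ...

    cudaHostUnregister(image1.data);
    cudaHostUnregister(image2.data);
    cudaHostUnregister(resultImage1.data);
    cudaHostUnregister(resultImage2.data);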

Hmm, ok, I see. Thank you for your time and explanation!

Unless the 2D copies transfer a single contiguous block (that is, the width is equal to the stride), they will likely be slower than a 1D copy. A 2D copy is a convenient way of performing strided copies, but what the DMA engine needs to do under the hood is split it into #rows copies of contiguous 1D blocks across PCIe, which incurs additional overhead for each block copied.

What the 2D copy API saves versus a discrete equivalent (a programmed loop performing the row copies) is software overhead. The hardware overhead doesn’t go away.
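
To make that concrete, the rough discrete equivalent of one of the strided D->H copies in your code is the loop below (a sketch only, reusing the variable names from your post). Each row becomes its own contiguous transfer, and that per-row cost is paid whether the loop lives in your code or inside the 2D copy call:

    // rough discrete equivalent of one cudaMemcpy2DAsync D->H call:
    // one contiguous copy per row of the tile. The 2D API saves this
    // host-side loop, but the copy engine still moves the data row by row.
    for (int row = 0; row < imageHeight; ++row)
    {
        cudaMemcpyAsync(resultImage1.data + row * imageWidth + startX,
                        d_result1         + row * imageWidth + startX,
                        widthInBytes,
                        cudaMemcpyDeviceToHost, streams[curStream]);
    }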
