Slow cudaMemcpy execution (tested on GTX480 and GT240)

I’ve run the bandwidth test from the SDK. The GTX480 gives about 5200 MB/s and the GT240 about 5400 MB/s (pageable).

This is fine.

The problem appears when I need to do cudaMemcpy in a loop: the performance degrades rapidly, and after the first few iterations it settles at around 300 MB/s.

I do need the loop, as I’m doing signal processing on very large chunks of data.

I’ve also tried pinned memory, and it shows a similar issue. In addition, pinned memory has size limitations, so it is not really suitable for my purpose.

Here’s a simple program that I wrote specifically to demonstrate the issue:

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

float elapsed_time(struct timeval *start);

int main()
{
  int i;
  int nloop = 16;
  struct timeval thetime;
  float copyTime = 0;
  int size = 256 * 1024 * 1024;
  unsigned char *data;
  unsigned char *cudaData;

  data = (unsigned char *)malloc( size * sizeof(unsigned char) );
  //cudaMallocHost( (void **)&data, size * sizeof(unsigned char) );
  for( i = 0; i < size; i++ )
  {
    data[i] = (unsigned char)((i+3) % 256);
  }

  cudaMalloc( (void **)&cudaData, size * sizeof(unsigned char) );

  printf( "Number of loops: %d\n", nloop );
  for( i = 0; i < nloop; i++ )
  {
    gettimeofday(&thetime, NULL);
    cudaMemcpy( &cudaData[i*size/nloop], &data[i*size/nloop],
        size / nloop * sizeof(unsigned char), cudaMemcpyHostToDevice );
    copyTime += elapsed_time(&thetime);
    printf( "Memory size %d MB, copy time: %f (ms)\n",
        size / nloop / 1024 / 1024, copyTime );
  }

  cudaFree(cudaData);
  free(data);
  //cudaFreeHost(data);

  return 0;
}

/* returns the elapsed wall-clock time, in ms, since start (without resetting start) */
float elapsed_time(struct timeval *start){
    struct timeval now;
    gettimeofday(&now,NULL);
    return 1.e3f*(float)(now.tv_sec-start->tv_sec) +
        1.e-3f*(float)(now.tv_usec-start->tv_usec);
}

The output I’m getting is:

Number of loops: 16
Memory size 16 MB, copy time: 3.124000 (ms)
Memory size 16 MB, copy time: 6.202000 (ms)
Memory size 16 MB, copy time: 9.216001 (ms)
Memory size 16 MB, copy time: 12.234001 (ms)
Memory size 16 MB, copy time: 15.231001 (ms)
Memory size 16 MB, copy time: 18.229000 (ms)
Memory size 16 MB, copy time: 21.246000 (ms)
Memory size 16 MB, copy time: 24.303001 (ms)
Memory size 16 MB, copy time: 24.381001 (ms)
Memory size 16 MB, copy time: 24.387001 (ms)
Memory size 16 MB, copy time: 24.392000 (ms)
Memory size 16 MB, copy time: 24.396999 (ms)
Memory size 16 MB, copy time: 24.400999 (ms)
Memory size 16 MB, copy time: 24.404999 (ms)
Memory size 16 MB, copy time: 24.408998 (ms)
Memory size 16 MB, copy time: 24.413998 (ms)

Is this normal? How can it be improved?

Did you try with pinned memory on the host?

Yes, as I mentioned in my original post. The results are similar, just that the overall speed is a bit higher.

I’ve even left the cudaMallocHost call that I tried commented out in the code.
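For reference, this is roughly what the pinned-memory variant looks like. It is only a minimal sketch based on the commented-out cudaMallocHost/cudaFreeHost lines in the test program above; the error check is added here purely for illustration:

/* Sketch of the pinned-memory variant of the test, based on the
 * commented-out cudaMallocHost/cudaFreeHost lines above.
 * The error check is added for illustration only. */
unsigned char *data = NULL;
cudaError_t err = cudaMallocHost( (void **)&data, size );   /* page-locked host buffer */
if( err != cudaSuccess )
{
  fprintf( stderr, "cudaMallocHost failed: %s\n", cudaGetErrorString(err) );
  return 1;
}
/* ... fill data and run the same timed cudaMemcpy loop as before ... */
cudaFreeHost(data);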

Hi,

I might be wrong, but I guess that the line

copyTime += elapsed_time(&thetime);

means that you are actually printing the cumulative copy time. What you should rather look at is

copyTime = elapsed_time(&thetime);

It should then show a roughly constant 5200 MB/s bandwidth up to 128 MB, followed by some very suspicious times, probably corresponding to a failure in the data transfer…
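Concretely, the timed part of the loop would look something like this. This is just a sketch reusing the variables from your test program; the bandwidth printout is added for illustration, and the offsets are computed in size_t to stay safe with large buffers:

/* Sketch of per-iteration timing (not cumulative). */
size_t chunk = (size_t)size / nloop;            /* 16 MB per iteration */
gettimeofday(&thetime, NULL);
cudaMemcpy( cudaData + (size_t)i * chunk, data + (size_t)i * chunk,
    chunk, cudaMemcpyHostToDevice );
copyTime = elapsed_time(&thetime);              /* per-copy time, not += */
printf( "Chunk %d: %d MB in %f ms (%.1f MB/s)\n", i,
    (int)(chunk / 1024 / 1024), copyTime,
    (chunk / 1024.0 / 1024.0) / (copyTime / 1000.0) );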

You’re right, damn bug. So this example doesn’t actually demonstrate the issue.

Thanks anyway.

I’ll post the part of the code that has the problem.

int unpackDigitisedDataToGPU(int nchan, int ninp, int windowBlocks, int nbatch,
    int bits_per_samp, unsigned char *digitised_data, float *cuda_inp_buf,
    int debug, int wordtype)
{
  int i;
  static int init = 0, ntoread = 0;
  static unsigned char *cudaBuffer;
  //static unsigned char *buffer;
  static int numThreads = 128;

  struct timeval starttime;
  gettimeofday( &starttime, NULL );

  if( init == 0 )
  {
    ntoread = ninp * nchan * 2 * nbatch * bits_per_samp / 8;
    init = 1;
    cudaMalloc( (void **)&cudaBuffer, ntoread );
    //buffer = (unsigned char *)malloc(ntoread);
    if( debug )
      fprintf( stderr, "size of read buffer: %d bytes\n", ntoread );
  }

  struct timeval thetime;
  float cudaCopyTime=0, totalTime=0, unpackTime=0;

  //memcpy( buffer, digitised_data, ntoread );

  gettimeofday( &thetime, NULL );
  cudaMemcpy( cudaBuffer, digitised_data, ntoread, cudaMemcpyHostToDevice );
  cudaCopyTime += elapsed_time(&thetime);

  /* Copy the last (windowBlocks-1) chunks to the beginning for each stream. */
  for( i = 0; i < ninp; i++ )
    cudaMemcpy( &cuda_inp_buf[i * (nbatch+windowBlocks-1) * nchan * 2],
        &cuda_inp_buf[i*(nbatch+windowBlocks-1)*nchan*2 + (nbatch)*nchan*2],
        (windowBlocks-1) * nchan * 2 * sizeof(float), cudaMemcpyDeviceToDevice );

  /* Thread number should be multiple of 32 for best efficiency */
  /* Assume nchan to be power of 2 and larger than numThreads */
  dim3 threads( numThreads, 1, 1 );
  dim3 blocks( nchan * 2 / numThreads, nbatch, ninp );

  /* cuda_inp_buf needs to be offset by (windowBlocks-1) chunks due to the algorithm design */
  gettimeofday( &thetime, NULL );
  if( wordtype == 0 )
    unpackUnsignedData_kernel<<< blocks, threads >>>(cudaBuffer, &cuda_inp_buf[(windowBlocks-1) * nchan * 2]);
  /* FIXME: Not sure about the correctness of signed data unpacking */
  else if( wordtype == 1 )
    unpackSignedData_kernel<<< blocks, threads >>>(cudaBuffer, &cuda_inp_buf[(windowBlocks-1) * nchan * 2]);

  cudaThreadSynchronize();
  unpackTime += elapsed_time(&thetime);
  totalTime += elapsed_time(&starttime);

  fprintf( stderr, "cudaMemcpy time: %g, size: %d MB\n", cudaCopyTime, ntoread / 1024 / 1024 );

  return 0;
}

And the output looks like:

cudaMemcpy time: 0.787, size: 4 MB
cudaMemcpy time: 22.609, size: 4 MB
cudaMemcpy time: 22.852, size: 4 MB
cudaMemcpy time: 22.861, size: 4 MB
cudaMemcpy time: 22.854, size: 4 MB
cudaMemcpy time: 22.838, size: 4 MB
cudaMemcpy time: 22.832, size: 4 MB
cudaMemcpy time: 22.8329, size: 4 MB
cudaMemcpy time: 22.841, size: 4 MB
cudaMemcpy time: 22.849, size: 4 MB
cudaMemcpy time: 22.857, size: 4 MB
cudaMemcpy time: 22.833, size: 4 MB
cudaMemcpy time: 22.841, size: 4 MB
cudaMemcpy time: 22.826, size: 4 MB
cudaMemcpy time: 22.848, size: 4 MB
cudaMemcpy time: 22.839, size: 4 MB
cudaMemcpy time: 22.848, size: 4 MB
cudaMemcpy time: 22.85, size: 4 MB
cudaMemcpy time: 22.829, size: 4 MB
cudaMemcpy time: 22.843, size: 4 MB
cudaMemcpy time: 22.838, size: 4 MB
cudaMemcpy time: 22.853, size: 4 MB
cudaMemcpy time: 22.833, size: 4 MB
cudaMemcpy time: 22.827, size: 4 MB
cudaMemcpy time: 22.838, size: 4 MB
cudaMemcpy time: 22.83, size: 4 MB
cudaMemcpy time: 22.828, size: 4 MB
cudaMemcpy time: 22.828, size: 4 MB
cudaMemcpy time: 22.846, size: 4 MB
cudaMemcpy time: 22.834, size: 4 MB
cudaMemcpy time: 22.798, size: 4 MB
cudaMemcpy time: 22.784, size: 4 MB
cudaMemcpy time: 22.76, size: 4 MB
cudaMemcpy time: 22.77, size: 4 MB
cudaMemcpy time: 22.779, size: 4 MB
cudaMemcpy time: 22.78, size: 4 MB
cudaMemcpy time: 22.775, size: 4 MB
cudaMemcpy time: 22.778, size: 4 MB
cudaMemcpy time: 22.783, size: 4 MB
cudaMemcpy time: 22.823, size: 4 MB
cudaMemcpy time: 22.805, size: 4 MB
cudaMemcpy time: 22.786, size: 4 MB
cudaMemcpy time: 22.788, size: 4 MB
cudaMemcpy time: 22.787, size: 4 MB
cudaMemcpy time: 22.78, size: 4 MB
cudaMemcpy time: 22.791, size: 4 MB
cudaMemcpy time: 22.779, size: 4 MB
cudaMemcpy time: 22.766, size: 4 MB
cudaMemcpy time: 22.775, size: 4 MB
cudaMemcpy time: 22.833, size: 4 MB
cudaMemcpy time: 22.786, size: 4 MB
cudaMemcpy time: 22.779, size: 4 MB
cudaMemcpy time: 22.779, size: 4 MB
cudaMemcpy time: 22.776, size: 4 MB
cudaMemcpy time: 22.788, size: 4 MB
cudaMemcpy time: 22.767, size: 4 MB
cudaMemcpy time: 22.772, size: 4 MB
cudaMemcpy time: 22.762, size: 4 MB
cudaMemcpy time: 22.771, size: 4 MB
cudaMemcpy time: 22.83, size: 4 MB
cudaMemcpy time: 22.785, size: 4 MB
cudaMemcpy time: 22.768, size: 4 MB
cudaMemcpy time: 22.781, size: 4 MB
cudaMemcpy time: 22.761, size: 4 MB
cudaMemcpy time: 22.772, size: 4 MB
cudaMemcpy time: 22.764, size: 4 MB
cudaMemcpy time: 22.822, size: 4 MB
cudaMemcpy time: 22.778, size: 4 MB
cudaMemcpy time: 22.79, size: 4 MB
cudaMemcpy time: 22.814, size: 4 MB
cudaMemcpy time: 22.795, size: 4 MB
cudaMemcpy time: 22.768, size: 4 MB
cudaMemcpy time: 22.78, size: 4 MB
cudaMemcpy time: 22.776, size: 4 MB
cudaMemcpy time: 22.771, size: 4 MB
cudaMemcpy time: 22.776, size: 4 MB
cudaMemcpy time: 22.803, size: 4 MB
cudaMemcpy time: 22.783, size: 4 MB
cudaMemcpy time: 22.769, size: 4 MB
cudaMemcpy time: 22.801, size: 4 MB
cudaMemcpy time: 22.78, size: 4 MB
cudaMemcpy time: 22.7659, size: 4 MB
cudaMemcpy time: 22.812, size: 4 MB
cudaMemcpy time: 22.827, size: 4 MB
cudaMemcpy time: 22.794, size: 4 MB
cudaMemcpy time: 22.82, size: 4 MB
cudaMemcpy time: 22.826, size: 4 MB
cudaMemcpy time: 22.791, size: 4 MB
cudaMemcpy time: 22.811, size: 4 MB
cudaMemcpy time: 22.842, size: 4 MB
cudaMemcpy time: 22.796, size: 4 MB
cudaMemcpy time: 22.798, size: 4 MB
cudaMemcpy time: 22.787, size: 4 MB
cudaMemcpy time: 22.789, size: 4 MB
cudaMemcpy time: 22.81, size: 4 MB
cudaMemcpy time: 22.773, size: 4 MB
cudaMemcpy time: 22.822, size: 4 MB
cudaMemcpy time: 22.786, size: 4 MB
cudaMemcpy time: 22.81, size: 4 MB
cudaMemcpy time: 22.834, size: 4 MB
cudaMemcpy time: 22.796, size: 4 MB
cudaMemcpy time: 22.776, size: 4 MB
cudaMemcpy time: 22.776, size: 4 MB
cudaMemcpy time: 22.801, size: 4 MB
cudaMemcpy time: 22.783, size: 4 MB

And no, I didn’t use the += operator for the cudaMemcpy time this time.

Or possibly an overflow in the float type?

EDIT: Sorry, I think I’m talking nonsense! Ignore that!

Since the example didn’t demonstrate the issue, I went looking elsewhere for the cause of the problem.
It turns out there’s a kernel that is slowing down the whole program. Kernel launches are asynchronous, so the next cudaMemcpy was effectively waiting for that kernel to finish and only appeared slow. I had to add a cudaThreadSynchronize() before taking the timing to get accurate numbers.
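For anyone hitting the same thing, this is roughly the timing pattern that made the kernel cost visible. It is only a sketch with hypothetical kernel and buffer names (someKernel, devIn, devOut, hostBuf, nbytes), reusing the elapsed_time helper from earlier; cudaThreadSynchronize() was the call available in that CUDA version, newer releases use cudaDeviceSynchronize():

/* Sketch: time the kernel and the copy separately. Without the explicit
 * synchronize, the kernel's cost gets charged to whatever blocking call
 * comes next (here, the cudaMemcpy). Names are placeholders. */
float kernelTime, copyTime;
struct timeval thetime;

gettimeofday( &thetime, NULL );
someKernel<<< blocks, threads >>>( devIn, devOut );   /* hypothetical kernel */
cudaThreadSynchronize();            /* wait for the kernel before reading the clock */
kernelTime = elapsed_time(&thetime);

gettimeofday( &thetime, NULL );
cudaMemcpy( devIn, hostBuf, nbytes, cudaMemcpyHostToDevice );
copyTime = elapsed_time(&thetime);  /* now measures only the copy, not the pending kernel */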