Inconsistent cudaMemcpy Timing: cudaMemcpy and kernel timing hiccups at 1-second intervals

I’m using cudaMemcpy to transfer images to/from the GPU for a real-time image processing application. The images can be 2MB or 4MB per image. The problem I’m having is that cudaMemcpy nominally takes about 1ms for the 2MB image, but every once in a while it takes significantly longer (~15ms).

I’m running on Linux 64-bit (Fedora 13) with a Tesla S1070 and a Quadro FX 5800 with the latest driver (256.53). The intermittent timing persists when using either of the devices.

I wrote a memcpyTimeTest application that simply copies 2MB of data from the device to host over and over again to make sure that it wasn’t any of my application code causing the problems:

[codebox]#include <cuda_runtime_api.h>

#include <stdio.h>

#include <stdlib.h>

// Writes each timing to "CudaMemcpyTimes.csv" (a header row followed by one
// value per line) and prints the average and maximum timing to stdout.
//
//   elapsedTimes_ms  array of n elapsed times, in milliseconds
//   n                number of entries in elapsedTimes_ms
//
// If the CSV file cannot be opened, a warning goes to stderr and only the
// stdout summary is produced.
void printElapsedTimes( float *elapsedTimes_ms, int n )
{
    float maxET = 0;
    float avgET = 0;

    FILE *fd = fopen("CudaMemcpyTimes.csv", "wt");
    if (fd == NULL)
        fprintf(stderr, "Warning: could not open CudaMemcpyTimes.csv for writing\n");
    else
        fprintf(fd, "cudaMemcpy_dtoh (ms)\n");

    for (int i = 0; i < n; ++i)
    {
        if (fd != NULL)
            fprintf(fd, "%0.2f\n", elapsedTimes_ms[i]);

        // Each sample contributes 1/n of its value, so avgET ends as the mean.
        avgET += elapsedTimes_ms[i] / n;

        if (elapsedTimes_ms[i] > maxET)
            maxET = elapsedTimes_ms[i];
    }

    if (fd != NULL)
        fclose(fd);

    printf("Avg ElapsedTime = %0.2fms\n", avgET);
    printf("Max ElapsedTime = %0.2fms\n", maxET);
}

// Prints a diagnostic and terminates the program if a CUDA runtime call did
// not return cudaSuccess. `what` names the failing call in the message.
static void checkCuda( cudaError_t status, const char *what )
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Times NUM_ITERATIONS blocking device-to-host copies of memSize bytes using
// CUDA events recorded on stream 0, then hands the per-copy timings to
// printElapsedTimes(). Any failing CUDA call aborts the program.
void myMemcpyTimeTest()
{
    unsigned char *hostMem;
    unsigned char *devMem;

    const int memSize = 2 << 20;            // 2 * 2^20 bytes = 2 MB
    const int NUM_ITERATIONS = 20000;

    float *elapsedTimes_ms = new float [NUM_ITERATIONS];
    float elapsedTime_ms;

    cudaEvent_t start, stop;
    checkCuda( cudaEventCreate( &start ), "cudaEventCreate(start)" );
    checkCuda( cudaEventCreate( &stop ), "cudaEventCreate(stop)" );

    checkCuda( cudaHostAlloc( (void**)&hostMem, memSize, 0 ), "cudaHostAlloc" );
    checkCuda( cudaMalloc( (void**)&devMem, memSize ), "cudaMalloc" );

    printf("Running myMemcpyTimeTest (%d MB chunks)\n", memSize >> 20);

    for (int i = 0; i < NUM_ITERATIONS; ++i)
    {
        checkCuda( cudaEventRecord( start, 0 ), "cudaEventRecord(start)" );
        checkCuda( cudaMemcpy( hostMem, devMem, memSize, cudaMemcpyDeviceToHost ),
                   "cudaMemcpy" );
        checkCuda( cudaEventRecord( stop, 0 ), "cudaEventRecord(stop)" );

        // Wait for the stop event to complete before reading the elapsed
        // time; this replaces the deprecated cudaThreadSynchronize().
        checkCuda( cudaEventSynchronize( stop ), "cudaEventSynchronize" );
        checkCuda( cudaEventElapsedTime( &elapsedTime_ms, start, stop ),
                   "cudaEventElapsedTime" );

        elapsedTimes_ms[i] = elapsedTime_ms;
    }

    // clean up CUDA resources
    checkCuda( cudaEventDestroy( stop ), "cudaEventDestroy(stop)" );
    checkCuda( cudaEventDestroy( start ), "cudaEventDestroy(start)" );
    checkCuda( cudaFreeHost( hostMem ), "cudaFreeHost" );
    checkCuda( cudaFree( devMem ), "cudaFree" );

    printElapsedTimes( elapsedTimes_ms, NUM_ITERATIONS );

    // Allocated with new[], so it must be released with delete[].
    delete [] elapsedTimes_ms;
}

// Entry point. An optional first command-line argument is parsed with atoi()
// and used as the CUDA device index; with no argument, no device is selected
// explicitly. Returns 0 after the timing test has run, or 1 if the requested
// device could not be selected.
int main(int argc, char** argv)
{
    if (argc > 1)
    {
        int gpuDev = atoi(argv[1]);

        // Report a failed selection instead of announcing a device that the
        // runtime rejected.
        cudaError_t status = cudaSetDevice(gpuDev);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "cudaSetDevice(%d) failed: %s\n",
                    gpuDev, cudaGetErrorString(status));
            return 1;
        }

        printf("Using GPU device %d\n", gpuDev);
    }

    myMemcpyTimeTest();

    return 0;
}

[/codebox]

One thing I’ve determined is that the timing “hiccups” occur at approximately one second intervals, meaning there is one cudaMemcpy() transfer that takes ~15ms every second whereas all of the other copies take 1ms.

I’ve reproduced the problem on a Windows XP 32-bit machine with a Quadro FX 3800; however, there the timing hiccups occur every half second instead of every second.

I have also seen these timing hiccups with my CUDA kernels.

What could possibly be running with a one second interval and how can I disable it?

Thanks.

I’m using cudaMemcpy to transfer images to/from the GPU for a real-time image processing application. The images can be 2MB or 4MB per image. The problem I’m having is that cudaMemcpy nominally takes about 1ms for the 2MB image, but every once in a while it takes significantly longer (~15ms).

I’m running on Linux 64-bit (Fedora 13) with a Tesla S1070 and a Quadro FX 5800 with the latest driver (256.53). The intermittent timing persists when using either of the devices.

I wrote a memcpyTimeTest application that simply copies 2MB of data from the device to host over and over again to make sure that it wasn’t any of my application code causing the problems:

[codebox]#include <cuda_runtime_api.h>

#include <stdio.h>

#include <stdlib.h>

// Writes each timing to "CudaMemcpyTimes.csv" (a header row followed by one
// value per line) and prints the average and maximum timing to stdout.
//
//   elapsedTimes_ms  array of n elapsed times, in milliseconds
//   n                number of entries in elapsedTimes_ms
//
// If the CSV file cannot be opened, a warning goes to stderr and only the
// stdout summary is produced.
void printElapsedTimes( float *elapsedTimes_ms, int n )
{
    float maxET = 0;
    float avgET = 0;

    FILE *fd = fopen("CudaMemcpyTimes.csv", "wt");
    if (fd == NULL)
        fprintf(stderr, "Warning: could not open CudaMemcpyTimes.csv for writing\n");
    else
        fprintf(fd, "cudaMemcpy_dtoh (ms)\n");

    for (int i = 0; i < n; ++i)
    {
        if (fd != NULL)
            fprintf(fd, "%0.2f\n", elapsedTimes_ms[i]);

        // Each sample contributes 1/n of its value, so avgET ends as the mean.
        avgET += elapsedTimes_ms[i] / n;

        if (elapsedTimes_ms[i] > maxET)
            maxET = elapsedTimes_ms[i];
    }

    if (fd != NULL)
        fclose(fd);

    printf("Avg ElapsedTime = %0.2fms\n", avgET);
    printf("Max ElapsedTime = %0.2fms\n", maxET);
}

// Prints a diagnostic and terminates the program if a CUDA runtime call did
// not return cudaSuccess. `what` names the failing call in the message.
static void checkCuda( cudaError_t status, const char *what )
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Times NUM_ITERATIONS blocking device-to-host copies of memSize bytes using
// CUDA events recorded on stream 0, then hands the per-copy timings to
// printElapsedTimes(). Any failing CUDA call aborts the program.
void myMemcpyTimeTest()
{
    unsigned char *hostMem;
    unsigned char *devMem;

    const int memSize = 2 << 20;            // 2 * 2^20 bytes = 2 MB
    const int NUM_ITERATIONS = 20000;

    float *elapsedTimes_ms = new float [NUM_ITERATIONS];
    float elapsedTime_ms;

    cudaEvent_t start, stop;
    checkCuda( cudaEventCreate( &start ), "cudaEventCreate(start)" );
    checkCuda( cudaEventCreate( &stop ), "cudaEventCreate(stop)" );

    checkCuda( cudaHostAlloc( (void**)&hostMem, memSize, 0 ), "cudaHostAlloc" );
    checkCuda( cudaMalloc( (void**)&devMem, memSize ), "cudaMalloc" );

    printf("Running myMemcpyTimeTest (%d MB chunks)\n", memSize >> 20);

    for (int i = 0; i < NUM_ITERATIONS; ++i)
    {
        checkCuda( cudaEventRecord( start, 0 ), "cudaEventRecord(start)" );
        checkCuda( cudaMemcpy( hostMem, devMem, memSize, cudaMemcpyDeviceToHost ),
                   "cudaMemcpy" );
        checkCuda( cudaEventRecord( stop, 0 ), "cudaEventRecord(stop)" );

        // Wait for the stop event to complete before reading the elapsed
        // time; this replaces the deprecated cudaThreadSynchronize().
        checkCuda( cudaEventSynchronize( stop ), "cudaEventSynchronize" );
        checkCuda( cudaEventElapsedTime( &elapsedTime_ms, start, stop ),
                   "cudaEventElapsedTime" );

        elapsedTimes_ms[i] = elapsedTime_ms;
    }

    // clean up CUDA resources
    checkCuda( cudaEventDestroy( stop ), "cudaEventDestroy(stop)" );
    checkCuda( cudaEventDestroy( start ), "cudaEventDestroy(start)" );
    checkCuda( cudaFreeHost( hostMem ), "cudaFreeHost" );
    checkCuda( cudaFree( devMem ), "cudaFree" );

    printElapsedTimes( elapsedTimes_ms, NUM_ITERATIONS );

    // Allocated with new[], so it must be released with delete[].
    delete [] elapsedTimes_ms;
}

// Entry point. An optional first command-line argument is parsed with atoi()
// and used as the CUDA device index; with no argument, no device is selected
// explicitly. Returns 0 after the timing test has run, or 1 if the requested
// device could not be selected.
int main(int argc, char** argv)
{
    if (argc > 1)
    {
        int gpuDev = atoi(argv[1]);

        // Report a failed selection instead of announcing a device that the
        // runtime rejected.
        cudaError_t status = cudaSetDevice(gpuDev);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "cudaSetDevice(%d) failed: %s\n",
                    gpuDev, cudaGetErrorString(status));
            return 1;
        }

        printf("Using GPU device %d\n", gpuDev);
    }

    myMemcpyTimeTest();

    return 0;
}

[/codebox]

One thing I’ve determined is that the timing “hiccups” occur at approximately one second intervals, meaning there is one cudaMemcpy() transfer that takes ~15ms every second whereas all of the other copies take 1ms.

I’ve reproduced the problem on a Windows XP 32-bit machine with a Quadro FX 3800; however, there the timing hiccups occur every half second instead of every second.

I have also seen these timing hiccups with my CUDA kernels.

What could possibly be running with a one second interval and how can I disable it?

Thanks.