Hello,
I have been noticing that the latency associated with cudaMemcpy calls can be very different from one machine to another. To investigate this, I created the following example, which evolves 2D heat diffusion on a square grid of a given edge size for a given number of steps. It has only one simple kernel, and I can optionally introduce a (synchronous) cudaMemcpy from device to host and/or from host to device at each time step.
#include <stdio.h>
#include <time.h>
#include <cuda.h>
#include <omp.h>
#include <windows.h>
#ifdef DP
#define REAL double
#else
#define REAL float
#endif
// Jacobi step for 2D heat diffusion: average the four nearest neighbours
__global__ void avg1(
    const int nEdge,
    const REAL *const yin,
    REAL *const yout)
{
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    const int j = blockIdx.y*blockDim.y + threadIdx.y;
    if( i > 0 && i < nEdge - 1
     && j > 0 && j < nEdge - 1)
    {
        const int icc = i*nEdge + j;
        const REAL yn1 = yin[icc - nEdge];
        const REAL yn2 = yin[icc + nEdge];
        const REAL yn3 = yin[icc - 1];
        const REAL yn4 = yin[icc + 1];
        yout[icc] = (yn1 + yn2 + yn3 + yn4)/(REAL)4; // divide as REAL to avoid double-precision math in float builds
    }
}
// simple 1D copy kernel, used for halo transfers when peer-to-peer access is enabled
__global__ void kernelCpy(
    REAL *const dest,
    REAL *const src,
    const int n)
{
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i < n)
        dest[i] = src[i];
}

// swap the device buffer pointers between steps
void swap(
    REAL*& d_y1,
    REAL*& d_y2)
{
    REAL *const tmp = d_y1;
    d_y1 = d_y2;
    d_y2 = tmp;
}
void printUsage()
{
    printf("Usage: GPUPerformanceXP.exe peerToPeerType memCpy nEdge nSteps devId0 devId1 ...\n");
    printf("  peerToPeerType 0 = data transfer via CPU\n");
    printf("  peerToPeerType 1 = data transfer using cudaMemcpyDeviceToDevice\n");
    printf("  peerToPeerType 2 = data transfer using kernels where peer-to-peer access is enabled\n");
    printf("  memCpy 0 = no additional CPU-GPU memory copies\n");
    printf("  memCpy 1 = one cudaMemcpyDeviceToHost per step\n");
    printf("  memCpy 2 = one cudaMemcpyHostToDevice per step\n");
    printf("  memCpy 3 = both 1 and 2\n");
}
#define MAXDEV 16
int main(
    const int argc,
    const char *const argv[])
{
    int peerToPeerType;
    int memCopy;
    int nEdge;
    int nSteps;
    int nDevices;
    int devIds[MAXDEV];
    REAL* y[MAXDEV];    // pinned host buffer, one per device
    REAL* d_y1[MAXDEV]; // device buffer (current step)
    REAL* d_y2[MAXDEV]; // device buffer (next step)
    dim3 nThreadsPerBlock1D(32);
    dim3 nBlocks1D;
    dim3 nThreadsPerBlock2D(16, 16);
    dim3 nBlocks2D;
    SYSTEMTIME sysTime1, sysTime2;
    nDevices = argc - 5;
    if(nDevices < 1)
    {
        printUsage();
        return 0;
    }
    if(nDevices > MAXDEV)
    {
        printf("ERROR: Currently this test only supports up to %i devices\n", MAXDEV);
        return 0;
    }
    if(sscanf(argv[1], "%i", &peerToPeerType) != 1)
    {
        printf("ERROR: Expecting peerToPeerType to be an integer\n");
        return 0;
    }
    if(sscanf(argv[2], "%i", &memCopy) != 1)
    {
        printf("ERROR: Expecting memCopy to be an integer\n");
        return 0;
    }
    if(sscanf(argv[3], "%i", &nEdge) != 1)
    {
        printf("ERROR: Expecting nEdge to be an integer\n");
        return 0;
    }
    if(sscanf(argv[4], "%i", &nSteps) != 1)
    {
        printf("ERROR: Expecting nSteps to be an integer\n");
        return 0;
    }
    for(int i = 0; i < nDevices; i++)
        if(sscanf(argv[i + 5], "%i", devIds + i) != 1)
        {
            printf("ERROR: Expecting devIds to be integers\n");
            return 0;
        }
    if(nEdge < 16 || nEdge > 65536)
    {
        printf("ERROR: Expecting nEdge to be in range 16 ... 65536\n");
        return 0;
    }
    if(nSteps < 1)
    {
        printf("ERROR: Expecting nSteps to be > 0\n");
        return 0;
    }
    nBlocks1D.x = (nEdge + nThreadsPerBlock1D.x - 1)/nThreadsPerBlock1D.x;
    nBlocks2D.x = (nEdge + nThreadsPerBlock2D.x - 1)/nThreadsPerBlock2D.x;
    nBlocks2D.y = (nEdge + nThreadsPerBlock2D.y - 1)/nThreadsPerBlock2D.y;
    {
        int count;
        if(cudaGetDeviceCount(&count) != cudaSuccess)
        {
            printf("ERROR: Unable to determine number of cuda devices - check GPU driver\n");
            return 0;
        }
        printf("%i cuda devices found\n", count);
        for(int i = 0; i < nDevices; i++)
        {
            if(devIds[i] < 0 || devIds[i] >= count)
            {
                printf("ERROR: devId[%i] must be in range 0 ... %i\n", i, count - 1);
                return 0;
            }
        }
    }
    int* peerToPeerAccess = new int[nDevices*nDevices];
    for(int i = 0; i < nDevices*nDevices; i++)
        peerToPeerAccess[i] = 0;
    if(peerToPeerType > 0)
    {
        // enable peer access between each device and its left/right neighbour
        for(int i = 0; i < nDevices; i++)
        {
            int res;
            if(i > 0)
            {
                const int j = i - 1;
                int canAccessPeer;
                res = cudaDeviceCanAccessPeer(&canAccessPeer, devIds[i], devIds[j]);
                if(res != cudaSuccess)
                    printf("cudaDeviceCanAccessPeer %i to %i failed with return code %i\n", devIds[i], devIds[j], res);
                if(canAccessPeer)
                {
                    printf("enabling peer to peer access dev %i to dev %i\n", devIds[i], devIds[j]);
                    cudaSetDevice(devIds[i]);
                    res = cudaDeviceEnablePeerAccess(devIds[j], 0);
                    if(res == cudaSuccess)
                        peerToPeerAccess[i*nDevices + j] = canAccessPeer;
                    else
                        printf("cudaDeviceEnablePeerAccess failed with return code %i\n", res);
                }
                else
                    printf("peer to peer access dev %i to dev %i not possible\n", devIds[i], devIds[j]);
            }
            if(i < nDevices - 1)
            {
                const int j = i + 1;
                int canAccessPeer;
                res = cudaDeviceCanAccessPeer(&canAccessPeer, devIds[i], devIds[j]);
                if(res != cudaSuccess)
                    printf("cudaDeviceCanAccessPeer %i to %i failed with return code %i\n", devIds[i], devIds[j], res);
                if(canAccessPeer)
                {
                    printf("enabling peer to peer access dev %i to dev %i\n", devIds[i], devIds[j]);
                    cudaSetDevice(devIds[i]);
                    res = cudaDeviceEnablePeerAccess(devIds[j], 0);
                    if(res == cudaSuccess)
                        peerToPeerAccess[i*nDevices + j] = canAccessPeer;
                    else
                        printf("cudaDeviceEnablePeerAccess failed with return code %i\n", res);
                }
                else
                    printf("peer to peer access dev %i to dev %i not possible\n", devIds[i], devIds[j]);
            }
        }
    }
    printf("peerToPeerAccess:\n");
    for(int i = 0; i < nDevices; i++)
    {
        printf("%i", peerToPeerAccess[i*nDevices + 0]);
        for(int j = 1; j < nDevices; j++)
            printf(" %i", peerToPeerAccess[i*nDevices + j]);
        printf("\n");
    }
    for(int tid = 0; tid < nDevices; tid++)
    {
        if(cudaSetDevice(devIds[tid]) != cudaSuccess)
        {
            printf("ERROR: Unable to select device %i\n", devIds[tid]);
            return 0;
        }
        cudaMallocHost(y + tid, nEdge*nEdge*sizeof(REAL));
        cudaMalloc(d_y1 + tid, nEdge*nEdge*sizeof(REAL));
        cudaMalloc(d_y2 + tid, nEdge*nEdge*sizeof(REAL));
        if(d_y1[tid] == NULL || d_y2[tid] == NULL) // check the per-device pointers, not the arrays
        {
            printf("ERROR: Unable to allocate memory on device %i\n", devIds[tid]);
            return 0;
        }
    }
    omp_set_num_threads(nDevices);
#pragma omp parallel
    {
        const int tid = omp_get_thread_num();
        cudaSetDevice(devIds[tid]);
        printf("Initialising host memory for thread %i\n", tid);
        memset(y[tid], 0, nEdge*nEdge*sizeof(REAL));
        if(tid == nDevices - 1) // hot boundary on the last row of the last device
            for(int j = 0; j < nEdge; j++)
                y[tid][(nEdge - 1)*nEdge + j] = 1.0;
        printf("Copying data to GPU %i\n", devIds[tid]);
        cudaMemcpy(d_y2[tid], y[tid], nEdge*nEdge*sizeof(REAL), cudaMemcpyHostToDevice);
        cudaMemcpy(d_y1[tid], y[tid], nEdge*nEdge*sizeof(REAL), cudaMemcpyHostToDevice);
    }
    GetSystemTime(&sysTime1);
#pragma omp parallel
    {
        const int tid = omp_get_thread_num();
        cudaSetDevice(devIds[tid]); // rebind: thread/device affinity is not guaranteed across parallel regions
        for(int iStep = 0; iStep < nSteps; iStep++)
        {
            avg1<<<nBlocks2D, nThreadsPerBlock2D>>>(nEdge, d_y1[tid], d_y2[tid]);
            cudaStreamSynchronize(NULL);
#pragma omp barrier
            // halo exchange with neighbouring devices
            switch(peerToPeerType)
            {
            case 0:
                // stage halo rows through pinned host RAM
                if(tid < nDevices - 1)
                    cudaMemcpy(y[tid] + (nEdge - 2)*nEdge, d_y2[tid] + (nEdge - 2)*nEdge, nEdge*sizeof(REAL), cudaMemcpyDeviceToHost);
                if(tid > 0)
                    cudaMemcpy(y[tid] + 1 *nEdge, d_y2[tid] + 1 *nEdge, nEdge*sizeof(REAL), cudaMemcpyDeviceToHost);
#pragma omp barrier
                if(tid < nDevices - 1)
                    cudaMemcpy(d_y2[tid] + (nEdge - 1)*nEdge, y[tid + 1] + 1 *nEdge, nEdge*sizeof(REAL), cudaMemcpyHostToDevice);
                if(tid > 0)
                    cudaMemcpy(d_y2[tid] + 0 *nEdge, y[tid - 1] + (nEdge - 2)*nEdge, nEdge*sizeof(REAL), cudaMemcpyHostToDevice);
                break;
            case 1:
                // cudaMemcpyDeviceToDevice
                if(tid < nDevices - 1)
                    cudaMemcpy(d_y2[tid] + (nEdge - 1)*nEdge, d_y2[tid + 1] + 1 *nEdge, nEdge*sizeof(REAL), cudaMemcpyDeviceToDevice);
                if(tid > 0)
                    cudaMemcpy(d_y2[tid] + 0 *nEdge, d_y2[tid - 1] + (nEdge - 2)*nEdge, nEdge*sizeof(REAL), cudaMemcpyDeviceToDevice);
                break;
            case 2:
                // use a copy kernel where peer access is enabled
                if(tid < nDevices - 1)
                {
                    if(peerToPeerAccess[tid*nDevices + tid + 1])
                        kernelCpy<<<nBlocks1D, nThreadsPerBlock1D>>>(d_y2[tid] + (nEdge - 1)*nEdge, d_y2[tid + 1] + 1 *nEdge, nEdge);
                    else
                        cudaMemcpy(d_y2[tid] + (nEdge - 1)*nEdge, d_y2[tid + 1] + 1 *nEdge, nEdge*sizeof(REAL), cudaMemcpyDeviceToDevice);
                }
                if(tid > 0)
                {
                    if(peerToPeerAccess[tid*nDevices + tid - 1])
                        kernelCpy<<<nBlocks1D, nThreadsPerBlock1D>>>(d_y2[tid] + 0 *nEdge, d_y2[tid - 1] + (nEdge - 2)*nEdge, nEdge);
                    else
                        cudaMemcpy(d_y2[tid] + 0 *nEdge, d_y2[tid - 1] + (nEdge - 2)*nEdge, nEdge*sizeof(REAL), cudaMemcpyDeviceToDevice);
                }
                break;
            default:
                break;
            }
            // optional single-element copies: these probe latency, not bandwidth
            if((memCopy & 0x00000001) > 0)
                cudaMemcpy( y[tid], d_y2[tid], sizeof(REAL), cudaMemcpyDeviceToHost);
            if((memCopy & 0x00000002) > 0)
                cudaMemcpy(d_y1[tid], y[tid], sizeof(REAL), cudaMemcpyHostToDevice);
            swap(d_y1[tid], d_y2[tid]);
        }
        cudaMemcpy(y[tid], d_y1[tid], nEdge*nEdge*sizeof(REAL), cudaMemcpyDeviceToHost);
    }
    GetSystemTime(&sysTime2);
    {
        int wallTime = 3600000*(int)(sysTime2.wHour - sysTime1.wHour);
        wallTime += 60000*(int)(sysTime2.wMinute - sysTime1.wMinute);
        wallTime += 1000*(int)(sysTime2.wSecond - sysTime1.wSecond);
        wallTime += (int)(sysTime2.wMilliseconds - sysTime1.wMilliseconds);
        printf("Wall time = %i ms\n", wallTime);
    }
    printf("\n");
    for(int tid = 0; tid < nDevices; tid++)
    {
        cudaSetDevice(devIds[tid]);
        cudaFree(d_y1[tid]);
        cudaFree(d_y2[tid]);
        cudaFreeHost(y[tid]);
    }
    delete[] peerToPeerAccess;
    return 0;
}
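For reference, a typical single-GPU run with both per-step copies enabled (illustrative values, device id 0) looks like:

GPUPerformanceXP.exe 0 3 1024 1000 0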
I run the model with different grid sizes and measure the wall time. Plotting time versus number of cells produces a near-linear plot, where the y-intercept can be taken as the latency associated with the kernel launches. When the synchronous cudaMemcpys are added in, the y-intercept increases as expected, with the delta being the latency associated with the copy. Note that each copy only moves one 4-byte float, so bandwidth is unimportant - the code is only testing latency.
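As a cross-check on the intercept method, the per-call latency can also be measured directly by timing a single 4-byte copy in a tight loop with a host-side timer. A minimal sketch (buffer names are illustrative; pinned host memory via cudaMallocHost, as in the main code):

#include <stdio.h>
#include <windows.h>
#include <cuda_runtime.h>

int main()
{
    const int nReps = 10000;
    float *d_buf = NULL;    // single-element device buffer
    float *h_pinned = NULL; // single-element pinned host buffer
    cudaMalloc(&d_buf, sizeof(float));
    cudaMallocHost(&h_pinned, sizeof(float));
    // warm-up copy so first-call overhead is excluded from the timing
    cudaMemcpy(h_pinned, d_buf, sizeof(float), cudaMemcpyDeviceToHost);
    LARGE_INTEGER freq, t0, t1;
    QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&t0);
    for(int i = 0; i < nReps; i++)
        cudaMemcpy(h_pinned, d_buf, sizeof(float), cudaMemcpyDeviceToHost);
    QueryPerformanceCounter(&t1);
    const double us = 1.0e6*(double)(t1.QuadPart - t0.QuadPart)/(double)freq.QuadPart;
    printf("Average D2H latency over %i reps: %.2f us\n", nReps, us/nReps);
    cudaFreeHost(h_pinned);
    cudaFree(d_buf);
    return 0;
}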
Now the interesting thing is that when I run this test on a number of different machines and determine the delta associated with introducing the cudaMemcpys, I get:
The best results come from the two Linux machines. The six Windows machines are not as good, probably because of the WDDM driver model (and I can't put those cards into TCC). But two of the Windows machines show latencies 10x those of the Linux machines, and 3x worse than some of the other Windows machines. This hurts for small models, and for multi-GPU models with halo exchanges over the PCIe bus. What factors or settings could be causing this?
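For what it's worth, the driver model each card is actually running can be confirmed programmatically from cudaDeviceProp::tccDriver (nvidia-smi also reports the driver model per GPU), e.g.:

#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    int count = 0;
    cudaGetDeviceCount(&count);
    for(int i = 0; i < count; i++)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        // tccDriver is 1 when the device is running the TCC driver model
        printf("dev %i: %s  tccDriver = %i\n", i, prop.name, prop.tccDriver);
    }
    return 0;
}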
Greg.