Hi
I just found a performance bottleneck in my code when running it on multiple GPUs on a big cluster.
After some profiling effort I pinned the problem down to ECC memory being used.
The culprits are cudaMemcpyToSymbol calls, which are used quite heavily and whose number is constant for each process (basically they copy single ints and floats).
I wrote a little test program (see further down) and ran it on two clusters and a workstation. The basic conclusion is that with ECC activated, copy operations seem to take about 4 times as long as without ECC. Note, though, that this is mostly a latency effect, since only single integers and the like are being copied.
The output (average time per copy in seconds; BW in MiB/s) is:
NICS Keeneland (M2050 ECC on):
Start
no error
Done Const: 2.169880e-05
Done Mem: 2.027810e-05 (BW: 1.881191e-01)
eStella (C2050 ECC on):
Start
no error
Done Const: 1.934510e-05
Done Mem: 1.817390e-05 (BW: 2.098998e-01)
eStella (C2050 ECC off):
Start
no error
Done Const: 5.480500e-06
Done Mem: 5.147300e-06 (BW: 7.411065e-01)
Workstation (GTX470 no ECC):
Start
no error
Done Const: 5.013000e-06
Done Mem: 4.917500e-06 (BW: 7.757391e-01)
Is a performance hit like that expected?
Cheers
Christian
Code (compiled with “nvcc -arch=sm_20 -lrt test.cu”):
#include <cstdio>
#include "cuda_runtime.h"
#include <ctime>
// Single int in device constant memory; the destination of the
// constant-memory copy benchmark.
__device__ __constant__ int c;

// Copies the host value `a` into the constant-memory symbol `c`.
//
// The symbol itself is passed rather than the string "c": string symbol names
// were deprecated in CUDA 4.1 and removed in CUDA 5.0, where such a call
// fails with cudaErrorInvalidSymbol. A failed copy is reported on stderr so
// the benchmark cannot silently time calls that do nothing.
void UpdateConst(int a)
{
    const cudaError_t status = cudaMemcpyToSymbol(c, &a, sizeof(a));
    if (status != cudaSuccess)
    {
        fprintf(stderr, "UpdateConst: cudaMemcpyToSymbol failed: %s\n",
                cudaGetErrorString(status));
    }
}
// Copies `size` ints from the host buffer `a` to the device buffer `d_a`
// using cudaMemcpy (host-to-device).
//
// d_a  : device pointer with room for at least `size` ints.
// a    : host pointer to at least `size` readable ints.
// size : number of ints to copy.
//
// A failed copy is reported on stderr instead of being silently ignored, so
// the benchmark does not time calls that returned an error.
void UpdateMem(int* d_a, int* a,int size)
{
    const cudaError_t status =
        cudaMemcpy(d_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "UpdateMem: cudaMemcpy failed: %s\n",
                cudaGetErrorString(status));
    }
}
// Seconds elapsed between two clock_gettime() samples.
static double ElapsedSeconds(const timespec& begin, const timespec& end)
{
    return static_cast<double>(end.tv_sec - begin.tv_sec)
         + static_cast<double>(end.tv_nsec - begin.tv_nsec) / 1e9;
}

// Micro-benchmark: measures the average host-side time of a tiny
// cudaMemcpyToSymbol (via UpdateConst) and a tiny host-to-device cudaMemcpy
// (via UpdateMem), and prints the per-call time in seconds plus the achieved
// bandwidth in MiB/s for the cudaMemcpy case.
//
// Returns 0 on success, 1 if device setup (initial sync or allocation) fails.
int main()
{
    printf("Start\n");

    const int size = 1;        // number of ints per cudaMemcpy
    const int loopsC = 10000;  // timed UpdateConst iterations
    const int loopsD = 10000;  // timed UpdateMem iterations

    // First runtime call; cudaDeviceSynchronize replaces the deprecated
    // cudaThreadSynchronize.
    cudaError_t status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize failed: %s\n",
                cudaGetErrorString(status));
        return 1;
    }

    int* d_a = NULL;
    status = cudaMalloc(&d_a, size * sizeof(int));
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Host source buffer: sized to match the copy (the original allocated a
    // single int regardless of `size`) and zero-initialised so no
    // indeterminate values are read.
    int* a = new int[size]();

    printf("%s\n", cudaGetErrorString(cudaGetLastError()));

    // CLOCK_MONOTONIC is used for the intervals because CLOCK_REALTIME can be
    // stepped (e.g. by NTP) while the benchmark is running.
    timespec start, stop;

    // --- constant-memory copies ---
    UpdateConst(0);  // warm-up call, excluded from the timing
    cudaDeviceSynchronize();
    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < loopsC; i++)
        UpdateConst(i);
    // Make sure all outstanding device work is finished before stopping the clock.
    cudaDeviceSynchronize();
    clock_gettime(CLOCK_MONOTONIC, &stop);
    double elapsed = ElapsedSeconds(start, stop);
    printf("Done Const: %e\n", elapsed / loopsC);

    // --- global-memory copies ---
    UpdateMem(d_a, a, size);  // warm-up call, excluded from the timing
    cudaDeviceSynchronize();
    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < loopsD; i++)
        UpdateMem(d_a, a, size);
    // A pageable host-to-device cudaMemcpy may return before the transfer to
    // the device has completed, so synchronise before reading the clock.
    cudaDeviceSynchronize();
    clock_gettime(CLOCK_MONOTONIC, &stop);
    elapsed = ElapsedSeconds(start, stop);

    // Compute the byte count in double so loopsD * size cannot overflow int
    // when the constants above are increased.
    const double totalBytes = static_cast<double>(loopsD) * size * sizeof(int);
    printf("Done Mem: %e (BW: %e)\n", elapsed / loopsD,
           totalBytes / elapsed / 1024 / 1024);

    delete[] a;
    cudaFree(d_a);
    return 0;
}