Hi,
I wanted to test the difference in speed of using shared mem compared with global mem. And despite having quite little computations done, there is almost no difference. I wonder if this could really be true or if I had generally made a bad mistake. It even works for a small number of blocks and threads per block.
The first method “globalReadTest()” just starts the test.
The method “globalReadEach()” reads/writes one float from/to global mem.
The method “globalReadNone()” does not read/write from/to global mem.
Here`s the code:
void globalReadTest() { //method to test the speed to access shard and global //memory
int blockNum=16;
int threadNum=32;
CUT_DEVICE_INIT();
float* data;
cudaMalloc((void**)&data, 1024*1024*10); // allocate some global mem
int iter=2000; //number of iterations for the test
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
for (int u=0; u<iter; u++) { //within this loop, a method is invoked,
//accessing global mem within each thread
globalReadEach<<<blockNum, threadNum, 4*threadNum>>>(data);
}
CUT_SAFE_CALL( cutStopTimer( timer));
float timerDiff=cutGetTimerValue( timer);
CUT_SAFE_CALL( cutDeleteTimer( timer));
unsigned int timer2 = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer2));
CUT_SAFE_CALL( cutStartTimer( timer2));
for (int u=0; u<iter; u++) { within this loop, a method is invoked,
//without accessing global mem at all
globalReadNone<<<blockNum, threadNum, 4*threadNum>>>(data);
}
CUT_SAFE_CALL( cutStopTimer( timer2));
float timerDiff2=cutGetTimerValue( timer2);
CUT_SAFE_CALL( cutDeleteTimer( timer2));
cudaFree((void**)&data);
printf("Duration globalReadEach: %f\nDuration globalReadNone: %f\nA small difference in time might be caused by the order of execution!\n", timerDiff, timerDiff2);
}
global void globalReadEach(float* global) { //one read and write access to //global mem
extern shared float shared;
int myPos= blockDim.x * blockIdx.x + threadIdx.x; //coalesced position
//for global mem access
shared[ threadIdx.x ]=global[ myPos ]; //read from global mem
__syncthreads();
float erg=0;
calc(&erg); //some calculations
global[myPos]=erg; //write to global mem
}
global void globalReadNone(float* global) { //no global mem access
extern shared float shared;
shared[ threadIdx.x ]= myPos;
__syncthreads();
float erg=0;
calc(&erg); //some calculations
}
static device void calc(float* erg) { //just some calculations
extern shared float shared;
for (int b=0; b<1; b++) {
for (int i=0; i< 3/blockDim.x/; i++ ) {
erg=sin(shared[i])*cos(shared[i])/i/sin((float)b);
}
}
}