Dear all,
I wrote two kernels to see how much faster atomicAdd() on shared memory is than atomicAdd() on global memory.
The kernels are simple: each thread just keeps adding i, for i = 0 … ITER-1, with 16 threads per block across 256/16 blocks.
The result I cannot understand is:
atomicAdd to Shared memory - 140ms
atomicAdd directly to global memory - 90 ms
I would really appreciate it if you could drop me a line.
SK.
Here is the simple code:
// Threads per block for both launches; also the length of the per-block
// shared array in kernel_shdatm.
// NOTE(review): despite the name, this is not the hardware warp size
// (a warp is 32 threads) - confirm that 16 is intentional.
#define WARP_WIDTH 16
// Total number of threads in the grid (the launches use W/WARP_WIDTH blocks).
#define W 256
// Number of atomicAdd() calls each thread issues in the benchmark loop.
#define ITER 1000000
///////////////AtomicAdd to Shared memory ‘shd’//////////////////
/**
 * Benchmark kernel: every thread issues ITER atomicAdd() calls on its own
 * slot of a block-local shared-memory array, then copies that slot to out.
 *
 * Launch layout: 1-D grid of 1-D blocks with blockDim.x <= WARP_WIDTH (the
 * shared array has WARP_WIDTH slots). No bounds check is performed, so in
 * and out must each hold at least gridDim.x * blockDim.x ints.
 * Shared memory: WARP_WIDTH * sizeof(int) bytes, statically allocated.
 *
 * NOTE(review): with ITER = 1000000 the mathematical sum of 0 .. ITER-1
 * (499999500000) does not fit in an int; the stored value is whatever
 * atomicAdd() leaves after overflow.
 * NOTE(review): on GPUs older than Maxwell, shared-memory atomics are
 * reportedly implemented as a lock/update/unlock sequence instead of a
 * native instruction, which may make this kernel slower than the
 * global-memory variant - confirm for the target device.
 *
 * @param in   per-thread starting values (each element is read once)
 * @param out  per-thread results: in[j] plus the accumulated loop indices
 */
__global__ void kernel_shdatm(int* in, int* out)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    __shared__ int shd[WARP_WIDTH];

    // Each thread reads and writes only shd[threadIdx.x], so no block-wide
    // barrier is needed anywhere in this kernel.
    shd[threadIdx.x] = in[j];

    for (int i = 0; i < ITER; i++)
        atomicAdd(&shd[threadIdx.x], i);

    out[j] = shd[threadIdx.x];
}
///////////////AtomicAdd to global memory ‘out’//////////////////
/**
 * Benchmark kernel: every thread issues ITER atomicAdd() calls directly on
 * its own element of the global-memory array out.
 *
 * Launch layout: 1-D grid of 1-D blocks. No bounds check is performed, so
 * out must hold at least gridDim.x * blockDim.x ints.
 * The kernel only adds to out[j]; it never initialises it, so the caller
 * must set out to the desired starting values before the launch.
 *
 * NOTE(review): with ITER = 1000000 the mathematical sum of 0 .. ITER-1
 * (499999500000) does not fit in an int; the stored value is whatever
 * atomicAdd() leaves after overflow.
 *
 * @param in   unused; kept so the signature matches kernel_shdatm
 * @param out  per-thread accumulators, updated in place
 */
__global__ void kernel_glbatm(int* in, int* out)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    // Each thread touches only out[j]; no barrier is required.
    for (int i = 0; i < ITER; i++)
        atomicAdd(&out[j], i);
}
////////////////////////////////////////////////////////////////////
// Kernel launches: both use W/WARP_WIDTH blocks of WARP_WIDTH threads each.
// Variant 1: accumulate in shared memory, then copy each result to global memory.
kernel_shdatm<<<W/WARP_WIDTH, WARP_WIDTH>>>(g_in, g_out);
// Variant 2: accumulate directly in global memory.
kernel_glbatm<<<W/WARP_WIDTH, WARP_WIDTH>>>(g_ing, g_outg);
The time is measured by surrounding each launch, including its memory allocation and copies, with cudaEvent…() calls.