While running some CUDA code with small numbers of blocks I noticed that atomicAdd operations were, strangely, performing better than += operations in the same code, so I wrote a simple CUDA program to test the speed difference between the two operations using a single block (cycle.h provides the timers):
// includes
#include <stdio.h>
#include "cycle.h"

__device__ int seed = 6166;

clock_t tic;
clock_t toc;

enum {
    tid_atomic = 0,
    tid_plus_equal,
    tid_count
};

__device__ float data[1032];

#define TIMER_TIC tic = getticks();
#define TIMER_TOC(tid) toc = getticks(); timers[tid] += ( ( toc > tic ) ? ( toc - tic ) : ( toc + ( 0xffffffff - tic ) ) );

__global__ void addAtomic ( ) {
    int seed = 12345;
    int threadID = threadIdx.x;
    int k, i;
    for ( k = 0 ; k < 10000000 ; k++ ) {
        seed = 1103515245 * seed + 12345;
        i = seed % 1000;
        atomicAdd( &data[ i + threadID ] , i * 0.1f );
    }
}

__global__ void addPlusEqual ( ) {
    int seed = 12345;
    int threadID = threadIdx.x;
    int k, i;
    for ( k = 0 ; k < 10000000 ; k++ ) {
        seed = 1103515245 * seed + 12345;
        i = seed % 1000;
        data[ i + threadID ] += i * 0.1f;
    }
}

int main ( ) {

    float timers[2] = { 0.0f , 0.0f };
    int i = 0;
    dim3 dimBlock( 32 , 1 , 1 );
    dim3 dimGrid( 1 , 1 , 1 );

    /* Warm-up launch so the first timed run is not penalised. */
    addAtomic<<< dimGrid , dimBlock >>>();
    if ( cudaDeviceSynchronize() != cudaSuccess )
        printf( "Error\n" );

    /* Timed atomicAdd run. */
    TIMER_TIC
    addAtomic<<< dimGrid , dimBlock >>>();
    if ( cudaDeviceSynchronize() != cudaSuccess )
        printf( "Error\n" );
    TIMER_TOC( tid_atomic )

    /* Timed += run. */
    TIMER_TIC
    addPlusEqual<<< dimGrid , dimBlock >>>();
    if ( cudaDeviceSynchronize() != cudaSuccess )
        printf( "Error\n" );
    TIMER_TOC( tid_plus_equal )

    /* Convert ticks to seconds (assuming a 2.6667 GHz tick rate) and print. */
    for ( i = 0 ; i < tid_count ; i++ )
        printf( "%.3f " , timers[i] / 2666.7e6 );
    printf( "\n" );

    return 0;
}
The result of this code is:
0.982 3.022
i.e. the atomicAdd version is roughly 3 times faster than the += version, presumably due to extra memory reads.
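My working assumption here (just a rough sketch, nothing I have verified at the instruction level) is that += has to explicitly read the old value, add to it in a register, and write it back, whereas atomicAdd with its return value ignored can be handed off to the memory system as a single operation. The two toy kernels below, with made-up names, isolate just that difference:
// Toy kernels (illustrative only, compiled as their own .cu file).
// readModifyWrite spells out what I assume += does; fireAndForget uses
// atomicAdd and throws the return value away.
__global__ void readModifyWrite( float *buf ) {
    float old = buf[ threadIdx.x ];   // explicit read from global memory
    old += 1.0f;                      // add in a register
    buf[ threadIdx.x ] = old;         // explicit write back to global memory
}

__global__ void fireAndForget( float *buf ) {
    atomicAdd( &buf[ threadIdx.x ] , 1.0f );   // return value discarded
}

int main ( ) {
    float *buf;
    cudaMalloc( &buf , 32 * sizeof(float) );
    cudaMemset( buf , 0 , 32 * sizeof(float) );
    readModifyWrite<<< 1 , 32 >>>( buf );
    fireAndForget<<< 1 , 32 >>>( buf );
    cudaDeviceSynchronize();
    cudaFree( buf );
    return 0;
}
If someone can confirm whether the compiler really treats the two cases that differently, that would already answer half of my question.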
What makes it more interesting is that if we change
data[ i + threadID ] += i * 0.1f;
to
data[ i + 3 * threadID ] += i * 0.1f;
the time for += increases to 3.961 s, so memory coalescing does help somewhat.
However, if we add (presumably) a single extra read and a local write to both versions with:
seed = atomicAdd( &data[ i + threadID ] , i * 0.1f );
and
seed = ( data[ i + threadID ] += i * 0.1f );
the times become 11.945 s and 14.032 s, suggesting that the single extra read plus local write adds roughly 11 s to each run, which is significantly larger than the original difference between atomicAdd and +=.
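For reference, the full modified kernels look roughly like this (reconstructed from the one-line changes above; note that once seed is overwritten with the value coming back from memory, the next iteration's index depends on that value, so every pass through the loop now has to wait for global memory):
// Reconstruction of the modified kernels; everything else is unchanged.
__global__ void addAtomicWithRead ( ) {
    int seed = 12345;
    int threadID = threadIdx.x;
    int k, i;
    for ( k = 0 ; k < 10000000 ; k++ ) {
        seed = 1103515245 * seed + 12345;
        i = seed % 1000;
        seed = (int)atomicAdd( &data[ i + threadID ] , i * 0.1f );   // old value is now consumed
    }
}

__global__ void addPlusEqualWithRead ( ) {
    int seed = 12345;
    int threadID = threadIdx.x;
    int k, i;
    for ( k = 0 ; k < 10000000 ; k++ ) {
        seed = 1103515245 * seed + 12345;
        i = seed % 1000;
        seed = (int)( data[ i + threadID ] += i * 0.1f );            // updated value is now consumed
    }
}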
Does anyone know how or why these two operations differ, or have a method for speeding up code that uses +=, similar to using atomicAdd within a single block but working across many blocks without atomics?
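To make the last part of the question concrete, the kind of non-atomic, multi-block scheme I have in mind is something like the sketch below (hypothetical, not something I have benchmarked): each block gets its own private copy of the array, so += never races between blocks, and a second kernel folds the copies back together. I use an unsigned seed here only to keep the index non-negative.
#define N_DATA 1032
#define N_BLOCKS 8

// One private copy of the array per block, so += needs no atomics across blocks.
__device__ float blockData[ N_BLOCKS * N_DATA ];

__global__ void addPlusEqualPrivate ( ) {
    unsigned int seed = 12345;                        // unsigned so the index stays non-negative
    int threadID = threadIdx.x;
    float *mine = &blockData[ blockIdx.x * N_DATA ];  // this block's private copy
    int k, i;
    for ( k = 0 ; k < 10000000 ; k++ ) {
        seed = 1103515245u * seed + 12345u;
        i = seed % 1000;
        mine[ i + threadID ] += i * 0.1f;             // no inter-block races
    }
}

// Second pass: sum the per-block copies into a single result array.
__global__ void reduceCopies ( float *out ) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if ( j < N_DATA ) {
        float sum = 0.0f;
        for ( int b = 0 ; b < N_BLOCKS ; b++ )
            sum += blockData[ b * N_DATA + j ];
        out[ j ] = sum;
    }
}
If there is a standard way to get += close to the single-block atomicAdd performance without this kind of duplication, that is what I am after.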