Great idea, Sylvain! And it works for signed short, too:
/*
 * Atomically adds `val` to the 16-bit value at `address` and returns the
 * value that was stored there immediately before the addition.
 *
 * The update is performed on the aligned 32-bit word that contains the
 * target halfword, using a compare-and-swap loop. Only the 16 bits that
 * belong to `*address` are ever modified; the neighbouring halfword is
 * written back unchanged, so a carry out of the low half can never be
 * observed (even transiently) in the high half.
 *
 * address: pointer to the short to update; expected to be 2-byte aligned.
 * val:     value to add (arithmetic wraps modulo 2^16).
 * returns: the previous contents of *address.
 */
__device__ short atomicAddShort(short* address, short val)
{
    /* Round the address down to the enclosing 4-byte word. */
    unsigned int *base_address = (unsigned int *)((size_t)address & ~(size_t)2);

    /*
     * The halfword at the higher address lives in the upper 16 bits of the
     * word (same layout assumption as the previous add/sub implementation).
     */
    unsigned int shift = ((size_t)address & 2) ? 16u : 0u;
    unsigned int mask = 0xffffu << shift;

    /* Zero-extend so that a negative val does not spill above bit 15. */
    unsigned int addend = (unsigned int)(unsigned short)val;

    unsigned int old_word = *base_address;
    unsigned int assumed;
    do {
        assumed = old_word;

        /* 16-bit wrapping add confined to the target halfword. */
        unsigned int sum = (((assumed >> shift) & 0xffffu) + addend) & 0xffffu;
        unsigned int new_word = (assumed & ~mask) | (sum << shift);

        /* Publish only if nobody changed the word since we read it. */
        old_word = atomicCAS(base_address, assumed, new_word);
    } while (old_word != assumed);

    return (short)((old_word >> shift) & 0xffffu);
}