atomicAdd(float*,float) - atomicMul(float*,float) ...

Hi,

We need to build the sum of float data across a grid, but CUDA has no atomicAdd(float*,float). I have searched this forum but found only one slow solution.

I know that (A+B)+C != A+(B+C) when the data are floats, so the result can depend on the order of the additions.
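A small host-side example of this (illustrative values, not part of the benchmark below):

#include <stdio.h>

int main(void)
{
    // Single-precision addition is not associative:
    float a = 1e8f, b = -1e8f, c = 1.0f;
    printf("(a+b)+c = %f\n", (a + b) + c);  // 1.000000: (1e8f + -1e8f) == 0.0f, then + 1.0f
    printf("a+(b+c) = %f\n", a + (b + c));  // 0.000000: -1e8f + 1.0f rounds back to -1e8f
    return 0;
}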

So I built this for myself, but I am posting the code here for anyone with a related problem.

This implementation can be changed to mul, sub, div, … (a sketch of a mul variant is shown after the two add versions below).

I ran a little test to check the speed of this method: I sum all the numbers from 1.0 to 256000.0, then I do the same with atomicAdd(int*,int) and compare the runtimes.

FORUM = the implementation posted in this forum
MY    = this implementation (see below)
INT   = implementation with atomicAdd(int*,int)

  MY  |  FORUM  | INT
------+---------+-----
 50ms | 34862ms | 8ms

My atomicAdd() implementation is about 6 times slower than atomicAdd(int).

I call the kernel myAdd with these parameters: “out” is a pointer to a single float value and “in” is an array with 256000 values (1 to 256000).

myAdd<<<100,256>>>(out,in);

An implementation that returns the old value:

__device__ inline float atomicAdd(float* address, float value)
{
  float old = value;
  // Take the current accumulator and leave 0.0f as an "empty" marker.
  float ret = atomicExch(address, 0.0f);
  float new_old = ret + old;
  // Write the sum back; a non-zero return means another thread deposited
  // its sum in the meantime, so fold it in and try again.
  while ((old = atomicExch(address, new_old)) != 0.0f)
  {
    new_old = atomicExch(address, 0.0f);
    new_old += old;
  }
  // The value that was in *address before our add.
  return ret;
}

__global__ void myAdd(T *out, T *a)   // T is float here (see the full code further down)
{
  atomicAdd(out, a[blockIdx.x*blockDim.x + threadIdx.x]);
}

An implementation without a return value (needs fewer registers):

__device__ inline void atomicAdd(float* address, float value)
{
  float old = value;
  float new_old;
  do
  {
    // Take the current accumulator and leave 0.0f as an "empty" marker.
    new_old = atomicExch(address, 0.0f);
    new_old += old;
  }
  // Write the sum back; a non-zero return means another thread deposited
  // its sum in the meantime, so fold it in and repeat.
  while ((old = atomicExch(address, new_old)) != 0.0f);
}
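As mentioned above, the same exchange trick can be adapted to other operations. Here is an untested sketch of a hypothetical atomicMul built the same way: 1.0f, the multiplicative identity, plays the role that 0.0f plays for the sum, so the target location must be initialised to 1.0f before the kernel runs.

__device__ inline void atomicMul(float* address, float value)
{
  float old = value;
  float new_old;
  do
  {
    // Take the current accumulator and leave 1.0f as the "empty" marker.
    new_old = atomicExch(address, 1.0f);
    new_old *= old;
  }
  // Write the product back; a non-1.0f return is another thread's deposit,
  // so fold it in and repeat.
  while ((old = atomicExch(address, new_old)) != 1.0f);
}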

I have tested this atomic function with a helper shared variable (each block accumulates into shared memory and only thread 0 issues a single global atomicAdd per block), and this is the result.

Run with the same settings as in the previous post.

   MY    |  FORUM  |   INT
---------+---------+---------
 0.417ms | 2.820ms | 0.101ms

Now my atomicAdd() implementation is only about 4 times slower than atomicAdd(int).

__global__ void myAdd(T *out, T *a)
{
   __shared__ float myShare;
   if (threadIdx.x == 0) myShare = 0.0f;
   __syncthreads();
   // Accumulate within the block in shared memory ...
   atomicAdd(&myShare, a[blockIdx.x*blockDim.x + threadIdx.x]);
   __syncthreads();
   // ... then one thread per block adds the block sum to the global result.
   if (threadIdx.x == 0) atomicAdd(out, myShare);
}

How can that last piece of code you posted, which is doing a shared memory atomic add to a float variable, possibly compile, let alone run?

No, this was only a part of the code.

Compile the code with

nvcc main.cu -arch sm_13

to test atomicAdd(int*,int), with

nvcc main.cu -arch sm_13 -DFLOAT

to test atomicAdd(float*,float), and with

nvcc main.cu -arch sm_13 -DFLOAT -DFORUM

to test with float and the version from the forum.

Full code:

#include <cuda.h>
#include <stdio.h>

#ifdef FLOAT
  #define T float
#else
  #define T int
#endif

#ifdef FORUM
// Version previously posted in this forum: reinterpret the float bits as int
// and retry with atomicCAS until the sum was based on an unchanged value.
__device__ inline void atomicAdd(float *address, float val)
{
    int i_val = __float_as_int(val);
    int tmp0 = 0;
    int tmp1;
    while ((tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0)
    {
        tmp0 = tmp1;
        i_val = __float_as_int(val + __int_as_float(tmp1));
    }
}
#else
// My version: grab the accumulator with atomicExch (leaving 0.0f as an
// "empty" marker), add, and swap the sum back in until the slot was empty.
__device__ inline float atomicAdd(float* address, float value)
{
  float old = value;
  float ret = atomicExch(address, 0.0f);
  float new_old = ret + old;
  while ((old = atomicExch(address, new_old)) != 0.0f)
  {
    new_old = atomicExch(address, 0.0f);
    new_old += old;
  }
  return ret;
}
#endif

void checkCUDAError(const char *msg);

__global__ void myAdd(T *out, T *a)
{
   __shared__ T myShare;
   if (threadIdx.x == 0) myShare = 0;
   __syncthreads();
   atomicAdd(&myShare, a[blockIdx.x*blockDim.x + threadIdx.x]);
   __syncthreads();
   if (threadIdx.x == 0) atomicAdd(out, myShare);
}

int main(int argc, char** argv)
{
    float elapsedTimeInMs = 0.0f;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // pointer for host memory
    T *h_a;
    T *h_out;
    // pointer for device memory
    T *d_a;
    T *d_out;

    int numBlocks = 100;
    int numThreadsPerBlock = 256;
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(T);

    h_a = (T *) malloc(memSize);
    h_out = (T *) malloc(sizeof(T));
    cudaMalloc((T**) &d_a, memSize);
    cudaMalloc((T**) &d_out, sizeof(T));

    T x = 0;
    for (int i = 0; i < (numBlocks*numThreadsPerBlock); i++)
    {
       h_a[i] = (T)(i+1);
       x += (T)(i+1);
    }
    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);

    *h_out = 0;
    cudaMemcpy(d_out, h_out, sizeof(T), cudaMemcpyHostToDevice);

    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);

    cudaEventRecord(start, 0);
    myAdd<<<dimGrid, dimBlock>>>(d_out, d_a);
    cudaEventRecord(stop, 0);
    cudaThreadSynchronize();
    cudaEventElapsedTime(&elapsedTimeInMs, start, stop);

    cudaMemcpy(h_out, d_out, sizeof(T), cudaMemcpyDeviceToHost);

#ifdef FLOAT
    printf("out: %f cpu: %f, time=%.6fms\n", *h_out, x, elapsedTimeInMs);
#else
    printf("out: %d cpu: %d, time=%.6fms\n", *h_out, x, elapsedTimeInMs);
#endif

    // free device memory
    cudaFree(d_a);
    cudaFree(d_out);
    // free host memory
    free(h_a);
    free(h_out);
    return 0;
}

I have an update to the atomicAdd with and without a return value.

With return value (uses 2 registers):

__device__ inline float atomicAdd(float* address, float value)
{
  // Grab the accumulator (leaving 0.0f as the "empty" marker) and add our value.
  float ret = atomicExch(address, 0.0f);
  float old = ret + value;
  // Swap the sum back in; a non-zero return is another thread's deposit,
  // so grab the slot again, fold everything together and retry.
  while ((old = atomicExch(address, old)) != 0.0f)
  {
    old = atomicExch(address, 0.0f) + old;
  }
  return ret;
}

Without return value (uses 1 register):

__device__ inline void atomicAdd(float* address, float value)
{
  float old = value;
  // Grab the accumulator (leaving 0.0f), add our pending value and swap the
  // sum back in; repeat as long as the swap returns a non-zero deposit.
  while ((old = atomicExch(address, atomicExch(address, 0.0f) + old)) != 0.0f);
}

I think atomic float addition is perhaps the #1 FAQ on the CUDA forums… either that, or “why doesn’t atomicAdd link?” (Which is because you need the nvcc compute level switch set.)

Here’s a thread from a year and a half ago with the same “exchange with 0.0, keep swapping in the sum until you get 0.0 back” strategy.

Ohh, thanks, I hadn't found that thread; that's why I posted this.

Do these atomicAdd implementations for floats still work on CUDA 3.0 and Fermi? I compiled the code above with CUDA 3.1 and ran the generated binary on a GTX 470, and I got an “invalid device function” error at run time. But if I use the atomicAdd() for compute_20 devices, it works fine.

People do know that compute 2.0 added an intrinsic float atomic add, right?

CUDA programming guide 3.1 - B.11.1.1
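For reference, on compute capability 2.0 devices that built-in overload can be called directly when the code is compiled with -arch=sm_20; a minimal, illustrative sketch (the kernel name and bounds check are not from the thread):

// atomicAdd(float*, float) is a hardware instruction on compute capability >= 2.0.
__global__ void sumFloats(float *out, const float *in, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        atomicAdd(out, in[i]);
}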

Yes, but I want to make my code backward compatible with non-Fermi hardware. When running a kernel on non-Fermi cards, can I fall back to the atomicAdd above?

Does it also work for double-precision variables? Thanks
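(Note: atomicExch has no double overload, so the exchange trick does not carry over directly to doubles. A CAS-based variant in the style of the forum version can be used instead; a minimal sketch, assuming a device with 64-bit atomicCAS (compute capability 1.2 or higher) and using the hypothetical name atomicAddDouble:)

__device__ inline double atomicAddDouble(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do
    {
        assumed = old;
        // Retry until the value we based the sum on has not changed underneath us.
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}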

I am very surprised: for floats, the hacked atomicFloatAdd is ~10% faster than the native one on a GTX 470.

What is the collision probability in that case? I can imagine that the native atomic float addition could be faster when the probability of two threads accessing the same float is high.

I got really confused after reading Section 3.1.4 in the programming guide, and I am not even sure which version of the atomic I am executing.

The following is my atomicadd device function:

__device__ inline void atomicadd(float* address, float value)
{
#if __CUDA_ARCH__ >= 200
  // Fermi (compute capability >= 2.0): atomicAdd supports floats natively.
  atomicAdd(address, value);
#elif __CUDA_ARCH__ >= 110
  // float atomic add from
  // http://forums.nvidia.com/index.php?showtopic=158039&view=findpost&p=991561
  float old = value;
  while ((old = atomicExch(address, atomicExch(address, 0.0f) + old)) != 0.0f);
#endif
}

Then I compiled with -arch=sm_11 and ran the generated binary on a GTX 470 (Fermi).

Can anyone tell me what happens when I compile the code with sm_11? Am I executing the JIT-compiled version of the PTX from the second branch?
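(One way to avoid the ambiguity is to build a fat binary with device code for both targets, so each __CUDA_ARCH__ branch is compiled for its matching architecture and the driver picks the right one at load time; a sketch of the nvcc invocation, flags as in the CUDA 3.x toolchain:)

nvcc main.cu \
    -gencode arch=compute_11,code=sm_11 \
    -gencode arch=compute_20,code=sm_20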