atomicAdd(float*,float) - atomicMul(float*,float) ...

Hi,

We need to build the sum of float data across a grid, but CUDA has no atomicAdd(float*,float). I have searched this forum but found only one slow solution.

I know that (A+B)+C != A+(B+C) when the data are floats, so the result can depend on the order of the additions.
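A small host-side example of this (illustrative values, not part of the benchmark below):

#include <stdio.h>

int main(void)
{
    // Single-precision addition is not associative:
    float a = 1e8f, b = -1e8f, c = 1.0f;
    printf("(a+b)+c = %f\n", (a + b) + c);  // 1.000000: (1e8f + -1e8f) == 0.0f, then + 1.0f
    printf("a+(b+c) = %f\n", a + (b + c));  // 0.000000: -1e8f + 1.0f rounds back to -1e8f
    return 0;
}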

So I built this for myself, but I am posting the code here for anyone with a related problem.

This implementation can be changed to mul, sub, div, … (a sketch of a mul variant is shown after the two add versions below).

I ran a little test to check the speed of this method: I sum all the numbers from 1.0 to 256000.0, then I do the same with atomicAdd(int*,int) and compare the runtimes.

FORUM = the implementation posted in this forum
MY    = this implementation (see below)
INT   = implementation with atomicAdd(int*,int)

  MY  |  FORUM  | INT
------+---------+-----
 50ms | 34862ms | 8ms

My atomicAdd() implementation is about 6 times slower than atomicAdd(int).

I call the kernel myAdd with these parameters: “out” is a pointer to a single float value and “in” is an array with 256000 values (1 to 256000).

myAdd<<<100,256>>>(out,in);

An implementation that returns the old value:

__device__ inline float atomicAdd(float* address, float value)
{
  float old = value;
  // Take the current accumulator and leave 0.0f as an "empty" marker.
  float ret = atomicExch(address, 0.0f);
  float new_old = ret + old;
  // Write the sum back; a non-zero return means another thread deposited
  // its sum in the meantime, so fold it in and try again.
  while ((old = atomicExch(address, new_old)) != 0.0f)
  {
    new_old = atomicExch(address, 0.0f);
    new_old += old;
  }
  // The value that was in *address before our add.
  return ret;
}

__global__ void myAdd(T *out, T *a)   // T is float here (see the full code further down)
{
  atomicAdd(out, a[blockIdx.x*blockDim.x + threadIdx.x]);
}

An implementation without a return value (needs fewer registers):

__device__ inline void atomicAdd(float* address, float value)
{
  float old = value;
  float new_old;
  do
  {
    // Take the current accumulator and leave 0.0f as an "empty" marker.
    new_old = atomicExch(address, 0.0f);
    new_old += old;
  }
  // Write the sum back; a non-zero return means another thread deposited
  // its sum in the meantime, so fold it in and repeat.
  while ((old = atomicExch(address, new_old)) != 0.0f);
}
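As mentioned above, the same exchange trick can be adapted to other operations. Here is an untested sketch of a hypothetical atomicMul built the same way: 1.0f, the multiplicative identity, plays the role that 0.0f plays for the sum, so the target location must be initialised to 1.0f before the kernel runs.

__device__ inline void atomicMul(float* address, float value)
{
  float old = value;
  float new_old;
  do
  {
    // Take the current accumulator and leave 1.0f as the "empty" marker.
    new_old = atomicExch(address, 1.0f);
    new_old *= old;
  }
  // Write the product back; a non-1.0f return is another thread's deposit,
  // so fold it in and repeat.
  while ((old = atomicExch(address, new_old)) != 1.0f);
}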

I have tested this atomic function with a helper shared variable (each block accumulates into shared memory and only thread 0 issues a single global atomicAdd per block), and this is the result.

Run with the same settings as in the previous post.

   MY    |  FORUM  |   INT
---------+---------+---------
 0.417ms | 2.820ms | 0.101ms

Now my atomicAdd() implementation is only about 4 times slower than atomicAdd(int).

__global__ void myAdd(T *out, T *a)
{
   __shared__ float myShare;
   if (threadIdx.x == 0) myShare = 0.0f;
   __syncthreads();
   // Accumulate within the block in shared memory ...
   atomicAdd(&myShare, a[blockIdx.x*blockDim.x + threadIdx.x]);
   __syncthreads();
   // ... then one thread per block adds the block sum to the global result.
   if (threadIdx.x == 0) atomicAdd(out, myShare);
}

How can that last piece of code you posted, which is doing a shared memory atomic add to a float variable, possibly compile, let alone run?

No, this was only a part of the code.

Compile the code with

nvcc main.cu -arch sm_13

to test atomicAdd(int*,int), with

nvcc main.cu -arch sm_13 -DFLOAT

to test atomicAdd(float*,float), and with

nvcc main.cu -arch sm_13 -DFLOAT -DFORUM

to test with float and the version from the forum.

Full code:

#include <cuda.h>
#include <stdio.h>

#ifdef FLOAT
  #define T float
#else
  #define T int
#endif

#ifdef FORUM
// Version previously posted in this forum: reinterpret the float bits as int
// and retry with atomicCAS until the sum was based on an unchanged value.
__device__ inline void atomicAdd(float *address, float val)
{
    int i_val = __float_as_int(val);
    int tmp0 = 0;
    int tmp1;
    while ((tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0)
    {
        tmp0 = tmp1;
        i_val = __float_as_int(val + __int_as_float(tmp1));
    }
}
#else
// My version: grab the accumulator with atomicExch (leaving 0.0f as an
// "empty" marker), add, and swap the sum back in until the slot was empty.
__device__ inline float atomicAdd(float* address, float value)
{
  float old = value;
  float ret = atomicExch(address, 0.0f);
  float new_old = ret + old;
  while ((old = atomicExch(address, new_old)) != 0.0f)
  {
    new_old = atomicExch(address, 0.0f);
    new_old += old;
  }
  return ret;
}
#endif

void checkCUDAError(const char *msg);

__global__ void myAdd(T *out, T *a)
{
   __shared__ T myShare;
   if (threadIdx.x == 0) myShare = 0;
   __syncthreads();
   atomicAdd(&myShare, a[blockIdx.x*blockDim.x + threadIdx.x]);
   __syncthreads();
   if (threadIdx.x == 0) atomicAdd(out, myShare);
}

int main(int argc, char** argv)
{
    float elapsedTimeInMs = 0.0f;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // pointer for host memory
    T *h_a;
    T *h_out;
    // pointer for device memory
    T *d_a;
    T *d_out;

    int numBlocks = 100;
    int numThreadsPerBlock = 256;
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(T);

    h_a = (T *) malloc(memSize);
    h_out = (T *) malloc(sizeof(T));
    cudaMalloc((T**) &d_a, memSize);
    cudaMalloc((T**) &d_out, sizeof(T));

    T x = 0;
    for (int i = 0; i < (numBlocks*numThreadsPerBlock); i++)
    {
       h_a[i] = (T)(i+1);
       x += (T)(i+1);
    }
    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);

    *h_out = 0;
    cudaMemcpy(d_out, h_out, sizeof(T), cudaMemcpyHostToDevice);

    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);

    cudaEventRecord(start, 0);
    myAdd<<<dimGrid, dimBlock>>>(d_out, d_a);
    cudaEventRecord(stop, 0);
    cudaThreadSynchronize();
    cudaEventElapsedTime(&elapsedTimeInMs, start, stop);

    cudaMemcpy(h_out, d_out, sizeof(T), cudaMemcpyDeviceToHost);

#ifdef FLOAT
    printf("out: %f cpu: %f, time=%.6fms\n", *h_out, x, elapsedTimeInMs);
#else
    printf("out: %d cpu: %d, time=%.6fms\n", *h_out, x, elapsedTimeInMs);
#endif

    // free device memory
    cudaFree(d_a);
    cudaFree(d_out);
    // free host memory
    free(h_a);
    free(h_out);
    return 0;
}

I have an update to the atomicAdd with and without a return value.

With return value (uses 2 registers):

__device__ inline float atomicAdd(float* address, float value)
{
  // Grab the accumulator (leaving 0.0f as the "empty" marker) and add our value.
  float ret = atomicExch(address, 0.0f);
  float old = ret + value;
  // Swap the sum back in; a non-zero return is another thread's deposit,
  // so grab the slot again, fold everything together and retry.
  while ((old = atomicExch(address, old)) != 0.0f)
  {
    old = atomicExch(address, 0.0f) + old;
  }
  return ret;
}

Without return value (uses 1 register):

__device__ inline void atomicAdd(float* address, float value)
{
  float old = value;
  // Grab the accumulator (leaving 0.0f), add our pending value and swap the
  // sum back in; repeat as long as the swap returns a non-zero deposit.
  while ((old = atomicExch(address, atomicExch(address, 0.0f) + old)) != 0.0f);
}

I think atomic float addition is perhaps the #1 FAQ on the CUDA forums… either that, or “why doesn’t atomicAdd link?” (Which is because you need the nvcc compute level switch set.)

Here’s a thread from a year and a half ago with the same “exchange with 0.0, keep swapping in the sum until you get 0.0 back” strategy.

Ohh, thanks, I hadn't found that thread; that's why I posted this.

Do these atomicAdd implementations for floats still work on CUDA 3.0 and Fermi? I compiled the code above with CUDA 3.1 and ran the generated binary on a GTX 470, and I got an “invalid device function” error at run time. But if I use the atomicAdd() for compute_20 devices, it works fine.

People do know that compute 2.0 added an intrinsic float atomic add, right?

CUDA programming guide 3.1 - B.11.1.1
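For reference, on compute capability 2.0 devices that built-in overload can be called directly when the code is compiled with -arch=sm_20; a minimal, illustrative sketch (the kernel name and bounds check are not from the thread):

// atomicAdd(float*, float) is a hardware instruction on compute capability >= 2.0.
__global__ void sumFloats(float *out, const float *in, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        atomicAdd(out, in[i]);
}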

Yes, but I want to make my code backward compatible with non-Fermi hardware. When running a kernel on non-Fermi cards, can I fall back to the atomicAdd above?

Does it also work for double-precision variables? Thanks
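(Note: atomicExch has no double overload, so the exchange trick does not carry over directly to doubles. A CAS-based variant in the style of the forum version can be used instead; a minimal sketch, assuming a device with 64-bit atomicCAS (compute capability 1.2 or higher) and using the hypothetical name atomicAddDouble:)

__device__ inline double atomicAddDouble(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do
    {
        assumed = old;
        // Retry until the value we based the sum on has not changed underneath us.
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}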

I am very surprised: for floats, the hacked atomicFloatAdd is ~10% faster than the native one on a GTX 470.

What is the collision probability in that case? I can imagine that the native atomic float addition could be faster when the probability of two threads accessing the same float is high.

I got really confused after reading Section 3.1.4 in the programming guide, and I am not even sure which version of the atomic I am executing.

The following is my atomicadd device function:

__device__ inline void atomicadd(float* address, float value)
{
#if __CUDA_ARCH__ >= 200
  // Fermi (compute capability >= 2.0): atomicAdd supports floats natively.
  atomicAdd(address, value);
#elif __CUDA_ARCH__ >= 110
  // float atomic add from
  // http://forums.nvidia.com/index.php?showtopic=158039&view=findpost&p=991561
  float old = value;
  while ((old = atomicExch(address, atomicExch(address, 0.0f) + old)) != 0.0f);
#endif
}

Then I compiled with -arch=sm_11 and ran the generated binary on a GTX 470 (Fermi).

Can anyone tell me what happens when I compile the code with sm_11? Am I executing the JIT-compiled version of the PTX from the second branch?
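(One way to avoid the ambiguity is to build a fat binary with device code for both targets, so each __CUDA_ARCH__ branch is compiled for its matching architecture and the driver picks the right one at load time; a sketch of the nvcc invocation, flags as in the CUDA 3.x toolchain:)

nvcc main.cu \
    -gencode arch=compute_11,code=sm_11 \
    -gencode arch=compute_20,code=sm_20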