Why does the atomicAdd work faster than ‘+=’?

Hi!

Why does the atomicAdd work faster than ‘+=’?

I compared the operation speeds of atomicAdd and ‘+=’ operator.

It turned out that atomicAdd runs about 70% faster, although I expected the opposite result, since atomic operations are commonly believed to be slower.

Here is the code.

// Tile height: rows of elements handled per thread block.
const int SIZE_MEMORY_DY = 1;

// Tile width: columns per thread block; 32 matches the warp size.
const int SIZE_MEMORY_DX = 32;

// Increments every element of a pitched sizeY x sizeX float matrix by 1
// using atomicAdd (single read-modify-write transaction per element).
//
// Launch contract: blockDim = (SIZE_MEMORY_DX, SIZE_MEMORY_DY); the grid is
// sized with a ceiling division so partial edge blocks exist and must be
// guarded. dstPitchElements is the row pitch of dst measured in floats
// (pitch bytes / sizeof(float)), as produced by cudaMallocPitch.
__global__ void utilitiesKernel(
	float* dst,              // device pointer to the pitched 2D array
	size_t dstPitchElements, // row pitch in elements, not bytes
	const int sizeY,         // number of rows
	const int sizeX          // number of columns
) {
	// Global 2D coordinates of the element this thread owns.
	int i = blockIdx.x * SIZE_MEMORY_DX + threadIdx.x; // column
	int j = blockIdx.y * SIZE_MEMORY_DY + threadIdx.y; // row

	// Bounds guard for the rounded-up grid.
	if (i < sizeX && j < sizeY) {
		// Atomic increment. The float literal selects the
		// atomicAdd(float*, float) overload directly, avoiding an
		// implicit int -> float conversion of the bare literal 1.
		atomicAdd(&dst[j * dstPitchElements + i], 1.0f);
		// Plain read-modify-write alternative (two global transactions):
		// dst[j * dstPitchElements + i] += 1.0f;
	}
}

dim3 threads(32, 1);

dim3 grid((sizeX + SIZE_MEMORY_DX - 1) / SIZE_MEMORY_DX, (sizeY + SIZE_MEMORY_DY - 1) / SIZE_MEMORY_DY);

}

How can it be explained?

What hardware are you using? My guess is that the atomicAdd is only performing one global-memory transaction, while the += performs two.

Hardware: Intel® Core™ i3-2100 CPU @3.10GHz, RAM 4 Gb, GeForce GTX 580, Windows 7 64 bit.

+= is performing two global-memory transactions according to the Profiler.

Code +=.

#include <stdio.h>

#include <string.h>

#include <iostream>

using namespace std;

////////////////////////////////////////////////////////////////////////////////

// declaration, forward

// Forward declaration of the host-side launcher defined below.
void utilities(float* dst, int sizeY, int sizeX);

// Tile height: rows of elements handled per thread block.
const int SIZE_MEMORY_DY = 1;

// Tile width: columns per thread block; 32 matches the warp size.
const int SIZE_MEMORY_DX = 32;

// Adds 1 to each element of a pitched sizeY x sizeX float matrix using a
// plain (non-atomic) read-modify-write: a global load followed by a global
// store. dstPitchElements is the row pitch of dst expressed in floats.
// Launch contract: blockDim = (SIZE_MEMORY_DX, SIZE_MEMORY_DY), grid
// rounded up with a ceiling division.
__global__ void utilitiesKernel(float* dst, size_t dstPitchElements, const int sizeY, const int sizeX) {
	// Map this thread onto one matrix element.
	const int col = blockIdx.x * SIZE_MEMORY_DX + threadIdx.x;
	const int row = blockIdx.y * SIZE_MEMORY_DY + threadIdx.y;

	// Threads of partial edge blocks fall outside the matrix; bail out.
	if (col >= sizeX || row >= sizeY)
		return;

	// Ordinary increment: load + add + store.
	dst[row * dstPitchElements + col] += 1;
}

// Allocates a sizeY x sizeX float matrix on the host, zero-initializes it,
// runs the GPU increment via utilities(), and releases the buffer.
// Returns 0 on success, 1 on host allocation failure.
int main(int argc, char** argv)
{
	const int sizeY = 5000;
	const int sizeX = 5000;

	// calloc zero-fills the buffer, replacing the explicit init loop.
	// size_t avoids unsigned-int overflow for large matrices.
	const size_t elemCount = (size_t)sizeY * sizeX;
	float* h_dst = (float*)calloc(elemCount, sizeof(float));
	if (h_dst == NULL) {
		fprintf(stderr, "Failed to allocate %zu bytes of host memory\n",
		        elemCount * sizeof(float));
		return 1;
	}

	utilities(h_dst, sizeY, sizeX);

	free(h_dst);
	return 0;
}

// Runs the increment kernel over a host matrix: H2D copy -> launch -> D2H
// copy. dst is a dense row-major sizeY x sizeX float array; the device copy
// uses a pitched allocation so every row starts on an aligned boundary.
void utilities(float* dst, int sizeY, int sizeX) {
	// Pitched device allocation keeps rows aligned for coalesced access.
	float* d_dst = NULL;
	size_t dst_pitch_bytes = 0;
	cudaError_t err = cudaMallocPitch((void**)&d_dst, &dst_pitch_bytes,
	                                  sizeX * sizeof(float), sizeY);
	if (err != cudaSuccess) {
		fprintf(stderr, "cudaMallocPitch failed: %s\n", cudaGetErrorString(err));
		return;
	}
	// The kernel indexes in elements, so convert the byte pitch once here.
	size_t dst_pitch_elements = dst_pitch_bytes / sizeof(float);

	// Copy the dense host matrix into the pitched device buffer.
	cudaMemcpy2D(d_dst, dst_pitch_bytes, dst, sizeX * sizeof(float),
	             sizeX * sizeof(float), sizeY, cudaMemcpyHostToDevice);

	// One thread per element; ceiling division covers the matrix edges.
	// Block dims come from the constants instead of hard-coded (32, 1).
	dim3 threads(SIZE_MEMORY_DX, SIZE_MEMORY_DY);
	dim3 grid((sizeX + SIZE_MEMORY_DX - 1) / SIZE_MEMORY_DX,
	          (sizeY + SIZE_MEMORY_DY - 1) / SIZE_MEMORY_DY);
	utilitiesKernel<<<grid, threads>>>(d_dst, dst_pitch_elements, sizeY, sizeX);
	err = cudaGetLastError(); // catches launch-configuration errors
	if (err != cudaSuccess) {
		fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
	}
	// cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
	cudaDeviceSynchronize();

	// Copy the result back into the caller's dense buffer.
	cudaMemcpy2D(dst, sizeX * sizeof(float), d_dst, dst_pitch_bytes,
	             sizeX * sizeof(float), sizeY, cudaMemcpyDeviceToHost);

	cudaFree(d_dst);
}

Code atomicAdd.

#include <stdio.h>

#include <string.h>

#include <iostream>

using namespace std;

////////////////////////////////////////////////////////////////////////////////

// declaration, forward

// Forward declaration of the host-side launcher defined below.
void utilities(float* dst, int sizeY, int sizeX);

// Tile height: rows of elements handled per thread block.
const int SIZE_MEMORY_DY = 1;

// Tile width: columns per thread block; 32 matches the warp size.
const int SIZE_MEMORY_DX = 32;

// Increments every element of a pitched sizeY x sizeX float matrix by 1
// with atomicAdd (one read-modify-write transaction per element, vs. the
// separate load + store a plain '+=' performs).
// dstPitchElements is the row pitch of dst expressed in floats.
// Launch contract: blockDim = (SIZE_MEMORY_DX, SIZE_MEMORY_DY), grid
// rounded up with a ceiling division.
__global__ void utilitiesKernel(float* dst, size_t dstPitchElements, const int sizeY, const int sizeX) {
	// Global 2D coordinates of the element this thread owns.
	int i = blockIdx.x * SIZE_MEMORY_DX + threadIdx.x; // column
	int j = blockIdx.y * SIZE_MEMORY_DY + threadIdx.y; // row

	// Bounds guard for the rounded-up grid.
	if (i < sizeX && j < sizeY) {
		// The float literal selects the atomicAdd(float*, float) overload
		// directly, avoiding an implicit int -> float conversion of '1'.
		atomicAdd(&dst[j * dstPitchElements + i], 1.0f);
	}
}

// Allocates a sizeY x sizeX float matrix on the host, zero-initializes it,
// runs the GPU increment via utilities(), and releases the buffer.
// Returns 0 on success, 1 on host allocation failure.
int main(int argc, char** argv)
{
	const int sizeY = 5000;
	const int sizeX = 5000;

	// calloc zero-fills the buffer, replacing the explicit init loop.
	// size_t avoids unsigned-int overflow for large matrices.
	const size_t elemCount = (size_t)sizeY * sizeX;
	float* h_dst = (float*)calloc(elemCount, sizeof(float));
	if (h_dst == NULL) {
		fprintf(stderr, "Failed to allocate %zu bytes of host memory\n",
		        elemCount * sizeof(float));
		return 1;
	}

	utilities(h_dst, sizeY, sizeX);

	free(h_dst);
	return 0;
}

// Runs the increment kernel over a host matrix: H2D copy -> launch -> D2H
// copy. dst is a dense row-major sizeY x sizeX float array; the device copy
// uses a pitched allocation so every row starts on an aligned boundary.
void utilities(float* dst, int sizeY, int sizeX) {
	// Pitched device allocation keeps rows aligned for coalesced access.
	float* d_dst = NULL;
	size_t dst_pitch_bytes = 0;
	cudaError_t err = cudaMallocPitch((void**)&d_dst, &dst_pitch_bytes,
	                                  sizeX * sizeof(float), sizeY);
	if (err != cudaSuccess) {
		fprintf(stderr, "cudaMallocPitch failed: %s\n", cudaGetErrorString(err));
		return;
	}
	// The kernel indexes in elements, so convert the byte pitch once here.
	size_t dst_pitch_elements = dst_pitch_bytes / sizeof(float);

	// Copy the dense host matrix into the pitched device buffer.
	cudaMemcpy2D(d_dst, dst_pitch_bytes, dst, sizeX * sizeof(float),
	             sizeX * sizeof(float), sizeY, cudaMemcpyHostToDevice);

	// One thread per element; ceiling division covers the matrix edges.
	// Block dims come from the constants instead of hard-coded (32, 1).
	dim3 threads(SIZE_MEMORY_DX, SIZE_MEMORY_DY);
	dim3 grid((sizeX + SIZE_MEMORY_DX - 1) / SIZE_MEMORY_DX,
	          (sizeY + SIZE_MEMORY_DY - 1) / SIZE_MEMORY_DY);
	utilitiesKernel<<<grid, threads>>>(d_dst, dst_pitch_elements, sizeY, sizeX);
	err = cudaGetLastError(); // catches launch-configuration errors
	if (err != cudaSuccess) {
		fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
	}
	// cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
	cudaDeviceSynchronize();

	// Copy the result back into the caller's dense buffer.
	cudaMemcpy2D(dst, sizeX * sizeof(float), d_dst, dst_pitch_bytes,
	             sizeX * sizeof(float), sizeY, cudaMemcpyDeviceToHost);

	cudaFree(d_dst);
}

+= takes 4.7 ms.

atomicAdd takes 2.7 ms.

Atomic operations are performed directly inside the memory controller, so they are a lot closer to and tighter coupled with L2 cache and global memory.