What am I doing wrong with atomicAdd()?

Hi,

Somehow I cannot figure out what I did wrong in this small algorithm, which I am running on a GTX 470 with -arch=sm_20 (Linux, 64-bit platform):

__shared__ int AtomicPosition; // it can also be in global memory; the behavior is the same
__shared__ int HashInt[512];

if (threadIdx.x == 0)
	AtomicPosition = 0;
__syncthreads();

if (/* ... some condition ... */)
	HashInt[atomicAdd(&AtomicPosition, 1)] = threadIdx.x;
__syncthreads();

What I want to achieve is to fill the HashInt array with the indices of the threads that satisfy the condition in the "if" statement; AtomicPosition should then equal the total number of filled entries in HashInt.

Instead, I always get garbage in the array, so it seems I am misunderstanding this atomic function.

Can anyone tell me what I did wrong in this example?

Sincerely,

IBR

Have you tried splitting the line in two, with the result of atomicAdd going into a temporary variable? I know it should be the same.
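
Something like this, with pos as the temporary (just the original line split into two statements):

int pos = atomicAdd(&AtomicPosition, 1); // each thread reserves a unique slot
HashInt[pos] = threadIdx.x;              // write into the reserved slot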

— To save people looking it up, here is the blurb on atomicAdd() from the NVIDIA CUDA Programming Guide V3.0 —
atomicAdd()
int atomicAdd(int* address, int val);
unsigned int atomicAdd(unsigned int* address, unsigned int val);
unsigned long long int atomicAdd(unsigned long long int* address, unsigned long long int val);
float atomicAdd(float* address, float val);

reads the 32-bit or 64-bit word old located at the address address in global or shared memory, computes (old + val), and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The function returns old.
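
In other words, each thread calling atomicAdd(&AtomicPosition, 1) gets back a unique, consecutive index: if three threads make the call, they receive 0, 1 and 2 (one value each, in some order) and AtomicPosition ends up at 3. So the indexing into HashInt in your code should be race-free as written.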

I wrote a small program to check your problem on a GTX 480 with CUDA 3.2, and the result is correct.

#include <stdio.h>

__global__ void testAtomicAdd(int *output)
{
	__shared__ int AtomicPosition; // it can also be in global memory; the behavior is the same
	__shared__ int HashInt[512];

	if (threadIdx.x == 0)
		AtomicPosition = 0;
	__syncthreads();

	// every odd-numbered thread claims the next free slot in HashInt
	if (threadIdx.x % 2) {
		HashInt[atomicAdd(&AtomicPosition, 1)] = threadIdx.x;
	}
	__syncthreads();

	// copy the filled part of HashInt out to global memory
	if (threadIdx.x < AtomicPosition) {
		output[threadIdx.x] = HashInt[threadIdx.x];
	}
}

#define M 64

int main()
{
	int output[M];
	int *d_out;
	cudaMalloc((void**)&d_out, M * sizeof(int));

	// initialize to -1 so untouched entries are easy to spot
	for (int i = 0; i < M; i++) {
		output[i] = -1;
	}
	cudaMemcpy(d_out, output, M * sizeof(int), cudaMemcpyHostToDevice);

	testAtomicAdd<<<1, M>>>(d_out);

	cudaMemcpy(output, d_out, M * sizeof(int), cudaMemcpyDeviceToHost);
	for (int i = 0; i < M; i++) {
		printf("output[%d] = %d\n", i, output[i]);
	}
	return 0;
}
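
For anyone who wants to try it: this should build with something like nvcc -arch=sm_20 testAtomicAdd.cu (the file name is just an example).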

The result is:

output[0] = 1
output[1] = 33
output[2] = 35
output[3] = 3
output[4] = 37
output[5] = 5
output[6] = 39
output[7] = 7
output[8] = 41
output[9] = 9
output[10] = 43
output[11] = 11
output[12] = 45
output[13] = 13
output[14] = 47
output[15] = 15
output[16] = 49
output[17] = 17
output[18] = 51
output[19] = 19
output[20] = 53
output[21] = 21
output[22] = 55
output[23] = 23
output[24] = 57
output[25] = 25
output[26] = 59
output[27] = 27
output[28] = 61
output[29] = 29
output[30] = 63
output[31] = 31
output[32] = -1
output[33] = -1
output[34] = -1
output[35] = -1
output[36] = -1
output[37] = -1
output[38] = -1
output[39] = -1
output[40] = -1
output[41] = -1
output[42] = -1
output[43] = -1
output[44] = -1
output[45] = -1
output[46] = -1
output[47] = -1
output[48] = -1
output[49] = -1
output[50] = -1
output[51] = -1
output[52] = -1
output[53] = -1
output[54] = -1
output[55] = -1
output[56] = -1
output[57] = -1
output[58] = -1
output[59] = -1
output[60] = -1
output[61] = -1
output[62] = -1
output[63] = -1

Could you describe the garbage values you are getting?
